<a href="https://colab.research.google.com/github/masalha-alaa/gender-prediction/blob/master/gender_recognition_with_sentiment_bert_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install import-ipynb -q
!pip install transformers datasets -q
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import sys
import re
from pathlib import Path
from datetime import datetime
from google.colab import drive
from json import load as j_load, loads as j_loads
from collections import defaultdict
from multiprocessing import cpu_count, Pool

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import emoji

import torch
from datasets import DatasetDict, load_dataset

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('vader_lexicon')
from nltk import word_tokenize
from nltk import data as nltk_data, pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer

import import_ipynb
from tqdm.notebook import tqdm
tqdm.pandas()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# Global knobs shared by the preprocessing and the feature-building cells.
SEED = 42  # random_state handed to every DataFrame.sample call in this notebook
NGRAM_LEN = 1  # previously spelled (1), which is the int 1 and not a 1-tuple; not referenced in the visible cells
SENTENCES_N = 4  # number of consecutive sentences merged into one example by aggregate()
MOST_COMMON = 1150  # not referenced in the visible cells

In [4]:
# Mount Google Drive (google.colab) at /content/gdrive; every dataset/model path below lives under this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
# Names of the dataframe columns used throughout the notebook
TEXT = 'txt'
URL = 'url'
AUTHOR = 'author'
GENDER = 'gender'
# The gender column doubles as the classification label
LABEL = GENDER

In [6]:
def get_emoji_regexp():
    """
    Returns a compiled regex with one capturing group that matches any key of emoji.EMOJI_DATA

    The pattern is built on the first call only and cached on the function object.
    clean_line() calls this once per line, and sorting/escaping/joining the whole
    emoji table each time is pure overhead. Every process (e.g. each Pool worker)
    builds its own copy once. Changes made to emoji.EMOJI_DATA after the first call
    are not picked up.
    """
    # https://github.com/carpedm20/emoji/issues/222#issuecomment-1200303280
    compiled = getattr(get_emoji_regexp, '_compiled', None)
    if compiled is None:
        # Sort emoji by length to make sure multi-character emojis are
        # matched first
        emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
        pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
        compiled = re.compile(pattern)
        get_emoji_regexp._compiled = compiled
    return compiled


def clean_line_parallel(params):
    """
    Pool worker: cleans one chunk of lines

    params is a (part, emoticons_) pair because Pool.map passes a single argument.
    part must offer .apply (the cleaning cell passes chunks of the 'txt' Series) and
    emoticons_ is forwarded to clean_line() for every element.
    """
    chunk, emoticon_list = params

    def _clean_one(raw_line):
        return clean_line(raw_line, emoticon_list)

    return chunk.apply(_clean_one)


def clean_line(line, emos=None):
    """
    Normalizes one raw comment: lower-cases it and replaces emojis, emoticons and links by placeholder tokens

    line: the raw text. Anything that is not a non-empty str (None, NaN, '') yields ''.
    emos: iterable of emoticons to replace (compared against the lower-cased line).
          When None, the notebook-level `emoticons` list is used instead.
    Returns the cleaned line, or '' when the line starts with a moderation/removal notice.
    """
    # NaN (a float) is truthy, so a plain `not line` test would let it through to .lower()
    if not isinstance(line, str) or not line:
        return ''
    EMOJI = r'emj'
    URL = r'url'
    # Literal prefixes of removal notices. They are compared with str.startswith rather
    # than used as regexes, so the '.' in 'removed. ' only matches a real dot.
    remove = ('your comment has been removed', 'this comment or post has been removed', 'removed ', 'removed. ',
              'your submission has been remove', 'this post has been removed', 'your post has been removed')
    line = line.lower()
    if line.startswith(remove):
        return ''
    line = get_emoji_regexp().sub(EMOJI, line)
    emoticons_ = emoticons if emos is None else emos
    # Longest first: otherwise replacing ':-)' before ':-))' would leave 'emj)' behind
    for emoticon in sorted(emoticons_, key=len, reverse=True):
        line = line.replace(emoticon, EMOJI)
    # A link, optionally wrapped in parentheses, becomes the single token 'url'
    line = re.sub(r'\(?http\S+\)?', URL, line)
    return line


def posify(txt):
    """
    Replaces every whitespace-separated token of txt by its part of speech tag
    and returns the tags joined by single spaces
    """
    tagged_tokens = pos_tag(txt.split())
    tags_only = [tag for _, tag in tagged_tokens]
    return ' '.join(tags_only)


def posifyNew(txt):
    """
    Interleaves every whitespace-separated token of txt with its POS tag,
    i.e. returns "word TAG word TAG ..."
    """
    word_tag_chunks = []
    for tagged in pos_tag(txt.split()):
        word_tag_chunks.append(' '.join(tagged))
    return ' '.join(word_tag_chunks)


def posifyNew2(txt):
    """
    Returns txt followed by a space and the space-separated POS tags of its tokens
    """
    tags_only = [tagged[1] for tagged in pos_tag(txt.split())]
    tag_suffix = ' '.join(tags_only)
    return txt + ' ' + tag_suffix


def sentecize(txt, sent_tokenizer=None):
    """
    Splits txt into sentences

    txt: the text to split.
    sent_tokenizer: any object with a tokenize(text) method. When None, the
        notebook-level `tokenizer` is used, as before.
    Returns whatever sent_tokenizer.tokenize(txt) returns.

    NOTE: the notebook-level `tokenizer` is the punkt sentence tokenizer while the
    preprocessing cell runs, but a later cell rebinds the same name to the BERT
    tokenizer. Pass sent_tokenizer explicitly when calling this after that cell.
    """
    if sent_tokenizer is None:
        sent_tokenizer = tokenizer
    return sent_tokenizer.tokenize(txt)


# sentiment analysis per sentence
def createSentiments(ser, sia):
    """
    Scores every element of ser with sia.polarity_scores and keeps all scores except 'compound'

    Returns the result of ser.apply: one dict of scores per element.
    """
    def _scores_without_compound(text):
        all_scores = sia.polarity_scores(text)
        return {name: value for name, value in all_scores.items() if name != 'compound'}

    return ser.apply(_scores_without_compound)


def aggregate(df, sentences_n=SENTENCES_N):
    """
    Merges every sentences_n consecutive rows of df into one row

    Rows are grouped by integer-dividing the index by sentences_n, so df is expected
    to carry a 0..n-1 index (the caller resets it right before). The text-like columns
    are joined with a space and 'gender' is taken from the first row of each group.
    A trailing group holds fewer rows when len(df) is not a multiple of sentences_n.
    """
    group_ids = df.index // sentences_n
    how_to_merge = {
        'txt': ' '.join,
        'pos': ' '.join,
        'sentiment': ' '.join,
        'txt_with_sentiments': ' '.join,
        'gender': 'first',
    }
    merged = df.groupby(group_ids).agg(how_to_merge)
    return merged.reset_index(drop=True).copy()

In [7]:
# Placeholder tokens (clean_line uses the same literals for emojis/emoticons and links)
EMOJI = r'emj'
URL = r'url'  # rebinds URL from the column-name cell; the value is 'url' in both places

# Integer class ids and the lookup tables between label names and ids
MALE = 0
FEMALE = 1
lbl_to_id = dict(male=MALE, female=FEMALE)
id_to_lbl = {class_id: name for name, class_id in lbl_to_id.items()}

In [8]:
!date

ts = datetime.now()
print('Program started')

# --- preprocessing knobs ---
PRECUT = 0.30  # fraction of raw rows sampled before any processing (only applied when 0 < PRECUT < 1)
CLASS_SIZE = None  # sentences kept per gender; None (or a value above the smaller class) uses the smaller class size
SELECT_K_BEST = 100  # not referenced in the visible cells
TRAIN_FRAC = 0.70  # leading share of the shuffled examples that becomes the train split
MIN_SENT_LEN = 5  # in tokens  # TODO: Do some trials

# --- Google Drive layout ---
DRIVE_ROOT_DATA_DIR = Path("/content/gdrive/MyDrive/gender-project/")
DATASET_PATH = DRIVE_ROOT_DATA_DIR.joinpath("dataset3 and 2021-11-06 19-51-29")
RAW_DATASET_PATH = DATASET_PATH.joinpath("raw")
CLEAN_DATASET_PATH = DATASET_PATH.joinpath("clean")
EXTRA_DIR = DRIVE_ROOT_DATA_DIR.joinpath("extra")
EMOTICONS_LST_FILE = "emoticons.txt"
FUNCTION_WORDS_LST_FILE = "fw.txt"

# read dataset
def _explode_to_sentences(frame):
    """
    Turns a frame of comments (column 'txt') into a frame with one stripped, non-empty sentence per row
    """
    sentences = frame['txt'].apply(sentecize).explode(ignore_index=True)
    # explode() yields NaN for a comment without any sentence; drop those before calling .strip()
    sentences = sentences.dropna().apply(lambda sentence: sentence.strip())
    return sentences.replace('', np.nan).dropna().to_frame('txt').reset_index(drop=True)


def _add_sentiment_columns(frame, sia):
    """
    Adds 'sentiment' (name of the highest score returned by createSentiments for the sentence)
    and 'txt_with_sentiments' (the sentence followed by a space and that name) to frame
    """
    scores = createSentiments(frame['txt'], sia)
    # computed once per sentence and reused for both columns
    dominant = scores.apply(lambda sentence_scores: max(sentence_scores, key=sentence_scores.get))
    frame['sentiment'] = dominant
    frame['txt_with_sentiments'] = frame['txt'] + ' ' + dominant
    return frame


# The next cell loads both splits, so rebuild unless both cached files are present.
_CACHED_SPLITS = ("train.parquet", "validation.parquet")
if not all(os.path.exists(CLEAN_DATASET_PATH / name) for name in _CACHED_SPLITS):
    data = pd.read_csv(RAW_DATASET_PATH / "data.csv")
    if 0 < PRECUT < 1:
        data = data.sample(frac=PRECUT, random_state=SEED)

    # keep only the rows labelled exactly 'male' or 'female'
    data = data[data.gender.isin(['male', 'female'])].reset_index(drop=True)

    # Clean
    clean_ts = datetime.now()
    print('Cleaning...', end=' ')
    with open(EXTRA_DIR / EMOTICONS_LST_FILE) as f:
        # Sorted (longest first, then alphabetically): iterating a set of strings may give a
        # different order on every interpreter run, which made the replacement order irreproducible.
        emoticons = sorted({emoticon.lower() for emoticon in f.read().split()}, key=lambda e: (-len(e), e))
    # Missing text becomes '' so it can be counted/cleaned; '' rows are dropped after cleaning.
    data['txt'] = data['txt'].fillna('').astype(str)
    if MIN_SENT_LEN > 1:
        # TODO: Ditch short before or after exploding?!
        # Blank only the text of too-short comments (not every column); the row is dropped below.
        too_short = data['txt'].str.split().str.len() < MIN_SENT_LEN
        data.loc[too_short, 'txt'] = ''

    # PARALLEL BEGIN
    pools = cpu_count()
    with Pool(pools) as pool:
        groups = [(part, emoticons) for part in np.array_split(data['txt'], pools)]
        data['txt'] = pd.concat(pool.map(clean_line_parallel, groups), axis=0)
    # PARALLEL END

    # drops blanked/removed comments (and any row with a missing value in another column)
    data = data.replace('', np.nan).dropna().reset_index(drop=True)
    print(datetime.now() - clean_ts)

    # split to genders
    print('Splitting to genders...')
    male = data.loc[data['gender'] == 'male', ['txt']]
    female = data.loc[data['gender'] == 'female', ['txt']]

    # split to sentences
    print('Splitting to sentences...')
    # sentecize() reads this notebook-level name
    tokenizer = nltk_data.load('tokenizers/punkt/english.pickle')
    male = _explode_to_sentences(male)
    female = _explode_to_sentences(female)

    print('Sentiment analysis...', end=' ')
    sia_ts = datetime.now()
    sia = SentimentIntensityAnalyzer()
    male = _add_sentiment_columns(male, sia)
    female = _add_sentiment_columns(female, sia)
    print(datetime.now() - sia_ts)
    print(f"Example: {male['txt_with_sentiments'][0]}")

    # posify
    print('Posifying...', end=' ')
    ts_pos = datetime.now()
    with Pool(cpu_count()) as pool:
        # txt and pos together (use posify instead of posifyNew for POS tags only)
        male['pos'] = pool.map(posifyNew, male['txt'])
        female['pos'] = pool.map(posifyNew, female['txt'])
    print(datetime.now() - ts_pos)

    # add labels
    print('Adding labels...')
    male['gender'] = id_to_lbl[MALE]
    female['gender'] = id_to_lbl[FEMALE]

    # sample randomly and aggregate
    smaller_class = min(len(male), len(female))
    select_n = CLASS_SIZE if CLASS_SIZE and CLASS_SIZE <= smaller_class else smaller_class
    print(f'Class size: {select_n}')
    print('Shuffling and aggregating...')
    # NOTE(review): sentences are shuffled before being aggregated and the split below is made on
    # aggregated rows, so sentences of one comment can land in both train and test - confirm this is intended.
    male = male.sample(n=select_n, random_state=SEED).reset_index(drop=True)
    female = female.sample(n=select_n, random_state=SEED).reset_index(drop=True)
    male = aggregate(male)
    female = aggregate(female)
    print(f'Class size (M,F): {len(male)}, {len(female)}')

    # merge and shuffle
    print('Merging and shuffling...')
    data = pd.concat([male, female]).reset_index(drop=True).sample(frac=1.0, random_state=SEED).reset_index(drop=True)

    # split to train / test
    print('Splitting to train / test...')
    split_at = int(len(data) * TRAIN_FRAC)
    train = data.iloc[:split_at]
    test = data.iloc[split_at:].reset_index(drop=True)
    print(f'Train size: {len(train)}')
    print(f'Test size: {len(test)}')
    display(train.head(1))
    display(pd.concat((train[LABEL].value_counts().rename("train"), test[LABEL].value_counts().rename("test")), axis=1))

    os.makedirs(CLEAN_DATASET_PATH, exist_ok=True)

    train.to_json(CLEAN_DATASET_PATH / "train.json")
    train.to_parquet(CLEAN_DATASET_PATH / "train.parquet")

    test.to_json(CLEAN_DATASET_PATH / "validation.json")
    test.to_parquet(CLEAN_DATASET_PATH / "validation.parquet")

!date

Mon 27 Mar 2023 01:27:40 PM UTC
Program started
Mon 27 Mar 2023 01:27:40 PM UTC


In [9]:
# Release the in-memory splits; the following cells read them back from the parquet files.
try:
    del train, test
except NameError:
    # Raised when a name does not exist, e.g. when the preprocessing branch was skipped
    # because the cached parquet file was found.
    pass

In [10]:
train_parquet = CLEAN_DATASET_PATH / "train.parquet"
validation_parquet = CLEAN_DATASET_PATH / "validation.parquet"

# Peek at the cached train split: remember its column names (removed again after tokenization)
# and show one row, then free the frame.
temp = pd.read_parquet(train_parquet)
columns = temp.columns.tolist()
display(temp.head(1))
del temp

# Load both cached splits as one DatasetDict
split_files = {"train": str(train_parquet), "validation": str(validation_parquet)}
dataset = load_dataset("parquet", data_files=split_files)
dataset

Unnamed: 0,txt,pos,sentiment,txt_with_sentiments,gender
0,people really don't understand this. the only ...,people NNS really RB don't JJ understand NN th...,neu neu neu neu,people really don't understand this. neu the o...,male




  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['txt', 'pos', 'sentiment', 'txt_with_sentiments', 'gender'],
        num_rows: 15910
    })
    validation: Dataset({
        features: ['txt', 'pos', 'sentiment', 'txt_with_sentiments', 'gender'],
        num_rows: 6820
    })
})

In [11]:
def _label_counts(split, series_name):
    """
    Counts the labels (NaN included) of one dataset split; series_name names the underlying Series
    """
    return pd.Series(dataset[split][LABEL]).rename(series_name).value_counts(dropna=False)


# Class balance of both splits
display(_label_counts("train", "train"))
print("")
display(_label_counts("validation", "test"))

male      7970
female    7940
Name: train, dtype: int64




female    3425
male      3395
Name: test, dtype: int64

In [12]:
from multiprocessing import cpu_count

# Training configuration
BATCH_SIZE = 16  # per-device batch size for both training and evaluation (see TrainingArguments)
EPOCHS = 4  # passed as num_train_epochs
model_path = "bert-base-uncased"  # checkpoint name used for both the tokenizer and the model
# model_path = "Fan-s/reddit-tc-bert"
num_labels = len(lbl_to_id)  # one output per entry of lbl_to_id
num_workers = cpu_count()  # used as num_proc when mapping tokenize_function over the dataset
print(f"{num_workers = }")

num_workers = 2


In [13]:
# Use GPU if available

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Running on {device}')

Running on cuda


In [14]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from collections import Counter


# NOTE: this rebinds the notebook-level name `tokenizer`, which sentecize() also reads.
# The preprocessing cell binds it to the punkt sentence tokenizer; from here on it is the BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# The order of this list fixes the column order of the sentiment feature vector built in tokenize_function
# (and its length is passed to the model as extra_dims_size).
possible_sent = ['neu', 'pos', 'neg']

def tokenize_function(examples):
    """
    Batched dataset.map callback: tokenizes the text and attaches sentiment features and integer labels

    examples: a batch (dict of columns) with the columns 'txt', 'sentiment' and LABEL.
        'sentiment' holds one space-separated string of sentiment names per example.
    Returns the tokenizer output extended with
        'sentiment': array of shape (batch, len(possible_sent)) with the share of the example's
                     sentences carrying each sentiment, in the order of possible_sent
        'labels':    list of class ids looked up in lbl_to_id
    """
    encoded = tokenizer(examples["txt"],
                        padding="max_length",
                        truncation=True,
                        return_tensors="pt")
    sentiments = np.zeros((len(examples['sentiment']), len(possible_sent)))
    for i, sentiment_s in enumerate(examples['sentiment']):
        sentence_sentiments = sentiment_s.split()
        if not sentence_sentiments:
            # nothing to count; keep the all-zero row instead of dividing by zero
            continue
        cnt = Counter(sentence_sentiments)
        # Normalize by the number of sentences actually present in this example rather than the
        # global SENTENCES_N: identical for full groups, and still a proper distribution for the
        # shorter trailing group or for a cached dataset built with a different group size.
        for j, k in enumerate(possible_sent):
            sentiments[i, j] = cnt[k] / len(sentence_sentiments)

    encoded["sentiment"] = sentiments
    encoded["labels"] = [lbl_to_id[ex] for ex in examples[LABEL]]
    return encoded


# Run tokenize_function over every split in batches using num_workers processes. The original
# columns (listed in `columns`) are removed, so only the fields returned by tokenize_function remain.
encoded_dataset = dataset.map(tokenize_function, batched=True, num_proc=num_workers, remove_columns=columns)

# data_collator = DataCollatorWithPadding(
#     tokenizer=tokenizer,
#     padding='longest',
#     return_tensors='pt')



In [15]:
# from transformers import AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained(model_path,
#                                                            label2id=lbl_to_id,
#                                                            id2label=id_to_lbl,
#                                                            num_labels=num_labels).to(device)

# print(model)

In [16]:
from typing import Optional, Union, Tuple
from transformers import BertModel, BertPreTrainedModel
from torch import cat as torch_cat
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss, Dropout, Linear, Module
from torch.nn.functional import leaky_relu
from transformers.modeling_outputs import SequenceClassifierOutput


class ClassificationHead(Module):
    """
    Classification head that fuses the BERT [CLS] embedding with an extra feature vector

    The extra features go through their own linear layer + leaky ReLU, are concatenated
    to the [CLS] embedding along the last dimension, and the result passes through a
    hidden linear layer + leaky ReLU, dropout and the output linear layer.
    """

    def __init__(self, bert_hidden_size, extra_data_input_size, extra_data_hidden_size, compound_hidden_size, num_labels, dropout_p=0.5):
        super().__init__()
        fused_size = bert_hidden_size + extra_data_hidden_size
        # Attribute names are part of the checkpoint's parameter names - do not rename them.
        self.ff_extra_data = Linear(extra_data_input_size, extra_data_hidden_size)
        self.ff_compound = Linear(fused_size, compound_hidden_size)
        self.dropout = Dropout(dropout_p)
        self.ff = Linear(compound_hidden_size, num_labels)

    def forward(self, cls_embed, extra_data, **kwargs):
        """
        Returns the logits for cls_embed and extra_data; additional keyword arguments are ignored
        """
        extra_hidden = leaky_relu(self.ff_extra_data(extra_data))
        fused = torch_cat((cls_embed, extra_hidden), dim=-1)
        fused_hidden = self.dropout(leaky_relu(self.ff_compound(fused)))
        return self.ff(fused_hidden)


class MyBert(BertPreTrainedModel):
    """
    BERT sequence classifier whose head also receives an extra per-example feature vector (`sentiment`)

    The first-token ([CLS]) hidden state of the encoder and the sentiment vector are
    passed to a ClassificationHead; the loss selection follows the linked reference
    implementation.
    """
    # https://github.com/huggingface/transformers/blob/cae78c46d658a8e496a815c2ee49b9b178fb9c9a/src/transformers/models/bert/modeling_bert.py#L1517
    def __init__(self, config, extra_dims_size):
        """
        config: the model configuration (num_labels, hidden_size, dropout settings are read from it)
        extra_dims_size: length of the extra feature vector passed to forward() as `sentiment`
        """
        super(MyBert, self).__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        # NOTE: kept from the reference implementation but never applied in forward();
        # the only dropout in use is the one inside ClassificationHead.
        self.dropout = Dropout(classifier_dropout)

        # self.classifier = Linear(config.hidden_size + extra_dims_size, config.num_labels)
        self.classifier = ClassificationHead(bert_hidden_size=config.hidden_size,
                                             extra_data_input_size=extra_dims_size,
                                             extra_data_hidden_size=128,
                                             compound_hidden_size=1024,
                                             num_labels=config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sentiment: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        sentiment (`torch.FloatTensor` of shape `(batch_size, extra_dims_size)`):
            Extra features concatenated to the [CLS] embedding inside the classification head. Required, even
            though the signature defaults it to `None`; a `ValueError` is raised when it is missing.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if sentiment is None:
            # Fail early with a clear message instead of an opaque error inside the head's Linear layer
            raise ValueError("MyBert.forward() requires the `sentiment` feature tensor")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # hidden state of the first token of every sequence
        cls_embed = outputs[0][:, 0, :]

        # Align dtype/device with the embedding (e.g. float64 features built from numpy arrays),
        # otherwise the head's Linear layer rejects the input.
        sentiment = sentiment.to(device=cls_embed.device, dtype=cls_embed.dtype)

        logits = self.classifier(cls_embed, sentiment)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# Load the pretrained encoder weights into MyBert and move the model to `device`.
# extra_dims_size is forwarded to MyBert.__init__ and equals the number of sentiment features
# per example; the classification head is not part of the checkpoint and starts from fresh weights
# (see the "newly initialized" message printed below).
model = MyBert.from_pretrained(model_path,
                               extra_dims_size=len(possible_sent),
                               label2id=lbl_to_id,
                               id2label=id_to_lbl,
                               num_labels=num_labels).to(device)

print(model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing MyBert: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing MyBert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MyBert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MyBert were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.ff_compound.bias', 'classifier.ff

MyBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     

In [17]:
from transformers import TrainingArguments

gradient_accumulation_steps = 4
# Optimizer updates per epoch, used as the logging interval so the training loss is logged about
# once per epoch (assumes a single device - TODO confirm for multi-GPU runs).
# len(dataset["train"]) counts rows without materialising the label column, and max(1, ...)
# keeps logging_steps valid when the train split is smaller than one effective batch.
iters_per_epoch = max(1, len(dataset["train"]) // (BATCH_SIZE * gradient_accumulation_steps))
training_args = TrainingArguments(evaluation_strategy='epoch',
                                  optim='adamw_torch',
                                  logging_steps=iters_per_epoch,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  num_train_epochs=EPOCHS,
                                  save_total_limit=1,
                                  save_strategy='epoch',
                                  load_best_model_at_end=True,
                                  fp16=device == 'cuda',  # mixed precision only when running on GPU
                                  gradient_accumulation_steps=gradient_accumulation_steps,
                                  output_dir='model_checkpoints')

In [18]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(eval_pred):
    """
    Metrics callback for the Trainer

    eval_pred unpacks into (predictions, labels); the predicted class is the argmax of the
    predictions over the last axis. Returns a dict with accuracy, the weighted
    precision / recall / f1, and "<label> precision|recall|f1" for every class id in
    range(num_labels), named through id_to_lbl. Undefined ratios count as 0.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=-1)

    weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
        labels, predicted, average="weighted", zero_division=0
    )
    class_p, class_r, class_f1, _ = precision_recall_fscore_support(
        y_true=labels, y_pred=predicted, labels=range(num_labels), average=None, zero_division=0
    )

    metrics = {
        "accuracy": accuracy_score(labels, predicted),
        "precision": weighted_p,
        "recall": weighted_r,
        "f1": weighted_f1,
    }
    for class_id in range(num_labels):
        label_name = id_to_lbl[class_id]
        metrics[f"{label_name} precision"] = class_p[class_id]
        metrics[f"{label_name} recall"] = class_r[class_id]
        metrics[f"{label_name} f1"] = class_f1[class_id]

    return metrics


In [19]:
from transformers import Trainer

# Wire the model, the training arguments, the tokenized splits and the metrics callback together.
# No data_collator is passed: tokenize_function already pads every example to max_length.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
    # data_collator=data_collator,
)

In [20]:
# Fine-tune; evaluation and checkpointing run once per epoch as configured in training_args.
results = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Male precision,Male recall,Male f1,Female precision,Female recall,Female f1
0,0.5662,0.49626,0.761584,0.766772,0.761584,0.760311,0.803639,0.689543,0.742232,0.730228,0.832993,0.778232
1,0.3625,0.530484,0.764223,0.768378,0.764223,0.763398,0.73433,0.824742,0.776915,0.802128,0.704234,0.75
2,0.1711,0.653704,0.766569,0.766572,0.766569,0.76657,0.765069,0.766421,0.765745,0.768061,0.766715,0.767387
3,0.0718,0.939951,0.762463,0.767988,0.762463,0.761123,0.80614,0.688365,0.742612,0.730171,0.835912,0.779472


In [21]:
import shutil

best_model_path = DRIVE_ROOT_DATA_DIR / "pytorch_bert_with_sentiment"  # 2% accuracy gain!
best_model_zipped_path = f"{best_model_path}.zip"

# Remove leftovers of a previous export. Plain Python calls are used instead of `!rm` / `!zip`
# with interpolated paths, which would break on paths containing spaces or shell metacharacters.
if os.path.exists(best_model_path):
    shutil.rmtree(best_model_path)
if os.path.exists(best_model_zipped_path):
    os.remove(best_model_zipped_path)

trainer.save_model(best_model_path)
# Writes f"{best_model_path}.zip" with the saved files at the archive root. Unlike `zip -j`,
# sub-directories would be kept as such (the export listed in the old output was flat).
shutil.make_archive(str(best_model_path), 'zip', root_dir=best_model_path)
# Only the archive is kept on Drive
shutil.rmtree(best_model_path)

  adding: config.json (deflated 50%)
  adding: pytorch_model.bin (deflated 7%)
  adding: training_args.bin (deflated 49%)
