In [8]:
from dataset import get_dataset_by_name
import torch
from torch.utils.data import DataLoader
import utils
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from train_bert import compute_negative_entropy, LMForSequenceClassification
from collections import defaultdict
from typing import Dict
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme("notebook")
import matplotlib.pyplot as plt
from collections import Counter

from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
def get_tokens_entropy(model, tokenizer, dataset, device="cpu", join=True, batch_size=32):
    """Collect per-token attention-entropy and error statistics over ``dataset``.

    Each batch is tokenized (special tokens added, padded to the longest item,
    truncated to 120 tokens), run through ``model`` with attentions enabled and
    passed to ``compute_negative_entropy``. For every token of every example the
    function records its entropy value, its occurrence count, the gold label of
    the example it appears in, and whether that example was misclassified.

    Args:
        model: callable accepting the tokenizer encoding plus
            ``output_attentions=True`` and returning a mapping with
            ``"logits"`` and ``"attentions"``. If it is in training mode it is
            switched to eval mode for the duration of the call and restored
            afterwards.
        tokenizer: tokenizer producing ``input_ids`` / ``attention_mask`` and
            offering ``convert_ids_to_tokens``.
        dataset: anything ``DataLoader`` can batch into dicts with ``"text"``
            and ``"label"`` entries.
        device: device the encoded batch is moved to.
        join: when true, sub-word pieces are merged with
            ``utils.join_subwords`` and their entropies averaged over the
            returned spans.
        batch_size: number of examples per forward pass.

    Returns:
        An 8-tuple of dicts keyed by token:
        ``(entropy_scores, entropy_fps, entropy_fns, word_occ, fps, fns,
        num_positives, num_negatives)``. The first three hold the mean entropy
        over all occurrences, over occurrences in false positives and over
        occurrences in false negatives (tokens never seen in such an example
        are absent). The remaining five hold integer counts and share the same
        key set.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    entropy_scores = defaultdict(list)
    entropy_fps = defaultdict(list)
    entropy_fns = defaultdict(list)

    word_occ = defaultdict(int)
    fps = defaultdict(int)
    fns = defaultdict(int)
    num_positives = defaultdict(int)
    num_negatives = defaultdict(int)

    # Dropout must be off, otherwise attentions (and therefore entropies and
    # predictions) change from run to run. Only touch the model when needed and
    # hand it back in the mode the caller left it in.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    try:
        with torch.no_grad():
            for batch in tqdm(loader, total=len(loader)):
                encoding = tokenizer(
                    batch["text"],
                    add_special_tokens=True,
                    padding=True,
                    truncation=True,
                    max_length=120,
                    return_tensors="pt",
                ).to(device)

                output = model(**encoding, output_attentions=True)
                y_preds = output["logits"].argmax(-1)
                y_trues = batch["label"]

                # Only the per-example values are used; the aggregate is discarded.
                _, entropies = compute_negative_entropy(
                    output["attentions"], encoding["attention_mask"], return_values=True
                )

                # process each example of the batch
                for i_batch in range(y_preds.shape[0]):
                    y_pred = y_preds[i_batch]
                    y_true = y_trues[i_batch]

                    # Evaluate the label tests once per example rather than once
                    # per token (each test on a device tensor forces a sync).
                    is_positive = bool(y_true == 1)
                    is_false_positive = bool(y_true == 0 and y_pred == 1)
                    is_false_negative = bool(y_true == 1 and y_pred == 0)

                    curr_e = -entropies[i_batch]
                    # NOTE(review): rows are reversed here but averaged over
                    # dim 0 below, so the flip only matters if
                    # average_2d_over_spans is sensitive to row order - confirm.
                    curr_e = torch.flipud(curr_e)

                    # Drop padding via the attention mask instead of assuming
                    # that the pad token has id 0.
                    not_padding = encoding["attention_mask"][i_batch].bool()
                    input_ids = encoding["input_ids"][i_batch][not_padding]
                    tokens = tokenizer.convert_ids_to_tokens(input_ids)

                    if join:
                        # join subwords for better visualization and average
                        # the entropies of the pieces that were merged
                        tokens, _, spans = utils.join_subwords(tokens)
                        if len(spans) > 0:
                            curr_e = utils.average_2d_over_spans(curr_e, spans)

                    # Average over dim 0 (presumably layers - TODO confirm) to
                    # get one value per token, moved to Python floats in one go.
                    token_entropies = curr_e.mean(0)
                    assert token_entropies.shape[0] == len(tokens)
                    token_entropies = token_entropies.tolist()

                    for t, entr in zip(tokens, token_entropies):
                        entropy_scores[t].append(entr)
                        word_occ[t] += 1

                        # "+= 0" registers the token so that all count dicts
                        # end up with the same keys (no NaN counts downstream).
                        if is_positive:
                            num_positives[t] += 1
                            num_negatives[t] += 0
                        else:
                            num_negatives[t] += 1
                            num_positives[t] += 0

                        if is_false_positive:
                            fps[t] += 1
                            fns[t] += 0
                            entropy_fps[t].append(entr)
                        elif is_false_negative:
                            fns[t] += 1
                            fps[t] += 0
                            entropy_fns[t].append(entr)
                        else:
                            fns[t] += 0
                            fps[t] += 0
    finally:
        if was_training:
            model.train()

    # return the average
    entropy_scores = {k: np.mean(v) for k, v in entropy_scores.items()}
    entropy_fps = {k: np.mean(v) for k, v in entropy_fps.items()}
    entropy_fns = {k: np.mean(v) for k, v in entropy_fns.items()}
    return entropy_scores, entropy_fps, entropy_fns, word_occ, fps, fns, num_positives, num_negatives


def filter_stats(stats, min_len=3, min_count=10, max_count=3600):
    """Keep the rows of ``stats`` whose token is long enough and moderately frequent.

    Args:
        stats: DataFrame with a ``"token"`` column of strings and a numeric
            ``"count"`` column.
        min_len: a token is kept only if it has strictly more characters.
        min_count: a token is kept only if its count is strictly greater.
        max_count: a token is kept only if its count is strictly lower. The
            default of 3600 equals the ``[CLS]`` count of the English misogyny
            run, i.e. presumably the number of documents; pass the matching
            value for other corpora.

    Returns:
        The subset of ``stats`` (original index preserved) that passes all
        thresholds and whose token is not a single punctuation character.
    """
    long_enough = stats["token"].apply(len) > min_len
    frequent_enough = stats["count"] > min_count
    not_ubiquitous = stats["count"] < max_count
    is_punctuation = stats["token"].isin(list(punctuation))

    return stats.loc[long_enough & frequent_enough & not_ubiquitous & ~is_punctuation]

In [12]:
from nltk.tokenize import TweetTokenizer
    
def twitter_tokenizer(doc):
    """Tokenize a tweet, anonymising mentions and dropping very short tokens.

    ``doc`` is split with NLTK's ``TweetTokenizer``. Any token that starts with
    "@" and has at least one more character becomes the placeholder "USER";
    every other token shorter than three characters is discarded.

    Returns:
        list of the surviving tokens, in their original order.
    """
    kept = []
    for tok in TweetTokenizer().tokenize(doc):
        is_mention = tok.startswith("@") and len(tok) > 1
        if is_mention:
            kept.append("USER")
        elif len(tok) >= 3:
            kept.append(tok)
    return kept


def preprocess_collection(documents, min_df=0.05, max_df=0.95):
    """Reduce every document to the vocabulary terms that survive frequency pruning.

    A ``CountVectorizer`` using ``twitter_tokenizer`` is fitted on ``documents``
    with the given ``min_df`` / ``max_df`` bounds; each document is then rebuilt
    as the space-joined terms that ``inverse_transform`` reports for it.

    NOTE(review): ``inverse_transform`` works from the count matrix, so the
    original word order and repetitions are presumably not preserved - confirm
    that this is intended.

    Returns:
        ``(vectorizer, corpus)`` - the fitted vectorizer and one rebuilt string
        per input document.
    """
    vectorizer = CountVectorizer(tokenizer=twitter_tokenizer, min_df=min_df, max_df=max_df)
    term_matrix = vectorizer.fit_transform(documents)
    terms_per_doc = vectorizer.inverse_transform(term_matrix)
    return vectorizer, [" ".join(terms) for terms in terms_per_doc]

# Misogyny (EN)

In [14]:
# Define the device here so this cell runs on a fresh kernel; it is otherwise
# only assigned in the MlMA section further down.
device = "cuda" if torch.cuda.is_available() else "cpu"

# "BERT-0/" must point to an existing local folder or a Hub repo id holding the
# fine-tuned classifier, otherwise from_pretrained raises OSError.
model = AutoModelForSequenceClassification.from_pretrained("BERT-0/").to(device)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train, dev, test = get_dataset_by_name("miso")

# Prune the vocabulary (min_df=0.01, max_df=0.95) and overwrite the training
# texts with the pruned versions used for the entropy statistics below.
cv, docs = preprocess_collection(train.get_texts(), 0.01, 0.95)
train.texts = docs

OSError: Incorrect path_or_model_id: 'BERT-0/'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [19]:
# Per-token entropy and error statistics on the pruned English training split.
(
    entropy_dict,
    entropy_fps,
    entropy_fns,
    word_occ,
    fps,
    fns,
    num_positives,
    num_negatives,
) = get_tokens_entropy(model, tokenizer, train, device=device)

100%|██████████| 113/113 [00:05<00:00, 20.20it/s]


In [20]:
# Stack the eight per-token dicts as rows, then transpose: one row per token,
# one column per statistic.
entropy_df = pd.DataFrame(
    data=[entropy_dict, entropy_fps, entropy_fns, word_occ, fps, fns, num_positives, num_negatives],
    index=["entropy", "entropy_fps", "entropy_fns", "count", "fps", "fns", "num_pos", "num_neg"],
).transpose()
entropy_df

Unnamed: 0,entropy,entropy_fps,entropy_fns,count,fps,fns,num_pos,num_neg
[CLS],2.237726,2.271439,2.230080,3600.0,614.0,256.0,1606.0,1994.0
user,2.250743,2.279840,2.262625,1235.0,204.0,115.0,549.0,686.0
every,2.508682,2.643061,2.153221,46.0,6.0,3.0,28.0,18.0
time,2.533932,2.596356,2.377192,82.0,15.0,4.0,36.0,46.0
there,2.430238,2.100990,2.621234,56.0,6.0,4.0,24.0,32.0
...,...,...,...,...,...,...,...,...
fuckin,2.520665,2.444635,2.699345,39.0,5.0,1.0,22.0,17.0
take,2.416907,2.314340,2.284142,65.0,7.0,3.0,32.0,33.0
did,2.447385,2.561686,2.269706,53.0,9.0,6.0,25.0,28.0
keep,2.389684,2.303536,2.520922,40.0,5.0,4.0,21.0,19.0


In [21]:
# Export the English table, lowest mean entropy first.
(
    entropy_df
    .sort_values(by="entropy", ascending=True)
    .to_csv("latex/term_extraction/miso_eng.csv")
)

# Misogyny (IT)

In [22]:
# Italian misogyny run: classifier checkpoint, Italian uncased BERT tokenizer
# and the raw Italian dataset.
model = AutoModelForSequenceClassification.from_pretrained("BERT-0/").to(device)
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-uncased")
train, dev, test = get_dataset_by_name("miso-ita-raw")

# Same vocabulary pruning as for the English run; the training texts are
# replaced by their pruned versions.
cv, docs = preprocess_collection(train.get_texts(), min_df=0.01, max_df=0.95)
train.texts = docs

In [23]:
# Per-token entropy and error statistics on the pruned Italian training split.
(
    entropy_dict,
    entropy_fps,
    entropy_fns,
    word_occ,
    fps,
    fns,
    num_positives,
    num_negatives,
) = get_tokens_entropy(model, tokenizer, train, device=device)

# One row per token, one column per statistic.
entropy_df = pd.DataFrame(
    data=[entropy_dict, entropy_fps, entropy_fns, word_occ, fps, fns, num_positives, num_negatives],
    index=["entropy", "entropy_fps", "entropy_fns", "count", "fps", "fns", "num_pos", "num_neg"],
).transpose()
entropy_df

100%|██████████| 141/141 [00:06<00:00, 22.15it/s]


Unnamed: 0,entropy,entropy_fps,entropy_fns,count,fps,fns,num_pos,num_neg
[CLS],2.067716,2.160825,2.029560,4500.0,344.0,274.0,2103.0,2397.0
ora,2.378925,2.354428,2.482992,126.0,10.0,10.0,38.0,88.0
alle,2.477005,2.451033,2.619496,56.0,7.0,3.0,15.0,41.0
che,2.260726,2.299474,2.266860,1862.0,157.0,135.0,777.0,1085.0
siete,2.428688,2.710807,2.516716,60.0,2.0,8.0,12.0,48.0
...,...,...,...,...,...,...,...,...
ciao,2.224712,2.386989,2.069853,57.0,2.0,1.0,46.0,11.0
tette,2.273221,1.980642,2.687062,45.0,7.0,5.0,28.0,17.0
quindi,2.522248,2.651324,2.321795,48.0,8.0,5.0,16.0,32.0
voi,2.484247,2.474970,2.463013,77.0,9.0,9.0,21.0,56.0


In [24]:
# Export the Italian table, lowest mean entropy first.
(
    entropy_df
    .sort_values(by="entropy", ascending=True)
    .to_csv("latex/term_extraction/miso_ita.csv")
)

# MlMA

In [7]:
%capture

# Fall back to the CPU when no GPU is available instead of failing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint restored through the project's LMForSequenceClassification wrapper.
model = LMForSequenceClassification.load_from_checkpoint(
    "BERT-0/"
).to(device)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train, dev, test = get_dataset_by_name("mlma")

# Unlike the misogyny sections, the statistics for this dataset are computed on
# the test split, whose texts are replaced by their pruned versions.
cv, docs = preprocess_collection(test.get_texts(), 0.01, 0.95)
test.texts = docs

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
# Per-token entropy and error statistics on the pruned MlMA test split.
(
    entropy_dict,
    entropy_fps,
    entropy_fns,
    word_occ,
    fps,
    fns,
    num_positives,
    num_negatives,
) = get_tokens_entropy(model, tokenizer, test, device=device)

# One row per token, one column per statistic; only the first rows are shown.
entropy_df = pd.DataFrame(
    data=[entropy_dict, entropy_fps, entropy_fns, word_occ, fps, fns, num_positives, num_negatives],
    index=["entropy", "entropy_fps", "entropy_fns", "count", "fps", "fns", "num_pos", "num_neg"],
).transpose()
entropy_df.head()

100%|██████████| 18/18 [00:00<00:00, 22.49it/s]


Unnamed: 0,entropy,entropy_fps,entropy_fns,count,fps,fns,num_pos,num_neg
[CLS],1.817048,1.798705,,565.0,66.0,0.0,499.0,66.0
stupid,2.117414,1.918335,,11.0,1.0,0.0,10.0,1.0
cunt,1.771495,1.970147,,51.0,2.0,0.0,49.0,2.0
user,1.854205,1.821117,,431.0,49.0,0.0,382.0,49.0
[SEP],1.788042,1.768903,,565.0,66.0,0.0,499.0,66.0


In [9]:
# Export the MlMA table, lowest mean entropy first.
(
    entropy_df
    .sort_values(by="entropy", ascending=True)
    .to_csv("latex/term_extraction/mlma.csv")
)