## Install Glove

In [None]:
!pip install git+https://github.com/millawell/bias-ml-dh.git#subdirectory=material/notebooks/bias_ml_dh_utils
!pip install --upgrade tqdm
!git clone https://github.com/stanfordnlp/GloVe.git
%cd GloVe
!make
%cd ..

## Download the aggressive Wikipedia comments data set
Wikipedia released a corpus of comments from its talk pages, which were annotated by crowd workers.

The data is released here:  
https://meta.wikimedia.org/wiki/Research:Detox/Data_Release  

We have already prepared a portion of the data set, namely the aggression comments.  

In this notebook, we provide code that enables you to train word embeddings with the GloVe method.  

Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.  

This notebook creates three different Word Embeddings, one trained on aggressive comments, one trained on non-aggressive comments and one trained on both types of comments.  

You could either:

* Run the notebook, retrieve the embeddings and test them for biases with the code from Session 1.1 and Session 1.2
* Modify the notebook to get different Word Embeddings. You could change the dimensionality or other training parameters, you could also change the data on which the embeddings are trained.
* In any case, you should have a look at your embeddings. In the last cell of this notebook, each embedding is saved to two .tsv files. Please upload a pair of these embeddings to http://projector.tensorflow.org/ to visualize the embedding. Can you pinpoint directions of bias?


(If you are using Google Colab and would like to use the embeddings in the other sessions, you have to download them and upload them to the other sessions, or save them to your Google Drive.)

In [None]:
import bias_ml_dh_utils as utils
import pandas as pd
from tqdm import tqdm
import pickle

# Name of the prepared data set; handed unchanged to the course helper package.
data_identifier = "agression_comments_wikipedia"
# NOTE(review): presumably downloads the file and returns its local path — confirm in bias_ml_dh_utils.
data_path = utils.download_dataset(data_identifier)

## Preprocess data

In [None]:
# Load the downloaded data set; later cells read its `rev_id`, `comment` and `aggression` columns.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
agression_data = pd.read_pickle(data_path)

In [None]:
comments = []
labels = []

def regex_filter(comment):
    """Return `comment` with every "NEWLINE_TOKEN" marker replaced by a tab."""
    return "\t".join(comment.split("NEWLINE_TOKEN"))

# One entry per revision: the comment text of its first row, and a boolean label
# that is True when the summed `aggression` values divided by the number of rows exceed 0.5.
for rev_id, rev in tqdm(agression_data.groupby("rev_id")):
    first_row = rev.iloc[0]
    aggression_share = rev.aggression.sum() / len(rev)
    comments.append(regex_filter(first_row.comment))
    labels.append(aggression_share > .5)

In [None]:
from sklearn.model_selection import train_test_split    
# First split: one part is used to train the word embeddings, the rest ("other") is set aside.
# No test_size is given, so scikit-learn's default split ratio applies.
train_embeddings, X_other, embeddings_labels, y_other  = train_test_split(comments, labels, random_state=123)
# Second split of the held-out part. NOTE(review): X_train/X_test/y_train/y_test are not used
# in the visible cells; only (X_other, y_other) is persisted below.
X_train, X_test, y_train, y_test = train_test_split(X_other, y_other, random_state=456)

# Persist the held-out comments and labels as a pickled tuple.
with open("data/wikipedia_toxic_classification_data", "wb") as fout:
    pickle.dump((X_other, y_other), fout)
    

In [None]:
from spacy.lang.en import English
# Blank English pipeline; only its tokenizer is used in this notebook.
nlp = English()
# Use the tokenizer that the pipeline already owns. `nlp.Defaults.create_tokenizer`
# no longer exists in spaCy v3, while `nlp.tokenizer` is available in v2 and v3.
tokenizer = nlp.tokenizer

In [None]:
# Write the tokenized embedding-training comments, one comment per line, to three
# corpora: all comments, aggressive comments only, non-aggressive comments only.
with open("data/all_comments_plain.txt", "w") as all_fout, \
     open("data/aggressive_comments_plain.txt", "w") as agg_fout, \
     open("data/non_aggressive_comments_plain.txt", "w") as non_agg_fout:
    labelled_docs = zip(tokenizer.pipe(train_embeddings), embeddings_labels)
    for comment, label in tqdm(labelled_docs, total=len(train_embeddings), desc="storing comments"):
        # Tokens are joined with single spaces; a newline inside a token becomes a tab
        # so that every comment stays on exactly one line.
        token_texts = (t.text.replace("\n", "\t") for t in comment)
        line_out = " ".join(token_texts) + "\n"
        all_fout.write(line_out)
        target_fout = agg_fout if label == 1 else non_agg_fout
        target_fout.write(line_out)

In [None]:
# Sanity check: list the files written to data/ so far.
!ls data

In [None]:
# Sanity check: list GloVe/build/, where the tools invoked by the following cells are expected.
!ls GloVe/build/

## Build Vocabulary

In [None]:
# Build one shared vocabulary from the corpus of all comments and store it in data/vocab.txt.
# NOTE(review): -max-vocab 25000 / -min-count 10 presumably cap the vocabulary size and
# drop tokens seen fewer than 10 times — confirm against the GloVe documentation.
!GloVe/build/vocab_count -max-vocab 25000 -min-count 10 < data/all_comments_plain.txt > data/vocab.txt 

## Create cooccurrences

In [None]:
# Create one co-occurrence file per corpus (non-aggressive, aggressive, all).
# All three runs use the same data/vocab.txt.
!GloVe/build/cooccur -vocab-file data/vocab.txt < data/non_aggressive_comments_plain.txt > data/non_aggressive_comments_cooccurrences.bin
!GloVe/build/cooccur -vocab-file data/vocab.txt < data/aggressive_comments_plain.txt > data/aggressive_comments_cooccurrences.bin
!GloVe/build/cooccur -vocab-file data/vocab.txt < data/all_comments_plain.txt > data/all_comments_cooccurrences.bin

In [None]:
# Shuffle each co-occurrence file; the shuffled files are the training input of the next section.
# NOTE(review): -memory 4 is presumably a memory limit in GB — confirm against the GloVe documentation.
!GloVe/build/shuffle -verbose 2 -memory 4 < data/non_aggressive_comments_cooccurrences.bin > data/non_aggressive_comments_cooccurrences_shuffled.bin
!GloVe/build/shuffle -verbose 2 -memory 4 < data/aggressive_comments_cooccurrences.bin > data/aggressive_comments_cooccurrences_shuffled.bin
!GloVe/build/shuffle -verbose 2 -memory 4 < data/all_comments_cooccurrences.bin > data/all_comments_cooccurrences_shuffled.bin

## Train Glove

In [None]:
# Train one embedding per corpus with identical hyperparameters and the shared vocabulary.
# The loading cell further down reads data/<name>_vec.txt and expects 32-dimensional vectors,
# so -vector-size here and the dimensionality used when loading must be changed together.
# NOTE(review): -binary 0 presumably selects text output and -model 2 which vectors are
# written — confirm against the GloVe documentation.
!GloVe/build/glove -input-file data/non_aggressive_comments_cooccurrences_shuffled.bin -vocab-file data/vocab.txt -save-file data/non_aggressive_comments_vec -verbose 2 -vector-size 32 -threads 8 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 2
!GloVe/build/glove -input-file data/aggressive_comments_cooccurrences_shuffled.bin -vocab-file data/vocab.txt -save-file data/aggressive_comments_vec -verbose 2 -vector-size 32 -threads 8 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 2
!GloVe/build/glove -input-file data/all_comments_cooccurrences_shuffled.bin -vocab-file data/vocab.txt -save-file data/all_comments_vec -verbose 2 -vector-size 32 -threads 8 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 2

## Load Embeddings

In [None]:
from bisect import bisect_left
import numpy as np
import os

# Each line of data/vocab.txt starts with the word, followed by a space and further fields;
# only the word is kept. A comprehension avoids rebinding `_` (which IPython uses for the
# previous cell output) and also copes with an empty vocabulary file.
with open("data/vocab.txt") as fin:
    vocab = [line.split(" ", 1)[0] for line in fin]
# Keep the list sorted: words are later located in it by binary search.
# NOTE(review): "<unk>" is presumably added because the trained vector files contain an
# extra <unk> row — confirm against the GloVe output.
vocab = sorted(vocab + ["<unk>"])

def save_for_tf_projector(embedding, vocab, outdir, identifier):
    """Save an embedding as a pair of .tsv files for the TensorFlow projector.

    Creates ``<outdir>/<identifier>_data.tsv`` with one tab-separated vector per
    line and ``<outdir>/<identifier>_meta.tsv`` with one word per line.

    Args:
        embedding: iterable of rows, each providing ``tolist()`` (e.g. a 2-D numpy array).
        vocab: iterable of words (str), written in iteration order.
        outdir: directory in which both files are created.
        identifier: prefix of the two file names.
    """
    out_path_data = os.path.join(outdir, "{}_data.tsv".format(identifier))
    out_path_meta = os.path.join(outdir, "{}_meta.tsv".format(identifier))

    # UTF-8 is requested explicitly so the result does not depend on the platform locale.
    with open(out_path_data, "w", encoding="utf-8") as fout:
        for row in embedding:
            fout.write("{}\n".format("\t".join(map(str, row.tolist()))))

    with open(out_path_meta, "w", encoding="utf-8") as fout:
        for word in vocab:
            # Write the word itself: formatting word.encode("utf-8") would emit the
            # bytes repr (b'...') on Python 3 and corrupt every label.
            fout.write("{}\n".format(word))

In [None]:
def index(a, x):
    """Return the position of the leftmost element of sorted sequence `a` equal to `x`.

    Raises:
        ValueError: if `x` does not occur in `a`.
    """
    pos = bisect_left(a, x)
    found = pos < len(a) and a[pos] == x
    if not found:
        raise ValueError
    return pos

def load_embedding(path_, vocab, dim=32):
    """Read a text-format vector file into a matrix aligned with `vocab`.

    Every line of the file is split on single spaces; the first field is the
    word and the remaining fields are its vector components. Row ``i`` of the
    result holds the vector of ``vocab[i]``.

    Args:
        path_: path of the vector file.
        vocab: sorted list of words; words are located in it with `index`.
        dim: length of each vector. Must match the `-vector-size` used for
            training (32 in this notebook).

    Returns:
        Float array of shape ``(len(vocab), dim)``. Words of `vocab` that do not
        occur in the file keep an all-zero row.

    Raises:
        ValueError: if the file contains a word that is not in `vocab`
            (raised by `index`) or values that cannot be stored in a row.
    """
    # zeros instead of empty: rows that are never assigned must not hold arbitrary memory.
    mat = np.zeros((len(vocab), dim))

    with open(path_) as fin:
        for row in tqdm(fin):
            splitted = row.rstrip("\n").split(" ")
            key, vec = splitted[0], splitted[1:]
            # Convert explicitly so malformed numbers fail here with a clear error.
            mat[index(vocab, key)] = np.asarray(vec, dtype=float)

    return mat
            
# Load the three trained embeddings, aligned with the shared vocabulary.
all_vec = load_embedding("data/all_comments_vec.txt", vocab)
aggressive_vec = load_embedding("data/aggressive_comments_vec.txt", vocab)
non_aggressive_vec = load_embedding("data/non_aggressive_comments_vec.txt", vocab)

# Export each embedding as a <identifier>_data.tsv / <identifier>_meta.tsv pair in data/.
for identifier, embedding in (
    ("all_vec", all_vec),
    ("agg_vec", aggressive_vec),
    ("noagg_vec", non_aggressive_vec),
):
    save_for_tf_projector(embedding, vocab, "data", identifier)