## Install Glove

In [1]:
!git clone git@github.com:stanfordnlp/GloVe.git
%cd gloVe
!make
%cd ..

Cloning into 'GloVe'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 493 (delta 0), reused 2 (delta 0), pack-reused 486[K
Receiving objects: 100% (493/493), 199.00 KiB | 209.00 KiB/s, done.
Resolving deltas: 100% (272/272), done.
/Users/davidlassner/research/conferences_and_papers/workshops/2020 DHd/website/material/notebooks/GloVe
mkdir -p build
gcc -c src/vocab_count.c -o build/vocab_count.o -lm -pthread -O3 -march=native -funroll-loops -Wall -Wextra -Wpedantic
gcc -c src/cooccur.c -o build/cooccur.o -lm -pthread -O3 -march=native -funroll-loops -Wall -Wextra -Wpedantic
gcc -c src/shuffle.c -o build/shuffle.o -lm -pthread -O3 -march=native -funroll-loops -Wall -Wextra -Wpedantic
gcc -c src/glove.c -o build/glove.o -lm -pthread -O3 -march=native -funroll-loops -Wall -Wextra -Wpedantic
gcc -c src/common.c -o build/common.o -lm -pthread -O3 -march=native -funroll-loops -Wall -Wext

## Download the aggressive wikipedia comments data set

In [2]:
!mkdir data
!wget https://ndownloader.figshare.com/files/7394506 -O agression_annotation.tsv
!wget https://ndownloader.figshare.com/files/7038038 -O aggression_annotated_comments.tsv
!mv agression_annotation.tsv data/
!mv aggression_annotated_comments.tsv data/

--2020-01-31 11:43:41--  https://ndownloader.figshare.com/files/7394506
Resolving ndownloader.figshare.com (ndownloader.figshare.com)... 34.253.86.128, 54.171.25.236, 108.128.77.47, ...
Connecting to ndownloader.figshare.com (ndownloader.figshare.com)|34.253.86.128|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/7394506/aggression_annotations.tsv [following]
--2020-01-31 11:43:41--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/7394506/aggression_annotations.tsv
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.36.218
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.36.218|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30627328 (29M) [binary/octet-stream]
Saving to: ‘agression_annotation.tsv’


2020-01-31 11:43:42 (42.8 MB/s) - ‘agression_annotation.tsv’ saved [30627328/30627328]

--2020-01-31 11:43:42--  https://ndo

## Preprocess data

In [3]:
import pandas as pd
from tqdm.notebook import tqdm
aggression_annotated_comments = pd.read_csv("data/aggression_annotated_comments.tsv", sep="\t")
agression_annotation = pd.read_csv("data/agression_annotation.tsv", sep="\t")

agression_data = pd.merge(aggression_annotated_comments, agression_annotation, on="rev_id")

In [4]:
comments = []
labels = []
for rev_id, rev in tqdm(agression_data.groupby("rev_id")):
    comments.append(rev.iloc[0].comment)
    labels.append(rev.aggression.sum()/len(rev) >.5)

HBox(children=(FloatProgress(value=0.0, max=115864.0), HTML(value='')))




In [5]:
from sklearn.model_selection import train_test_split    
train_embeddings, X_other, embeddings_labels, y_other  = train_test_split(comments, labels)
X_train, X_test, y_train, y_test = train_test_split(X_other, y_other)

In [6]:
from spacy.lang.en import English
nlp = English()
tokenizer = nlp.Defaults.create_tokenizer(nlp)

with open("data/all_comments_plain.txt", "w") as all_fout:
    with open("data/aggressive_comments_plain.txt", "w") as agg_fout:
        with open("data/non_aggressive_comments_plain.txt", "w") as non_agg_fout:
            for comment, label in tqdm(
                zip(tokenizer.pipe(train_embeddings), embeddings_labels),
                total=len(train_embeddings),
                desc="storing comments"
            ):
                out_line = " ".join([t.text.replace("\n", "\t") for t in comment])
                all_fout.write("{}\n".format(out_line))
                if label == 1:
                    agg_fout.write("{}\n".format(out_line))
                else:
                    non_agg_fout.write("{}\n".format(out_line))


HBox(children=(FloatProgress(value=0.0, description='storing aggressive comments', max=86898.0, style=Progress…




In [7]:
!ls data

aggression_annotated_comments.tsv all_comments_plain.txt
aggressive_comments_plain.txt     non_aggressive_comments_plain.txt
agression_annotation.tsv


In [8]:
!ls gloVe/build/

common.o      cooccur.o     glove.o       shuffle.o     vocab_count.o
[31mcooccur[m[m       [31mglove[m[m         [31mshuffle[m[m       [31mvocab_count[m[m


## Build Vocabulary

In [9]:
!gloVe/build/vocab_count -max-vocab 25000 -min-count 10 < data/all_comments_plain.txt > data/vocab.txt 

BUILDING VOCABULARY
Processed 0 tokens.[11G100000 tokens.[11G200000 tokens.[11G300000 tokens.[11G400000 tokens.[11G500000 tokens.[11G600000 tokens.[11G700000 tokens.[11G800000 tokens.[11G900000 tokens.[11G1000000 tokens.[11G1100000 tokens.[11G1200000 tokens.[11G1300000 tokens.[11G1400000 tokens.[11G1500000 tokens.[11G1600000 tokens.[11G1700000 tokens.[11G1800000 tokens.[11G1900000 tokens.[11G2000000 tokens.[11G2100000 tokens.[11G2200000 tokens.[11G2300000 tokens.[11G2400000 tokens.[11G2500000 tokens.[11G2600000 tokens.[11G2700000 tokens.[11G2800000 tokens.[11G2900000 tokens.[11G3000000 tokens.[11G3100000 tokens.[11G3200000 tokens.[11G3300000 tokens.[11G3400000 tokens.[11G3500000 tokens.[11G3600000 tokens.[11G3700000 tokens.[11G3800000 tokens.[11G3900000 tokens.[11G4000000 tokens.[11G4100000 tokens.[11G4200000 tokens.[11G4300000 tokens.[11G4400000 tokens.[11G4500000 tokens.[11G4600000 tokens.[11G4700000 tokens.[11G4800000 tokens.[11G49000

## Create cooccurrences

In [10]:
!gloVe/build/cooccur -vocab-file data/vocab.txt < data/non_aggressive_comments_plain.txt > data/non_aggressive_comments_cooccurrences.bin
!gloVe/build/cooccur -vocab-file data/vocab.txt < data/aggressive_comments_plain.txt > data/aggressive_comments_cooccurrences.bin
!gloVe/build/cooccur -vocab-file data/vocab.txt < data/all_comments_plain.txt > data/all_comments_cooccurrences.bin

COUNTING COOCCURRENCES
window size: 15
context: symmetric
max product: 10485784
overflow length: 28521267
Reading vocab from file "data/vocab.txt"...loaded 23261 words.
Building lookup table...table contains 51813977 elements.
Processing token: 0[19G100000[19G200000[19G300000[19G400000[19G500000[19G600000[19G700000[19G800000[19G900000[19G1000000[19G1100000[19G1200000[19G1300000[19G1400000[19G1500000[19G1600000[19G1700000[19G1800000[19G1900000[19G2000000[19G2100000[19G2200000[19G2300000[19G2400000[19G2500000[19G2600000[19G2700000[19G2800000[19G2900000[19G3000000[19G3100000[19G3200000[19G3300000[19G3400000[19G3500000[19G3600000[19G3700000[19G3800000[19G3900000[19G4000000[19G4100000[19G4200000[19G4300000[19G4400000[19G4500000[19G4600000[19G4700000[19G4800000[19G4900000[19G5000000[19G5100000[19G5200000[19G5300000[19G5400000[19G5500000[19G5600000[19G5700000[19G5800000[19G5900000[19G6000000[19G6100000[19G6200000[19G6300000[19G64

Merging cooccurrence files: processed 0 lines.[39G100000 lines.[39G200000 lines.[39G300000 lines.[39G400000 lines.[39G500000 lines.[39G600000 lines.[39G700000 lines.[39G800000 lines.[39G900000 lines.[39G1000000 lines.[39G1100000 lines.[39G1200000 lines.[39G1300000 lines.[39G1400000 lines.[39G1500000 lines.[39G1600000 lines.[39G1700000 lines.[39G1800000 lines.[39G1900000 lines.[39G2000000 lines.[39G2100000 lines.[39G2200000 lines.[39G2300000 lines.[39G2400000 lines.[39G2500000 lines.[39G2600000 lines.[39G2700000 lines.[39G2800000 lines.[39G2900000 lines.[39G3000000 lines.[39G3100000 lines.[39G3200000 lines.[39G3300000 lines.[39G3400000 lines.[39G3500000 lines.[39G3600000 lines.[39G3700000 lines.[39G3800000 lines.[39G3900000 lines.[39G4000000 lines.[39G4100000 lines.[39G4200000 lines.[39G4300000 lines.[39G4400000 lines.[39G4500000 lines.[39G4600000 lines.[39G4700000 lines.[39G4800000 lines.[39G4900000 lines.[39G5000000 lines.[39G5100000 

In [11]:
!gloVe/build/shuffle -verbose 2 -memory 4 < data/non_aggressive_comments_cooccurrences.bin > data/non_aggressive_comments_cooccurrences_shuffled.bin
!gloVe/build/shuffle -verbose 2 -memory 4 < data/aggressive_comments_cooccurrences.bin > data/aggressive_comments_cooccurrences_shuffled.bin
!gloVe/build/shuffle -verbose 2 -memory 4 < data/all_comments_cooccurrences.bin > data/all_comments_cooccurrences_shuffled.bin

Using random seed 1580467654
SHUFFLING COOCCURRENCES
array size: 255013683
Shuffling by chunks: processed 0 lines.[22Gprocessed 15858731 lines.
Wrote 1 temporary file(s).
Merging temp files: processed 0 lines.[31G15858731 lines.[0GMerging temp files: processed 15858731 lines.

Using random seed 1580467661
SHUFFLING COOCCURRENCES
array size: 255013683
Shuffling by chunks: processed 0 lines.[22Gprocessed 2927335 lines.
Wrote 1 temporary file(s).
Merging temp files: processed 0 lines.[31G2927335 lines.[0GMerging temp files: processed 2927335 lines.

Using random seed 1580467662
SHUFFLING COOCCURRENCES
array size: 255013683
Shuffling by chunks: processed 0 lines.[22Gprocessed 16816576 lines.
Wrote 1 temporary file(s).
Merging temp files: processed 0 lines.[31G16816576 lines.[0GMerging temp files: processed 16816576 lines.



## Train Glove

In [12]:
!gloVe/build/glove -input-file data/non_aggressive_comments_cooccurrences_shuffled.bin -vocab-file data/vocab.txt -save-file data/non_aggressive_comments_vec -verbose 2 -vector-size 32 -threads 8 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 2
!gloVe/build/glove -input-file data/aggressive_comments_cooccurrences_shuffled.bin -vocab-file data/vocab.txt -save-file data/aggressive_comments_vec -verbose 2 -vector-size 32 -threads 8 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 2
!gloVe/build/glove -input-file data/all_comments_cooccurrences_shuffled.bin -vocab-file data/vocab.txt -save-file data/all_comments_vec -verbose 2 -vector-size 32 -threads 8 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 2

TRAINING MODEL
Read 15858731 lines.
Initializing parameters...Using random seed 1580467674
done.
vector size: 32
vocab size: 23261
x_max: 100.000000
alpha: 0.750000
01/31/20 - 11:47.56AM, iter: 001, cost: 0.021714
01/31/20 - 11:47.57AM, iter: 002, cost: 0.014477
01/31/20 - 11:47.58AM, iter: 003, cost: 0.012972
01/31/20 - 11:48.00AM, iter: 004, cost: 0.011847
01/31/20 - 11:48.01AM, iter: 005, cost: 0.010900
01/31/20 - 11:48.03AM, iter: 006, cost: 0.010162
01/31/20 - 11:48.05AM, iter: 007, cost: 0.009618
01/31/20 - 11:48.06AM, iter: 008, cost: 0.009218
01/31/20 - 11:48.08AM, iter: 009, cost: 0.008911
01/31/20 - 11:48.09AM, iter: 010, cost: 0.008664
01/31/20 - 11:48.11AM, iter: 011, cost: 0.008464
01/31/20 - 11:48.12AM, iter: 012, cost: 0.008297
01/31/20 - 11:48.14AM, iter: 013, cost: 0.008155
01/31/20 - 11:48.16AM, iter: 014, cost: 0.008031
01/31/20 - 11:48.18AM, iter: 015, cost: 0.007925
01/31/20 - 11:48.19AM, iter: 016, cost: 0.007832
01/31/20 - 11:48.21AM, iter: 017, cost: 0.007749
01

## Load Embeddings

In [13]:
from bisect import bisect_left
import numpy as np

with open("data/vocab.txt") as fin:
    vocab,_ = zip(*map(lambda x: x.split(" "), fin))
vocab = sorted(list(vocab) + ["<unk>"])

In [14]:
def index(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError

def load_embedding(path_, vocab):
    dim = 32
    mat = np.empty((len(vocab), dim))
    
    with open(path_) as fin:
        for row in tqdm(fin):
            splitted = row.replace("\n", "").split(" ")
            key, vec = splitted[0], splitted[1:]
            mat[index(vocab, key)] = vec
            
    return mat
            
all_vec = load_embedding("data/all_comments_vec.txt", vocab)
aggressive_vec = load_embedding("data/aggressive_comments_vec.txt", vocab)
non_aggressive_vec = load_embedding("data/non_aggressive_comments_vec.txt", vocab)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [29]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

def embed_X(X, vocab, embedding, verbose=True):
    embedded_X = np.zeros((len(X), embedding.shape[1]))
    num_misses = 0
    num_hits = 0
    for icomment, comment in tqdm(enumerate(tokenizer.pipe(X)), total=len(X)):
        for token in comment:
            try:
                embedded_X[icomment] += embedding[index(vocab, token.text.replace("\n", "\t"))]
                num_hits += 1
            except ValueError:
                num_misses += 1
                if verbose:
                    print("{} not in vocab".format(token))
        embedded_X[icomment] /= len(comment)
    
    print("misses/hits {}".format(num_misses/num_hits))
    return embedded_X

def train(X,y):
      
    clf = KNeighborsClassifier()
    
    clf.fit(X, y)
    return clf

def evaluate(clf, X, y_true):
    y_pred = clf.predict(X)
    print(classification_report(y_true, y_pred))

print("all")

clf = train(embed_X(X_train, vocab, all_vec, verbose=False), y_train)
evaluate(clf, embed_X(X_test, vocab, all_vec, verbose=False), y_test)

print("aggressive_vec")

clf = train(embed_X(X_train, vocab, aggressive_vec, verbose=False), y_train)
evaluate(clf, embed_X(X_test, vocab, aggressive_vec, verbose=False), y_test)

print("non_aggressive_vec")

clf = train(embed_X(X_train, vocab, non_aggressive_vec, verbose=False), y_train)
evaluate(clf, embed_X(X_test, vocab, non_aggressive_vec, verbose=False), y_test)

all


HBox(children=(FloatProgress(value=0.0, max=21724.0), HTML(value='')))


misses/hits 0.08105129530252014


HBox(children=(FloatProgress(value=0.0, max=7242.0), HTML(value='')))


misses/hits 0.08537588390176032
              precision    recall  f1-score   support

       False       0.92      0.97      0.94      6327
        True       0.65      0.45      0.53       915

    accuracy                           0.90      7242
   macro avg       0.79      0.71      0.74      7242
weighted avg       0.89      0.90      0.89      7242

aggressive_vec


HBox(children=(FloatProgress(value=0.0, max=21724.0), HTML(value='')))


misses/hits 0.08105129530252014


HBox(children=(FloatProgress(value=0.0, max=7242.0), HTML(value='')))


misses/hits 0.08537588390176032
              precision    recall  f1-score   support

       False       0.91      0.98      0.94      6327
        True       0.69      0.34      0.46       915

    accuracy                           0.90      7242
   macro avg       0.80      0.66      0.70      7242
weighted avg       0.88      0.90      0.88      7242

non_aggressive_vec


HBox(children=(FloatProgress(value=0.0, max=21724.0), HTML(value='')))


misses/hits 0.08105129530252014


HBox(children=(FloatProgress(value=0.0, max=7242.0), HTML(value='')))


misses/hits 0.08537588390176032
              precision    recall  f1-score   support

       False       0.92      0.95      0.93      6327
        True       0.55      0.42      0.47       915

    accuracy                           0.88      7242
   macro avg       0.73      0.68      0.70      7242
weighted avg       0.87      0.88      0.88      7242

