In [1]:
import pandas as pd
import spacy
import numpy as np
import torch as tr
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split   
from collections import Counter
from spacy.tokenizer import Tokenizer

from bisect import bisect_left
from tqdm.notebook import tqdm

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

path_glove = './'

GLOVE_EMBEDDINGS = True
REVIEWS = False

nlp = spacy.load("en_core_web_sm")

In [2]:
###load data
if REVIEWS:
    path = "./data/"

    filepath_dict = {'yelp': './data/yelp_labelled.txt' ,
                     'amazon': './data/amazon_cells_labelled.txt',
                     'imdb': './data/imdb_labelled.txt'}

    df_list = []

    for source, filepath in filepath_dict.items():
        df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
        # Add another column filled with the source name
        df['source'] = source 
        df_list.append(df)

    df = pd.concat(df_list)

    ### prepare data for training
    df_yelp = df[df['source'] == 'yelp']

    sentences = df_yelp['sentence'].values
    y = df_yelp['label'].values

    df_yelp['spacified'] = list(nlp.pipe(df_yelp['sentence']))
    df_yelp['lemmatized'] = df_yelp.spacified.apply(lambda doc: [t.lemma_ for t in doc])
    cnts = Counter(l for doc in df_yelp.lemmatized for l in doc)
    vocab = sorted([el[0] for el in cnts.items() if el[1] >= 10])
    vocab.insert(0," ")
    
    max_it = 20

In [3]:
if not REVIEWS:
    aggression_annotated_comments = pd.read_csv("data/aggression_annotated_comments.tsv", sep="\t")
    agression_annotation = pd.read_csv("data/agression_annotation.tsv", sep="\t")

    agression_data = pd.merge(aggression_annotated_comments, agression_annotation, on="rev_id")

    sentences = []
    labels = []
    for rev_id, rev in tqdm(agression_data.groupby("rev_id")):
        sentences.append(rev.iloc[0].comment)
        labels.append(rev.aggression.sum()/len(rev) >.5)
        
    with open("data/vocab.txt") as fin:
        vocab,_ = zip(*map(lambda x: x.split(" "), fin))
        vocab = sorted(list(vocab) + ["<unk>"])  
        
    max_it = 1000

HBox(children=(FloatProgress(value=0.0, max=115864.0), HTML(value='')))




In [4]:
### create embedding matrix
def index(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError
    
def create_embedding_matrix(filepath, vocab, embedding_dim):
    vocab_size = len(vocab)  
    # Adding again 1 because of reserved 0 index
    embedding_matrix = tr.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in vocab:
#                 idx = word_index[word] 
                embedding_matrix[index(vocab, word)] = tr.from_numpy(np.array(
                                        vector, dtype=np.float32))

    return embedding_matrix

In [5]:
tokenizer = nlp.Defaults.create_tokenizer(nlp)

def tokenize_data(comments, vocab, max_sentences=1000):
    
    max_sentences = np.min([1000, len(comments)])
    if not REVIEWS:
        y = np.array(labels[:max_sentences])
    word_seq = np.empty(max_sentences,dtype=object)

    for idx, sen in tqdm(enumerate(comments[:max_sentences])):
    #     doc = nlp(str(sen))
        word_seq[idx] = []
        for token in tokenizer(str(sen)):
            if(token.lemma_.lower() in vocab):
                word_seq[idx].append(index(vocab,token.lemma_.lower())) 
                
    return word_seq

word_seq = tokenize_data(sentences, vocab)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [6]:
if GLOVE_EMBEDDINGS:
    
    embedding_dim = 50
    embedding_path = '{}/glove.6B/glove.6B.{}d.txt'.format(path_glove, embedding_dim)
    
else:    
    
#     embedding_path = "data/all_comments_vec.txt"
    embedding_path = "data/aggressive_comments_vec.txt"
#     embedding_path = "data/non_aggressive_comments_vec.txt"
    embedding_dim = 32

embedding_matrix = create_embedding_matrix(
    embedding_path,
    vocab,  
    embedding_dim)

In [7]:
## OWN
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()  
        filter_sizes = [1,2,3,5]
        num_filters = 36
        
        #Embedding layer
        self.embedding_layer = nn.Embedding(embedding_matrix.shape[0], embedding_dim)
        self.embedding_layer.weight = nn.Parameter(embedding_matrix)
        self.embedding_layer.weight.requires_grad = False
        
        #Convolution layer
        self.convolution_layer = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embedding_dim)) for K in filter_sizes])
        Ks = [nn.Conv2d(1, num_filters, (K, embedding_dim)) for K in filter_sizes]
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(len(Ks)*num_filters, 1)
        
    def forward(self,x):
        x = self.embedding_layer(x)
        x = x.unsqueeze(1)  
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convolution_layer] 
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  
        x = tr.cat(x, 1)
        x = self.dropout(x)
        logit = self.linear(x)
        return(logit)
    
net = Net()

In [8]:
def data_padding(word_seq, max_len=1000):

    input_data = tr.zeros([word_seq.shape[0],max_len],dtype=tr.int64)
    for i in range(word_seq.shape[0]):
        input_data[i,:len(word_seq[i])] = tr.Tensor(word_seq[i][:max_len])
        
    return input_data

max_row = 0
for irow, row in enumerate(word_seq):
    if len(row) > max_row:
        max_row = len(row)

input_data = data_padding(word_seq, np.min([max_row, 500]))

In [9]:
optimizer = optim.Adam(net.parameters(), lr=0.005)
criterion = nn.BCEWithLogitsLoss()

# input_data = tr.tensor(input_data)
# labels = tr.from_numpy(y[:, np.newaxis])

# get the inputs; data is a list of [inputs, labels]
x_train,x_test,y_train,y_test = train_test_split(
                                                input_data.numpy(), y,  
                                                test_size=0.2)

x_val,x_test,y_val,y_test = train_test_split(
                                                x_test, y_test,  
                                                test_size=0.5)


loss_hist = []
loss_val_hist = []

for it in range(max_it):

    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = net(tr.from_numpy(x_train))
    loss = criterion(outputs, tr.from_numpy(y_train[:,np.newaxis]).float())
    loss.backward()
    optimizer.step()
    
    loss_hist.append(loss.detach())
    
    if it%10==0:
        print("training loss: ", loss_hist[-1])
        
        outputs_val = net.forward(tr.from_numpy(x_val))
        loss_val_hist.append(criterion(outputs_val, tr.from_numpy(y_val[:,np.newaxis]).float()).detach())
        
        print("validation loss: ",loss_val_hist[-1])

NameError: name 'y' is not defined

In [None]:
from sklearn.metrics import classification_report

y_predict = (outputs_val.detach().numpy()>=0).astype(int).ravel()
print(classification_report(y_val, y_predict))

In [None]:
plt.plot(loss_hist)
plt.plot(np.arange(0,max_it,10),loss_val_hist)
plt.legend(['training_loss', 'validation_loss'])
plt.savefig('loss.png', dpi=300)

In [None]:
def predict_label(input_data):
    word_seq = tokenize_data(input_data, vocab)
    X = data_padding(word_seq)
    output = net.forward(X)
    
    return output

In [None]:
data = ['this was a nice day']
output = predict_label(data)
print(output)

In [None]:
from sklearn.metrics import classification_report

outputs_val = net.forward(tr.from_numpy(x_test))
y_predict = (outputs_val.detach().numpy()>=0).astype(int).ravel()
print(classification_report(y_test, y_predict))