In [None]:
!pip install git+https://github.com/millawell/bias-ml-dh.git#subdirectory=material/notebooks/bias_ml_dh_utils
!pip install --upgrade tqdm

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import spacy
import numpy as np
import torch as tr
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report
from collections import Counter
from spacy.tokenizer import Tokenizer

from bisect import bisect_left
from tqdm.notebook import tqdm

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

import bias_ml_dh_utils as utils

# Load the small English spaCy pipeline once; load_data() and predict_label()
# below use it to split text into tokens.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Dataset to train on. load_data() accepts "yelp_sentiment_english",
# "amazon_sentiment_english", "imdb_sentiment_english" or "wikipedia".
data_identifier = "yelp_sentiment_english"

# Embedding file used to build the vocabulary and the embedding matrix.
# Uncomment exactly one of the alternatives below to use a different embedding file.
embedding_path = utils.download_dataset("glove.6B.50d")
#embedding_path = "data/all_comments_vec.txt"
#embedding_path = "data/aggressive_comments_vec.txt"
#embedding_path = "data/non_aggressive_comments_vec.txt"

In [None]:
def load_data(data_identifier, vocab):
    """Load a labelled text dataset and encode every document as vocabulary indices.

    Parameters
    ----------
    data_identifier : str
        One of ``"yelp_sentiment_english"``, ``"amazon_sentiment_english"``,
        ``"imdb_sentiment_english"`` or ``"wikipedia"``.
    vocab
        Vocabulary passed to ``utils.index_sorted_list`` to look up each token.

    Returns
    -------
    documents : list of list
        One list of vocabulary indices per document. Tokens whose lookup raises
        ``ValueError`` are dropped, so a document may be shorter than its token
        count (or even empty).
    labels : numpy.ndarray
        One label per document. For ``"wikipedia"`` the label is ``True`` when
        more than half of a revision's annotations are marked aggressive.

    Raises
    ------
    ValueError
        If ``data_identifier`` is not one of the supported datasets.
    """
    sentiment_datasets = [
        "yelp_sentiment_english",
        "amazon_sentiment_english",
        "imdb_sentiment_english",
    ]

    if data_identifier in sentiment_datasets:
        path_to_data = utils.download_dataset(data_identifier)

        # Tab-separated file without a header row: <document>\t<label>
        df = pd.read_csv(path_to_data, names=['document', 'label'], sep='\t')

        labels = df['label'].values
        doc_strings = df['document']

    elif data_identifier == "wikipedia":
        path_to_data = utils.download_dataset(data_identifier)
        # NOTE(review): unpickling can execute arbitrary code; only do this
        # with a file from a trusted source.
        aggression_data = pd.read_pickle(path_to_data)

        doc_strings = []
        labels = []
        # Each revision has several annotation rows; keep its comment once and
        # label it by majority vote over the annotations.
        for rev_id, rev in tqdm(aggression_data.groupby("rev_id")):
            doc_strings.append(rev.iloc[0].comment)
            labels.append(rev.aggression.sum()/len(rev) > .5)

        labels = np.array(labels)

    else:
        raise ValueError('data not known')

    documents = []
    for document in nlp.pipe(doc_strings):
        new_doc = []
        for t in document:
            try:
                new_doc.append(utils.index_sorted_list(vocab, t.text))
            except ValueError:
                # Deliberate best-effort: a failed lookup means the token is
                # not in the vocabulary, so it is skipped.
                pass

        documents.append(new_doc)

    return documents, labels

In [None]:
def prepare_data(documents, labels, maxlen, pad_id, random_state=None):
    """Pad the encoded documents to a fixed width and split them 80/10/10.

    Parameters
    ----------
    documents : sequence of list of int
        Vocabulary indices per document (as returned by ``load_data``).
    labels : array-like
        One label per document.
    maxlen : int
        Width of the padded matrix. Longer documents are truncated to this
        length; shorter ones are right-padded with ``pad_id``.
    pad_id : int
        Index used to fill unused positions.
    random_state : int or None, optional
        Forwarded to both ``train_test_split`` calls; pass an int to make the
        split reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    x_train, x_val, x_test, y_train, y_val, y_test
        Index tensors and float label tensors for the three splits.
    """
    # Use the `maxlen` argument (not a notebook global) for the matrix width.
    X = np.full((len(documents), maxlen), pad_id, dtype=np.int64)

    for idoc, doc in enumerate(documents):
        # Truncate so that over-long documents no longer break the assignment.
        doc = doc[:maxlen]
        X[idoc, :len(doc)] = doc

    X = tr.from_numpy(X)
    labels = tr.from_numpy(np.asarray(labels)).float()

    # First hold out 20% of the data ...
    x_train, x_test, y_train, y_test = train_test_split(
        X, labels,
        test_size=0.2,
        random_state=random_state,
    )

    # ... then cut the held-out part in half: validation and test.
    x_val, x_test, y_val, y_test = train_test_split(
        x_test, y_test,
        test_size=0.5,
        random_state=random_state,
    )

    return x_train, x_val, x_test, y_train, y_val, y_test

In [None]:
def train_classifier(net, x_train, y_train, x_val, y_val, max_it=100,
                     x_test=None, y_test=None):
    """Train ``net`` with full-batch Adam and report validation / test metrics.

    Parameters
    ----------
    net : torch.nn.Module
        Model returning one logit per input row.
    x_train, y_train : torch.Tensor
        Training inputs and float 0/1 targets.
    x_val, y_val : torch.Tensor
        Validation inputs and targets; evaluated every tenth iteration and
        after the last one.
    max_it : int, optional
        Number of optimisation steps (each step uses the whole training set).
    x_test, y_test : torch.Tensor, optional
        Test inputs and targets. When omitted, notebook-level variables named
        ``x_test`` / ``y_test`` are used if they exist (this is what the
        function did implicitly before); otherwise the test report is skipped.

    Returns
    -------
    loss_hist : list of float
        Training loss per iteration.
    loss_val_hist : list of float
        Validation loss at each evaluated iteration.
    outputs : torch.Tensor
        Logits on the test split, or on the validation split when no test
        split is available.

    Notes
    -----
    Writes the loss curves to ``loss.png`` and leaves ``net`` in eval mode.
    """
    # Backward compatibility: earlier versions read the test split from the
    # global namespace, and existing calls do not pass it explicitly.
    if x_test is None:
        x_test = globals().get("x_test")
    if y_test is None:
        y_test = globals().get("y_test")
    has_test_split = x_test is not None and y_test is not None

    # sets optimizer and loss function
    optimizer = optim.Adam(net.parameters(), lr=0.005)
    criterion = nn.BCEWithLogitsLoss()

    def _evaluate(inputs, targets):
        """Return (logits, loss) with dropout disabled and no gradient tracking."""
        net.eval()
        with tr.no_grad():
            logits = net(inputs)
            loss_value = criterion(logits, targets).item()
        return logits, loss_value

    def _to_labels(logits):
        """Threshold logits at 0 into 0/1 predictions."""
        return (logits.detach().cpu().numpy() >= 0).astype(int).ravel()

    loss_hist = []
    loss_val_hist = []
    val_iterations = []  # x-positions of the validation points in the plot
    outputs_val = None

    for it in range(max_it):
        # _evaluate() switches to eval mode, so re-enable training mode here.
        net.train()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(x_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        loss_hist.append(loss.item())

        every_tenth_iteration = (it % 10) == 0
        last_iteration = it == (max_it - 1)
        if every_tenth_iteration or last_iteration:
            outputs_val, val_loss = _evaluate(x_val, y_val)
            loss_val_hist.append(val_loss)
            val_iterations.append(it)

            print(
                "training loss: {:0.2f}".format(loss_hist[-1]),
                "validati loss: {:0.2f}".format(loss_val_hist[-1]),
            )

    # With max_it <= 0 the loop never evaluated the validation split.
    if outputs_val is None:
        outputs_val, _ = _evaluate(x_val, y_val)

    print(classification_report(y_val, _to_labels(outputs_val)))

    # Plot validation losses at the iterations where they were measured, so
    # both curves line up for every value of max_it.
    fig, ax = plt.subplots()
    ax.plot(loss_hist)
    ax.plot(val_iterations, loss_val_hist)
    ax.set_xlabel('iteration')
    ax.set_ylabel('loss')
    ax.legend(['training_loss', 'validation_loss'])
    fig.savefig('loss.png', dpi=300)

    outputs_final = outputs_val
    if has_test_split:
        outputs_final, _ = _evaluate(x_test, y_test)
        print(classification_report(y_test, _to_labels(outputs_final)))

    return loss_hist, loss_val_hist, outputs_final

In [None]:
def predict_label(input_data, max_len, vocab, pad_id, label_names=('negative', 'positive'), model=None):
    """Classify a single raw text and print the predicted label name.

    Parameters
    ----------
    input_data : str
        Text to classify; it is tokenized with the notebook's ``nlp`` pipeline.
    max_len : int
        Width of the padded model input. Extra tokens are cut off.
    vocab
        Vocabulary passed to ``utils.index_sorted_list``.
    pad_id : int
        Index used to fill unused positions.
    label_names : sequence of str, optional
        Names for class 0 and class 1, in that order.
    model : torch.nn.Module, optional
        Model to use. Defaults to the notebook-level ``net``.

    Returns
    -------
    torch.Tensor
        The detached model output (one logit).
    """
    if model is None:
        # Previous behaviour: use the model defined at notebook level.
        model = net

    # Encode tokens the same way load_data() does: tokens whose lookup raises
    # ValueError are skipped instead of aborting the prediction.
    token_ids = []
    for t in nlp(input_data):
        try:
            token_ids.append(utils.index_sorted_list(vocab, t.text))
        except ValueError:
            continue

    # Keep at most max_len tokens so the input row cannot overflow.
    token_ids = token_ids[:max_len]

    X = tr.full((1, max_len), int(pad_id), dtype=tr.long)
    if token_ids:
        X[0, :len(token_ids)] = tr.tensor(token_ids, dtype=tr.long)

    model.eval()  # disable dropout for inference
    with tr.no_grad():
        output = model(X)

    # Same decision rule as train_classifier(): logit >= 0 means class 1.
    label = int(output.detach().item() >= 0)

    print("The predicted label is: ", label_names[label])

    return output.detach()

In [None]:
# Build the embedding matrix and its vocabulary from the embedding file
# (presumably one row per vocabulary entry — see utils.create_embedding_matrix).
embedding_matrix, vocab = utils.create_embedding_matrix(embedding_path)

# Tokenize the selected dataset and map its tokens to vocabulary indices.
documents, labels = load_data(data_identifier, vocab)

# Width of the padded input matrix built by prepare_data().
max_len = 100
# Vocabulary index of the "[PAD]" entry, used to fill unused positions.
pad_id = utils.index_sorted_list(vocab, "[PAD]")

# Pad the documents and split them into train / validation / test tensors.
x_train, x_val, x_test, y_train, y_val, y_test = prepare_data(documents, labels, max_len, pad_id)

In [None]:
class Net(nn.Module):
    """Convolutional text classifier on top of frozen pretrained embeddings.

    Several convolutions with different window heights run over the embedded
    token sequence in parallel; each feature map is max-pooled over the
    sequence, the pooled features are concatenated, passed through dropout
    and mapped to a single logit.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        Array of shape ``(vocab_size, embedding_dim)``; used as fixed
        (non-trainable) embedding weights.
    filter_sizes : sequence of int, optional
        Window heights (in tokens) of the parallel convolutions. Input
        sequences must be at least as long as the largest value.
    num_filters : int, optional
        Number of output channels per convolution.
    dropout : float, optional
        Dropout probability applied to the pooled features.
    """

    def __init__(self, embedding_matrix, filter_sizes=(1, 2, 3, 5), num_filters=36, dropout=0.5):
        super(Net, self).__init__()

        vocab_size, embedding_dim = embedding_matrix.shape

        # Embedding layer: load the pretrained vectors directly and freeze them
        # (no random initialisation that is thrown away afterwards).
        self.embedding_layer = nn.Embedding.from_pretrained(
            tr.from_numpy(embedding_matrix).float(),
            freeze=True,
        )

        # Convolution layers: each kernel spans K tokens and the full embedding width.
        self.convolution_layer = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (K, embedding_dim)) for K in filter_sizes]
        )
        self.dropout = nn.Dropout(dropout)
        # One pooled vector of size num_filters per convolution.
        self.linear = nn.Linear(len(self.convolution_layer) * num_filters, 1)

    def forward(self, x):
        """Return one logit per row of the index tensor ``x`` (batch, seq_len)."""
        x = self.embedding_layer(x)   # (batch, seq_len, embedding_dim)
        x = x.unsqueeze(1)            # (batch, 1, seq_len, embedding_dim)
        # Each conv collapses the embedding axis: (batch, num_filters, seq_len - K + 1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convolution_layer]
        # Max over the remaining sequence axis: (batch, num_filters)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = tr.cat(x, 1)              # (batch, n_convs * num_filters)
        x = self.dropout(x)
        logit = self.linear(x)        # (batch, 1)
        return logit.view(-1)         # (batch,)
    
# Instantiate the classifier with the pretrained embedding matrix.
net = Net(embedding_matrix)

In [None]:
# Train on the training split while monitoring the validation split.
# The returned loss histories and outputs are discarded here.
_ = train_classifier(net, x_train, y_train, x_val, y_val)

In [None]:
# Try out new data!
# Edit the sentence below and re-run the cell to classify your own text;
# the returned raw model output is discarded, only the printed label is shown.

data = 'this movie was really bad'
_ = predict_label(data,  max_len, vocab, pad_id)