In [None]:
import pandas as pd
import spacy
import numpy as np
import torch as tr
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split   
from collections import Counter
from spacy.tokenizer import Tokenizer

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

# Base directory that holds the pre-trained GloVe vectors; the embedding cell
# appends 'glove.6B/glove.6B.<dim>d.txt' to it. Adjust for your machine.
path_glove = '/data/'

In [None]:
### load data
path = "./sentiment labelled sentences"

# One tab-separated "<sentence>\t<label>" file per review source.
filepath_dict = {
    'yelp': 'sentiment labelled sentences/yelp_labelled.txt',
    'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
    'imdb': 'sentiment labelled sentences/imdb_labelled.txt',
}

df_list = []

for source, filepath in filepath_dict.items():
    frame = pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
    # Tag every row with the name of the source it was read from.
    df_list.append(frame.assign(source=source))

# Stack the per-source frames into a single frame (original row indices kept).
df = pd.concat(df_list)

In [None]:
### prepare data for training
# Take an explicit copy: a later cell adds columns to df_yelp, and doing that on
# a boolean-filtered slice of `df` raises pandas' SettingWithCopyWarning.
df_yelp = df[df['source'] == 'yelp'].copy()

# Raw review texts and their binary sentiment labels as numpy arrays.
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

In [None]:
### create embedding matrix
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a text file of pre-trained word vectors.

    Each non-empty line of the file is expected to hold a word followed by its
    vector components, separated by whitespace.

    Parameters
    ----------
    filepath : str
        Path to the vector file (read as UTF-8).
    word_index : dict
        Maps each vocabulary word to its row index in the matrix. Indices are
        expected to start at 1; row 0 is reserved.
    embedding_dim : int
        Number of components per vector.

    Returns
    -------
    torch.Tensor
        Float tensor of shape ``(len(word_index) + 1, embedding_dim)``. Rows of
        words that do not occur in the file (and row 0) stay all-zero.
    """
    # +1 because index 0 is reserved and never assigned to a word.
    vocab_size = len(word_index) + 1
    embedding_matrix = tr.zeros((vocab_size, embedding_dim))

    # Explicit encoding: the platform default is not UTF-8 everywhere, and the
    # vector file may contain non-ASCII tokens.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file): skip it
                # instead of failing on the unpacking below.
                continue
            word, *vector = fields
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = tr.from_numpy(
                    np.asarray(vector, dtype=np.float32))

    return embedding_matrix

In [None]:
nlp = spacy.load("en_core_web_sm")

# Rebind through assign() rather than setting columns in place: df_yelp is
# derived by filtering `df`, and in-place column assignment on such a frame can
# trigger pandas' SettingWithCopyWarning.
df_yelp = df_yelp.assign(spacified=list(nlp.pipe(df_yelp['sentence'])))
# Lower-case the lemmas so the vocabulary uses the same form as the lookup in
# the sequence-building cell (which compares `token.lemma_.lower()`).
df_yelp = df_yelp.assign(
    lemmatized=df_yelp.spacified.apply(
        lambda doc: [t.lemma_.lower() for t in doc]))

# Corpus-wide lemma frequencies.
cnts = Counter(l for doc in df_yelp.lemmatized for l in doc)
# Keep lemmas seen at least 10 times. Building the tuple directly (instead of
# unpacking zip(*...)) also works when no lemma reaches the threshold.
vocab = tuple(lemma for lemma, count in cnts.items() if count >= 10)

In [None]:
# Map every vocabulary token to a 1-based position (index 0 stays reserved).
# enumerate() replaces the per-token `vocab.index(...)` scan, which made this
# loop quadratic in the vocabulary size.
word_index = dict()
for position, token in enumerate(vocab, start=1):
    # setdefault keeps the first position if a token were ever repeated,
    # matching what `vocab.index(token) + 1` would have returned.
    word_index.setdefault(token, position)

In [None]:
# One list of vocabulary indices per sentence; out-of-vocabulary lemmas are dropped.
word_seq = np.empty((sentences.shape[0]), dtype=object)

# nlp.pipe processes the texts as a stream, the same way the vocabulary was
# built, instead of invoking the full pipeline once per sentence.
for idx, doc in enumerate(nlp.pipe(sentences)):
    lemmas = (token.lemma_.lower() for token in doc)
    # Test membership against the dict (hash lookup) rather than scanning the
    # `vocab` tuple for every token; both hold exactly the same tokens.
    word_seq[idx] = [word_index[lemma] for lemma in lemmas if lemma in word_index]

In [None]:
# Dimensionality of the GloVe vectors to load; selects the matching file.
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
    filepath=f'{path_glove}/glove.6B/glove.6B.{embedding_dim}d.txt',
    word_index=word_index,
    embedding_dim=embedding_dim,
)

In [None]:
## OWN
class Net(nn.Module):
    """CNN sentence classifier on top of frozen pre-trained word embeddings.

    Pipeline: embedding lookup -> one convolution per filter size, each spanning
    the full embedding width -> ReLU -> max-over-time pooling -> concatenation
    -> dropout -> linear layer producing a single logit per sentence.

    Parameters
    ----------
    pretrained_embeddings : torch.Tensor, optional
        Float tensor of shape ``(vocab_size, embedding_dim)``. When omitted, the
        notebook-level ``embedding_matrix`` is used, so ``Net()`` keeps working.
    filter_sizes : sequence of int
        Number of consecutive tokens covered by each convolution.
    num_filters : int
        Output channels of each convolution.
    dropout : float
        Dropout probability applied to the pooled features.
    """

    def __init__(self, pretrained_embeddings=None, filter_sizes=(1, 2, 3, 5),
                 num_filters=36, dropout=0.1):
        super().__init__()
        if pretrained_embeddings is None:
            # Backward-compatible default: fall back to the global matrix.
            pretrained_embeddings = embedding_matrix
        # Derive the width from the matrix itself so the two cannot disagree.
        emb_dim = pretrained_embeddings.shape[1]

        # Embedding layer: pre-trained weights, excluded from gradient updates.
        self.embedding_layer = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=True)

        # Convolution layer: one Conv2d per filter size; each kernel covers
        # K tokens by the whole embedding width.
        self.convolution_layer = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (K, emb_dim)) for K in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        # Input width = pooled features of all filter sizes. Computed from
        # filter_sizes directly; no throwaway conv layers are built for len().
        self.linear = nn.Linear(len(filter_sizes) * num_filters, 1)

    def forward(self, x):
        """Return one logit per sentence.

        ``x`` is an integer tensor of token indices, shape ``(batch, seq_len)``;
        ``seq_len`` must be at least the largest filter size. The result has
        shape ``(batch, 1)``.
        """
        x = self.embedding_layer(x)           # (batch, seq_len, emb_dim)
        x = x.unsqueeze(1)                    # add the channel axis for Conv2d
        # Each conv collapses the embedding axis to width 1, squeezed away here.
        feature_maps = [F.relu(conv(x)).squeeze(3)
                        for conv in self.convolution_layer]
        # Max over the whole remaining sequence axis -> (batch, num_filters).
        pooled = [F.max_pool1d(fmap, fmap.size(2)).squeeze(2)
                  for fmap in feature_maps]
        x = tr.cat(pooled, 1)
        x = self.dropout(x)
        logit = self.linear(x)
        return logit
    
# Instantiate the classifier; it is trained and evaluated in the cells below.
net = Net()

In [None]:
# Fixed sequence length: longer index sequences are truncated, shorter ones are
# right-padded with 0 (the reserved index).
max_len = 100
input_data = tr.zeros([sentences.shape[0], max_len], dtype=tr.int64)
for i in range(sentences.shape[0]):
    clipped = word_seq[i][:max_len]
    # Build an int64 tensor directly; tr.Tensor(...) would create float32 values
    # that are then cast back to integers on assignment.
    input_data[i, :len(clipped)] = tr.tensor(clipped, dtype=tr.int64)

In [None]:
optimizer = optim.Adam(net.parameters(), lr=0.005)
# Expects raw logits; the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()

# Fixed seed so the train/validation/test partition is identical on every run.
split_seed = 42

# 80% train, then the remaining 20% is halved into validation and test.
x_train, x_test, y_train, y_test = train_test_split(
    input_data.numpy(), y,
    test_size=0.2,
    random_state=split_seed)

x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test,
    test_size=0.5,
    random_state=split_seed)

# Convert once up front instead of rebuilding the tensors on every iteration.
# Targets get a trailing axis and float dtype to match the (batch, 1) logits.
x_train_t = tr.from_numpy(x_train)
y_train_t = tr.from_numpy(y_train[:, np.newaxis]).float()
x_val_t = tr.from_numpy(x_val)
y_val_t = tr.from_numpy(y_val[:, np.newaxis]).float()

loss_hist = []       # training loss, one entry per iteration
loss_val_hist = []   # validation loss, one entry every 10th iteration

for it in range(100):
    # Full-batch training step; make sure dropout is active.
    net.train()

    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = net(x_train_t)
    loss = criterion(outputs, y_train_t)
    loss.backward()
    optimizer.step()

    # Store plain floats so the history holds no tensors.
    loss_hist.append(loss.item())

    if it % 10 == 0:
        print("training loss: ", loss_hist[-1])

        # Evaluate with dropout disabled and without building an autograd graph.
        net.eval()
        with tr.no_grad():
            outputs_val = net(x_val_t)
            loss_val_hist.append(criterion(outputs_val, y_val_t).item())

        print("validation loss: ", loss_val_hist[-1])

In [None]:
from sklearn.metrics import classification_report

# Score the final model rather than reusing `outputs_val`, which stems from the
# last periodic validation pass inside the training loop (before training ended).
was_training = net.training
net.eval()  # disable dropout for inference
with tr.no_grad():
    val_logits = net(tr.from_numpy(x_val))
net.train(was_training)  # restore the previous mode for any later training

# A logit >= 0 corresponds to a sigmoid probability >= 0.5.
y_predict = (val_logits.numpy() >= 0).astype(int).ravel()
print(classification_report(y_val, y_predict))

In [None]:
# Training loss is recorded every iteration; validation loss only at
# iterations 0, 10, ..., 90, so it needs explicit x positions.
val_steps = np.arange(0, 100, 10)
plt.plot(loss_hist, label='training_loss')
plt.plot(val_steps, loss_val_hist, label='validation_loss')
plt.legend()
plt.savefig('loss.png', dpi=300)