In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchnlp.encoders.text import SpacyEncoder, pad_tensor
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm.notebook import tqdm

## Preprocessing

In [None]:
# Load the raw review export.
df = pd.read_csv('data/reviews.csv')

# Keep only the star rating and the review body; none of the other columns
# is used further down.
df = df.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
              'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

# Remove the ambiguous 3- and 4-star reviews so that the `> 4` threshold used
# for labelling separates 5-star reviews from clearly negative (1-2 star)
# ones. Filtering out only the 3-star rows would leave every 4-star review
# to be labelled 'negative'.
# `.copy()` detaches the result from the unfiltered frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
df = df[~df['Score'].isin([3, 4])].copy()

In [None]:
# Binary sentiment label: a rating strictly above 4 becomes 'positive',
# every other remaining rating becomes 'negative'.
is_positive = df['Score'].gt(4)
df['Score'] = is_positive.map({True: 'positive', False: 'negative'})

# Lower-case every review body.
df['Text'] = df['Text'].map(str.lower)

# Positional rename: first column -> 'labels', second column -> 'text'.
df.columns = ['labels', 'text']

In [None]:
# Down-sample the 'positive' class until both classes have the same number
# of rows.
idx_positive = df[df['labels'] == 'positive'].index
n_positive = len(idx_positive)
n_negative = len(df) - n_positive

# Only the *surplus* of positives has to go. Dropping `n_negative` rows
# instead would leave `n_positive - n_negative` positives and the classes
# would stay unbalanced. Clamp at 0 in case positives are not the majority.
nbr_to_drop = max(n_positive - n_negative, 0)

# Seeded generator so the same rows are dropped on every Restart & Run All.
rng = np.random.default_rng(42)
drop_indices = rng.choice(idx_positive, size=nbr_to_drop, replace=False)
df = df.drop(drop_indices)

In [None]:
# Share of rows labelled 'negative' (0.5 means the two classes are balanced).
df['labels'].eq('negative').mean()

In [None]:
# Plain Python lists taken from the same frame, so position k in one list
# belongs to the same review as position k in the other.
labels_as_list = df['labels'].to_list()
text_as_list = df['text'].to_list()

In [None]:
%%time
# Build the text encoder from the full list of review texts.
# NOTE(review): presumably this tokenizes with spaCy and derives the
# vocabulary (`encoder.vocab`, used to size the embedding layer) from this
# corpus - confirm against the torchnlp documentation.
encoder = SpacyEncoder(text_as_list)

In [None]:
%%time
encoded_texts = []
for i in tqdm(range(len(text_as_list))):
    encoded_texts.append(encoder.encode(text_as_list[i]))

In [None]:
%%time
lengths = [len(i) for i in tqdm(encoded_texts)]

In [None]:
length_as_series = pd.Series(lengths)

# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# `stat="density"` and `kde=True` draws the equivalent histogram + KDE overlay.
# An explicit axes object keeps the title attached to this figure only.
fig, ax = plt.subplots()
sns.histplot(length_as_series, stat="density", kde=True, ax=ax)
ax.set(title="Probability Density Function for text lengths",
       xlabel="Encoded review length");

In [None]:
# 90th percentile of the encoded review lengths. It is used further down both
# as the cut-off for which reviews are kept and (after an `int(...)` cast) as
# the common padded length.
max_pad_length = length_as_series.quantile(0.9)

In [None]:
# Display the chosen cut-off.
max_pad_length

In [None]:
%%time
reviews = []
labels = []

for i in tqdm(range(len(encoded_texts))):
    if len(encoded_texts[i]) < max_pad_length:
        reviews.append(encoded_texts[i])
        labels.append(1 if labels_as_list[i] == "positive" else 0)
        
assert len(reviews) == len(labels), "The labels and feature lists should have the same time"

In [None]:
%%time
padded_dataset = []
for i in tqdm(range(len(reviews))):
    padded_dataset.append(pad_tensor(reviews[i], int(max_pad_length)))

In [None]:
# Preparing the final dataset:
# X stacks the padded review tensors along a new leading dimension (one row
# per review); y holds the matching 0/1 labels collected alongside `reviews`.
X = torch.stack(padded_dataset)
y = torch.tensor(labels)

In [None]:
# 75 % / 25 % split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=42)

# `torch.as_tensor` hands back its input untouched when it already is a
# tensor and wraps array-likes otherwise. Re-wrapping with `torch.tensor(...)`
# would copy the data and emit PyTorch's "copy construct from a tensor"
# UserWarning.
X_train, y_train = torch.as_tensor(X_train), torch.as_tensor(y_train)
X_test, y_test = torch.as_tensor(X_test), torch.as_tensor(y_test)

## Creating network and dataset

In [None]:
class Net(nn.Module):
    """LSTM classifier over padded token-id sequences.

    The network embeds the token ids, runs a single-layer LSTM over the
    sequence and feeds the LSTM output at the last *non-padding* position of
    each sequence to a linear layer that produces the class logits.

    Args:
        vocab_size: Number of rows in the embedding table. When ``None``
            (the default) it is taken from the notebook-level ``encoder`` as
            ``len(encoder.vocab) + 1``.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output logits.
        padding_idx: Token id used for padding. Its embedding is kept at
            zero and positions holding it are ignored when picking the final
            LSTM output.
    """

    def __init__(self, vocab_size=None, embedding_dim=32, hidden_dim=32,
                 num_classes=2, padding_idx=0):
        super().__init__()
        if vocab_size is None:
            # Default sizing: one slot more than the encoder's vocabulary.
            vocab_size = len(encoder.vocab) + 1
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                      padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)`` holding token ids,
                padded with ``padding_idx``.
        """
        x = x.long()
        embedded = self.embedding(x)
        outputs, _ = self.lstm(embedded)

        # Reading `outputs[:, -1, :]` would, for every sequence shorter than
        # the padded length, return the state after the LSTM has consumed a
        # run of padding tokens. Pick the output at the last real token
        # instead; rows that contain only padding fall back to position 0.
        is_token = x != self.padding_idx
        positions = torch.arange(x.size(1), device=x.device)
        last_token_idx = (is_token.long() * positions).max(dim=1).values
        batch_idx = torch.arange(x.size(0), device=x.device)
        last_outputs = outputs[batch_idx, last_token_idx]

        return self.fc1(last_outputs)

In [None]:
BATCH_SIZE = 64  # shared by both loaders

# Training batches are reshuffled every epoch.
ds_train = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation does not benefit from shuffling; a fixed order keeps any
# per-batch inspection of the held-out split reproducible.
ds_test = torch.utils.data.TensorDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Model, device handle, optimizer and loss used by the training loop.
classifier = Net()
# NOTE(review): `device` is created here, but nothing visible in this notebook
# moves the model or the batches onto it.
device = torch.device('cpu')
optimizer = optim.Adam(classifier.parameters(), lr=.01)#0.002 gives 85% acc
criterion = nn.CrossEntropyLoss()

In [None]:
N_EPOCHS = 10      # passes over the training set
EVAL_EVERY = 500   # evaluate on the held-out split every this many batches

# The bar's total is taken from the range itself, so it always matches the
# number of epochs actually run.
epoch_bar = tqdm(range(N_EPOCHS),
                 desc="Training",
                 position=0)

acc = 0

for epoch in epoch_bar:
    classifier.train()
    batch_bar = tqdm(enumerate(train_loader),
                     desc="Epoch: {}".format(str(epoch)),
                     position=1,
                     total=len(train_loader))

    # The batch targets are named `targets` so they do not overwrite the
    # notebook-level `labels` list built during preprocessing.
    for i, (datapoints, targets) in batch_bar:

        optimizer.zero_grad()

        preds = classifier(datapoints.long())
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()

        if (i + 1) % EVAL_EVERY == 0:
            # Evaluation needs no gradients: `no_grad` avoids building an
            # autograd graph over the whole held-out split.
            classifier.eval()
            with torch.no_grad():
                test_preds = classifier(X_test.long())
                acc = (test_preds.argmax(dim=1) == y_test).float().mean().item()
            classifier.train()

        # Iterating over a tqdm bar already advances it, so no manual
        # `update()` call is made (that would count every step twice).
        batch_bar.set_postfix(loss=loss.item(),
                              accuracy="{:.2f}".format(acc),
                              epoch=epoch)

    epoch_bar.set_postfix(loss=loss.item(),
                          accuracy="{:.2f}".format(acc),
                          epoch=epoch)