In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from scripts.lstm_train import LSTM, train, dd, dd2
from scripts.test import test, classify
import pickle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from matplotlib.ticker import MaxNLocator

### Load Data & Embeddings

In [None]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising, so only point this at files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _matrix_and_lookup(path):
    """Unpickle a dict from ``path`` and return ``[dict['matrix'], dict['lookup']]``."""
    bundle = _load_pickle(path)
    return [bundle['matrix'], bundle['lookup']]


# Review data; later cells index it as raw_data[lang]['corpus'] and raw_data[lang]['y'].
raw_data = _load_pickle('data/amazon_reviews/raw_data.pickle')

# Per-language embedding files (en / fr / jp), kept exactly as loaded.
enen = _load_pickle('embeddings/en.pickle')
frfr = _load_pickle('embeddings/fr.pickle')
jpjp = _load_pickle('embeddings/jp.pickle')

# The enfr / enjp pickles are dicts; repack them as [matrix, lookup] lists so
# they can be indexed positionally (0 = matrix, 1 = lookup).
enfr = _matrix_and_lookup('embeddings/enfr.pickle')
enjp = _matrix_and_lookup('embeddings/enjp.pickle')

### Specify Language

In [None]:
# lang1 selects the training corpus and lang2 the testing corpus; identical
# values make the split cell carve train/val/test out of a single corpus.
lang1 = lang2 = 'jp'

# Embedding bundle to use: element 0 is turned into the embedding tensor,
# element 1 is the token lookup handed to numerise().
emb = jpjp
emb_lookup = emb[1]
emb_matrix = torch.tensor(emb[0])

In [None]:
def pad(data, seq_len=200):
  """Left-pad (or truncate) integer sequences to a fixed length.

  Args:
    data: sequence of reviews, each a sequence of integer ids.
    seq_len: width of every output row.

  Returns:
    Integer array of shape ``(len(data), seq_len)``. Reviews shorter than
    ``seq_len`` are right-aligned with zeros in front; longer reviews keep
    only their first ``seq_len`` ids; empty reviews stay all zeros.
  """
  out = np.zeros((len(data), seq_len), dtype=int)
  for row, ids in enumerate(data):
    n_ids = len(ids)
    if n_ids == 0:
      continue
    # A negative start beyond the row width clips to column 0, so an
    # over-long review fills the whole row with its first seq_len ids.
    out[row, -n_ids:] = np.array(ids)[:seq_len]
  return out

def numerise(raw,lookup):
  """Convert tokenised reviews into a padded array of integer token ids.

  Args:
    raw: iterable of reviews, each an iterable of (hashable) tokens.
    lookup: sequence of known tokens; the token at position ``i`` gets id
      ``i + 1``.

  Returns:
    The array returned by ``pad`` (with its default ``seq_len``) for the
    id sequences. Tokens missing from ``lookup`` are encoded as 0, the same
    value ``pad`` uses for filler.
  """
  # Build the token -> id table once instead of scanning the lookup list for
  # every token (`in` followed by `.index` is two linear scans per token).
  # setdefault keeps the first position of a duplicated token, which matches
  # what list.index returned.
  token_ids = {}
  for position, token in enumerate(lookup, start=1):  # ids start at 1; 0 is padding
    token_ids.setdefault(token, position)

  data = [[token_ids.get(token, 0) for token in review] for review in raw]
  return pad(data)

### Split Data

In [None]:
# Keep the first 500 reviews (and their labels) of each selected language
raw_lang1, y_lang1 = (raw_data[lang1][field][:500] for field in ('corpus', 'y'))
raw_lang2, y_lang2 = (raw_data[lang2][field][:500] for field in ('corpus', 'y'))

# Numerise + pad the reviews, then split
if lang1 != lang2:
    # Cross-lingual run: train/validate on lang1 (75% / 25%) ...
    X_train, X_val, y_train, y_val = train_test_split(
        numerise(raw_lang1, emb_lookup), y_lang1, test_size=0.25, random_state=69)
    # ... and test on a random half of lang2; the other half (cutX / cuty) is unused.
    X_test, cutX, y_test, cuty = train_test_split(
        numerise(raw_lang2, emb_lookup), y_lang2, test_size=0.5, random_state=69)
else:
    # Single-corpus run: 20% test, then 25% of the remainder for validation,
    # i.e. a 60 / 20 / 20 train / val / test split.
    X = numerise(raw_lang1, emb_lookup)
    y = y_lang1
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=69)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=69)

# Convert every split to a torch tensor
X_train, X_val, X_test = (torch.tensor(features) for features in (X_train, X_val, X_test))
y_train, y_val, y_test = (torch.tensor(targets) for targets in (y_train, y_val, y_test))

In [None]:
# Pair each split's inputs with its labels
train_data, val_data, test_data = (
    TensorDataset(features, targets)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

### Define Hyperparams


In [None]:
# Settings handed to LSTM(...), train(...) and test(...) in the cells below
param_dict = dict(
    hidden=400,
    output=1,
    emb=200,
    layers=2,
    dropout=0.3,
    batch=64,
    epochs=20,
    lr=0.01,
)

# Batch every split. drop_last=True discards the final incomplete batch, so
# each loader only yields full batches of `batch_size` samples.
batch_size = param_dict['batch']
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, drop_last=True)
    for split in (train_data, val_data, test_data)
)

In [None]:
model = LSTM(param_dict,emb_matrix)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')
print("GPU is available" if is_cuda else "GPU not available, CPU used")

# Last expression of the cell, so the moved model is also displayed
model.to(device)

In [None]:
# Number of batches the training loader yields per pass (it was built with
# drop_last=True, so an incomplete final batch is not counted).
len(train_loader)

In [None]:
# Run the training routine from scripts.lstm_train; it returns the model plus
# the training and validation loss histories that are plotted at the end.
model,train_losses,val_losses = train(model,param_dict,train_loader,val_loader,device)
print('training complete')

In [None]:
# Evaluate on the test loader with scripts.test.test; the three return values
# are bound to a confusion matrix, an F1 score and a test accuracy.
cf_matrix, f1, test_accuracy= test(model, test_loader,param_dict,device)
print('testing completed')

In [None]:
# Display the evaluation results as a tuple: (accuracy, F1, confusion matrix)
test_accuracy, f1, cf_matrix

In [None]:

# Annotated confusion-matrix heatmap: each cell shows its name, count and
# share of all test predictions.
# NOTE(review): the names assume cf_matrix is 2x2 and laid out as
# [[TN, FP], [FN, TP]] -- confirm against scripts.test.test.
group_names = ["True Neg", "False Pos", "False Neg", "True Pos"]

cell_values = cf_matrix.flatten()
group_counts = ['{0:0.0f}'.format(value) for value in cell_values]
group_percentages = ['{0:.2%}'.format(value) for value in cell_values / np.sum(cf_matrix)]

labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2,2)

figure(figsize=(8, 6), dpi=80)
# fmt='' because the annotations are already formatted strings
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap="Blues");

In [None]:
# Plain confusion-matrix heatmap annotated with the raw cell values
heatmap_ax = figure(figsize=(8, 6), dpi=80).gca()
sns.heatmap(cf_matrix, annot=True, ax=heatmap_ax);

In [None]:
# Training vs. validation loss curves.
fig, axes = plt.subplots(1,1, figsize = (12,6))

# Derive the x positions from the recorded histories instead of hard-coding
# 1..20, so the plot still works when param_dict['epochs'] is changed.
# NOTE(review): assumes train() records one loss value per epoch -- confirm.
axes.plot(range(1, len(train_losses) + 1), train_losses, label="Train")
axes.plot(range(1, len(val_losses) + 1), val_losses, label="Validation")

axes.set_title("Train and Validation Loss")
axes.xaxis.set_major_locator(MaxNLocator(21))  # at most 21 intervals on the x axis
axes.set_xlabel("Epochs")
axes.yaxis.set_major_locator(MaxNLocator(10))
axes.set_ylabel("Loss")
axes.legend()  # tell the two curves apart
axes.margins(0.05)
axes.axis('tight');