# LSTM $G_1$ & $G_2$ Classification with PyTorch

## Settings

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install -U spacy --upgrade
!python -m spacy download en_core_web_trf
!python -m spacy download fr_dep_news_trf

In [1]:
import json
import pickle
import time

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from sklearn.preprocessing import LabelEncoder
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.legacy.data import LabelField
# Ask PyTorch's CUDA caching allocator to release memory it holds but is not currently using
# (relevant when the notebook is re-run in a kernel that already allocated GPU memory).
torch.cuda.empty_cache()

In [2]:
# Record the installed torchtext version; this notebook imports from `torchtext.legacy`.
print(torchtext.__version__)

0.9.1


In [3]:
# Record the installed torch version alongside the torchtext one.
print(torch.__version__)

1.8.1


In [None]:
!pip install torch ==1.8.1
!pip install torchtext==0.9.1

In [None]:
!pip install torch
!pip install torchtext
!pip install sklearn

## Preparation

In [None]:
# Folder that holds `en_train_set.csv` / `en_test_set.csv`, which are read further down.
# NOTE(review): absolute, machine-specific Windows path — must be edited to run this notebook elsewhere.
source_path = r'C:\Users\Antoine\Coding Bootcamp\Open Food Facts'

In [None]:
# Raw f-string: the backslash stays a literal path separator instead of relying on `\e`
# being an unrecognised (and deprecated) escape sequence. The resulting path is unchanged.
data = pd.read_csv(rf'{source_path}\en_train_set.csv')

# Pickled encoder objects for the two targets (presumably fitted label encoders, judging by
# the file names — verify). `with` closes the handles even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code stored in the file; only load
# pickles that you produced yourself.
with open(r'label_encoder_g1.pkl', 'rb') as pkl_file_G1:
    le = pickle.load(pkl_file_G1)
with open(r'label_encoder_g2.pkl', 'rb') as pkl_file_G2:
    le_2 = pickle.load(pkl_file_G2)

# Preview goes last: only a cell's final expression is displayed, so a bare `data`
# in the middle of the cell showed nothing.
data.head()

In [None]:
def decode_labels(encoded_labels, label_vocab):
    """Translate encoded class ids into their label names.

    Every id is converted with ``str`` before the lookup, so ``label_vocab`` must be
    keyed by the string form of the ids (as a mapping loaded from JSON is).

    Args:
        encoded_labels: iterable of class ids.
        label_vocab: mapping from ``str(id)`` to label name.

    Returns:
        list with one label name per input id, in input order.

    Raises:
        KeyError: if an id has no entry in ``label_vocab``.
    """
    decoded = []
    for code in encoded_labels:
        decoded.append(label_vocab[str(code)])
    return decoded

In [None]:
# Lookup tables loaded from JSON; they are passed below as the `le` argument of evaluate(),
# which uses them to turn class ids back into label names.
with open(r'labels_G1_code_reference.json') as g1_reference:
    le_G1 = json.load(g1_reference)

with open(r'labels_G2_code_reference.json') as g2_reference:
    le_G2 = json.load(g2_reference)

In [None]:
# Load the English spaCy pipeline once; only its `tokenizer` component is used below.
spacy_en = spacy.load('en_core_web_trf')


def tokenizer(text):
    """Split ``text`` into a list of token strings with the spaCy English tokenizer."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [None]:
# `is_available` has to be *called*: the bare function object is always truthy, which
# selected 'cuda' even on machines without a usable GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## LSTM $G_1$

In [None]:
# Text column: lower-cased, tokenized with the spaCy-based `tokenizer`, batch-first tensors.
TEXT = Field(
    tokenize=tokenizer,
    sequential=True,
    lower=True,
    batch_first=True,
    include_lengths=False,
    pad_token='<pad>',
    unk_token='<unk>',
)
# Label column: no vocabulary is built (`use_vocab=False`), so the values are taken as-is;
# presumably the CSV already stores numeric class ids — verify against the data.
LABELS = Field(use_vocab=False, sequential=False, batch_first=True)
# (attribute name, Field) pairs handed to TabularDataset; batches expose the label as `batch.label_G1`.
fields = [('text', TEXT), ('label_G1', LABELS)]

In [None]:
train, valid = TabularDataset.splits(
    path=source_path, 
    train='en_train_set.csv',
    test='en_test_set.csv',
    format='csv', 
    fields=fields, 
    skip_header=True)

In [None]:
train_iter = BucketIterator(train, batch_size=256, sort_key=lambda x: len(x.text),
device=device, sort=True, sort_within_batch=True, shuffle=True, repeat=False)

valid_iter = BucketIterator(valid, batch_size=512, sort_key=lambda x: len(x.text),
device=device, sort=True, sort_within_batch=True, shuffle=True, repeat=False)

In [None]:
TEXT.build_vocab(train, min_freq=5, vectors='glove.6B.300d')

In [None]:
# Size of the vocabulary built in the previous cell.
len(TEXT.vocab)

## Architecture, Train & Eval Definition

In [None]:
class LSTMG1(nn.Module):
    """Bidirectional, multi-layer LSTM text classifier over pretrained embeddings.

    The classifier head reads the final hidden state of the *last* LSTM layer in both
    directions (forward and backward concatenated). The previous version read ``h_n[0]``,
    i.e. only the forward direction of the first layer, so the stacked layers and the
    backward pass never reached the output.

    Note: because the head now takes ``2 * hid_dim`` features, state dicts saved from
    the previous layout cannot be loaded into this class.

    Args:
        embedding_dim: size of one embedding vector; must match the width of the
            pretrained vectors.
        hid_dim: hidden size of each LSTM direction.
        n_layers: number of stacked LSTM layers.
        p: dropout probability applied to the sentence representation; the
            embeddings use ``p / 1.5``.
        n_classes: number of output classes (logits).
        pretrained_vectors: optional 2-D float tensor of embedding weights. When
            omitted, ``TEXT.vocab.vectors`` from the notebook namespace is used,
            as before.
    """

    def __init__(self, embedding_dim=300, hid_dim=50, n_layers=2, p=0.3, n_classes=9,
                 pretrained_vectors=None):
        super(LSTMG1, self).__init__()
        if pretrained_vectors is None:
            # Looked up at construction time so the class follows whichever TEXT field
            # is current in the notebook.
            pretrained_vectors = TEXT.vocab.vectors
        # freeze=False: the pretrained vectors are fine-tuned together with the model.
        self.embedding = nn.Embedding.from_pretrained(pretrained_vectors, freeze=False)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            bidirectional=True,
            batch_first=True)
        self.drop = nn.Dropout(p)
        self.drop_emb = nn.Dropout(p/1.5)
        # Forward and backward final states are concatenated, hence 2 * hid_dim features.
        self.bn1 = nn.BatchNorm1d(num_features=2 * hid_dim)
        self.hid_out = nn.Linear(2 * hid_dim, n_classes)

    def forward(self, inputs):
        """Return class logits of shape (batch, n_classes) for a (batch, seq_len) tensor of token ids."""
        # `drop_emb` was defined for the embeddings but never applied; use it here.
        embeds = self.drop_emb(self.embedding(inputs))
        _, (h_n, _) = self.lstm(embeds)
        # h_n is laid out as (num_layers * num_directions, batch, hid_dim): the last two
        # entries are the forward and backward final states of the top layer.
        sentence = torch.cat((h_n[-2], h_n[-1]), dim=1)
        x = self.drop(sentence)
        x = self.bn1(x)
        return self.hid_out(x)

In [None]:
def train(model,
          optimizer,
          criterion = nn.CrossEntropyLoss(),
          train_loader = None,
          valid_loader = None,
          num_epochs = 5,
          eval_every = None,
          best_valid_loss = float("Inf"),
          label_field = None):
    """Train ``model`` and report train/validation loss every ``eval_every`` steps.

    Args:
        model: the network to optimise; batches are moved to the device of its parameters.
        optimizer: optimizer already bound to ``model.parameters()``.
        criterion: loss called as ``criterion(logits, labels)``.
        train_loader: iterable of training batches exposing ``.text`` and a label
            attribute. Defaults to the notebook-level ``train_iter``, looked up when
            the function is called.
        valid_loader: iterable of validation batches. Defaults to the notebook-level
            ``valid_iter``, looked up when the function is called.
        num_epochs: number of passes over ``train_loader``.
        eval_every: run validation every this many optimisation steps. Defaults to
            half an epoch; always at least 1.
        best_valid_loss: starting value for the best validation loss seen.
        label_field: name of the batch attribute holding the labels. When omitted,
            ``batch.label`` is used if present, otherwise the first entry of
            ``batch.fields`` that is not ``'text'`` (e.g. ``label_G1``).

    Returns:
        dict with the recorded ``'train_loss'``, ``'valid_loss'`` and ``'steps'`` lists
        (one entry per evaluation) and the final ``'best_valid_loss'``.
    """
    # Resolve the loaders at call time: defaults written as `train_loader=train_iter` are
    # evaluated once, when the `def` runs, and would keep pointing at the iterators that
    # existed then even after new ones are built.
    if train_loader is None:
        train_loader = train_iter
    if valid_loader is None:
        valid_loader = valid_iter
    if eval_every is None:
        eval_every = len(train_loader) // 2
    # Guards the modulo below against a zero period (loader with fewer than 2 batches).
    eval_every = max(1, eval_every)

    # Send batches wherever the model lives so the two can never disagree.
    model_device = next(model.parameters()).device

    def _labels_of(batch):
        """Return the label tensor of ``batch`` (see ``label_field`` above)."""
        name = label_field
        if name is None:
            if hasattr(batch, 'label'):
                name = 'label'
            else:
                name = next(field for field in batch.fields if field != 'text')
        return getattr(batch, name)

    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    model.train()

    for epoch in range(num_epochs):
        for batch in train_loader:
            inputs = batch.text.to(model_device)
            targets = _labels_of(batch).to(model_device)

            loss = criterion(model(inputs), targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # Everything below is the evaluation step and must only run every
            # `eval_every` steps (previously only `model.eval()` was guarded, so the full
            # validation pass ran after every single training step).
            if global_step % eval_every != 0:
                continue

            model.eval()
            valid_running_loss = 0.0
            with torch.no_grad():
                for valid_batch in valid_loader:
                    valid_inputs = valid_batch.text.to(model_device)
                    valid_targets = _labels_of(valid_batch).to(model_device)
                    valid_running_loss += criterion(model(valid_inputs), valid_targets).item()

            average_train_loss = running_loss / eval_every
            average_valid_loss = valid_running_loss / max(1, len(valid_loader))
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # reset the running value and go back to training mode
            running_loss = 0.0
            model.train()

            # print progress
            print('Epoch [{}/{}], Step [{}/{}] - Train Loss: {:.4f}, Valid Loss: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                          average_train_loss, average_valid_loss))
            print('-'*50)

            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
        print('_'*50)

    return {
        'train_loss': train_loss_list,
        'valid_loss': valid_loss_list,
        'steps': global_steps_list,
        'best_valid_loss': best_valid_loss,
    }

In [None]:
def get_classification_report(y_test, y_pred, sortby='precision', model='model'):
    """Build scikit-learn's classification report as a rounded pandas DataFrame.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels.
        sortby: report column used to sort the rows in descending order.
        model: prefix prepended (followed by ``_``) to every column name.

    Returns:
        DataFrame with one row per report entry, sorted by ``sortby`` and rounded
        to two decimals.
    """
    from sklearn import metrics

    raw_report = metrics.classification_report(y_test, y_pred, zero_division=0, output_dict=True)
    table = pd.DataFrame(raw_report).transpose()
    table = table.sort_values(by=[sortby], ascending=False)
    # Sorting happens before the rename, so `sortby` refers to the un-prefixed column name.
    prefixed_names = {column: model + '_' + column for column in table.columns}
    table = table.rename(columns=prefixed_names)
    return table.round(2)

In [None]:
def evaluate(model, test_iter, le, label_field=None, model_name='BI_LSTM_G1'):
    """Predict on ``test_iter`` and return a classification report with readable labels.

    Args:
        model: trained network; batches are moved to the device of its parameters.
        test_iter: iterable of batches exposing ``.text`` and a label attribute.
        le: mapping from ``str(class_id)`` to label name, as consumed by ``decode_labels``.
        label_field: name of the batch attribute holding the labels. When omitted,
            ``batch.label`` is used if present, otherwise the first entry of
            ``batch.fields`` that is not ``'text'`` (e.g. ``label_G1``).
        model_name: prefix for the report's column names.

    Returns:
        The DataFrame produced by ``get_classification_report``.
    """
    model_device = next(model.parameters()).device

    def _labels_of(batch):
        name = label_field
        if name is None:
            if hasattr(batch, 'label'):
                name = 'label'
            else:
                name = next(field for field in batch.fields if field != 'text')
        return getattr(batch, name)

    # Dropout and batch-norm must be in inference mode while scoring; the previous
    # mode is restored afterwards so an interrupted training session is not affected.
    was_training = model.training
    model.eval()

    y_true = []
    y_preds = []
    with torch.no_grad():
        for batch in test_iter:
            inputs = batch.text.to(model_device)
            labels = _labels_of(batch).to(model_device)
            predicted = model(inputs).argmax(dim=1)
            y_preds.append(predicted.cpu().numpy())
            y_true.append(labels.cpu().numpy())

    if was_training:
        model.train()

    # Flatten the per-batch arrays *before* decoding: decode_labels looks up str(code),
    # and the str() of a whole array is never a key of the mapping.
    y_true = decode_labels(np.concatenate(y_true), le)
    y_preds = decode_labels(np.concatenate(y_preds), le)
    return get_classification_report(y_true, y_preds, model=model_name)

## Training $G_1$

In [None]:
# G1 classifier: 9 output classes, 3 stacked LSTM layers, moved to the selected device.
net_G1 = LSTMG1(n_classes=9, n_layers=3, embedding_dim=300)
net_G1 = net_G1.to(device)
optimizer = optim.Adam(params=net_G1.parameters(), lr=0.001)

In [None]:
# Display the module structure of the G1 network.
net_G1

In [None]:
# Train the G1 network for one epoch; every other setting comes from train()'s defaults.
train(model=net_G1, optimizer=optimizer, num_epochs=1)

In [None]:
# Classification report for G1 on the validation iterator; `le_G1` is passed as the `le` argument.
evaluate(net_G1, valid_iter, le_G1)

## LSTM $G_2$

In [None]:
# Fresh Field objects for the G2 task (same text preprocessing as for G1).
TEXT = Field(
    tokenize=tokenizer,
    sequential=True,
    lower=True,
    batch_first=True,
    include_lengths=False,
    pad_token='<pad>',
    unk_token='<unk>',
)
LABELS = Field(use_vocab=False, sequential=False, batch_first=True)
# NOTE(review): a *list* of (name, field) pairs is matched to the CSV columns by position,
# not by header name. This list has the same shape as the G1 one, so it presumably reads
# the same second column that was read as `label_G1` — confirm that the file's column
# order really puts the G2 label there.
fields = [('text', TEXT), ('label_G2', LABELS)]

In [None]:
train, valid = TabularDataset.splits(
    path=source_path, 
    train='en_train_set.csv',
    test='en_test_set.csv',
    format='csv', 
    fields=fields, 
    skip_header=True)

In [None]:
train_iter = BucketIterator(train, batch_size=256, sort_key=lambda x: len(x.text),
device=device, sort=True, sort_within_batch=True, shuffle=True, repeat=False)

valid_iter = BucketIterator(valid, batch_size=512, sort_key=lambda x: len(x.text),
device=device, sort=True, sort_within_batch=True, shuffle=True, repeat=False)

In [None]:
TEXT.build_vocab(train, min_freq=5, vectors='glove.6B.300d')

## Training $G_2$

In [None]:
# G2 classifier: same LSTMG1 architecture, but with 38 output classes.
net_G2 = LSTMG1(n_classes=38, n_layers=3, embedding_dim=300)
net_G2 = net_G2.to(device)
# Rebinds `optimizer`: from here on it drives the G2 parameters.
optimizer = optim.Adam(params=net_G2.parameters(), lr=0.001)

In [None]:
# Pass the G2 iterators (and the evaluation period derived from them) explicitly, so this
# run cannot silently fall back to iterators that were bound earlier in the notebook.
train(model=net_G2, optimizer=optimizer, num_epochs=1,
      train_loader=train_iter, valid_loader=valid_iter,
      eval_every=len(train_iter) // 2)

In [None]:
# Classification report for G2 on the validation iterator; `le_G2` is passed as the `le` argument.
evaluate(net_G2, valid_iter, le_G2)

## Export Models

In [None]:
# One file per model: both state dicts used to be written to 'torch_lstm_G2', so the
# G1 weights were overwritten by the G2 ones immediately after being saved.
torch.save(net_G1.state_dict(), 'torch_lstm_G1')
torch.save(net_G2.state_dict(), 'torch_lstm_G2')