## Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import pandas as pd
import torch

from torchtext.legacy.data import Field, TabularDataset, BucketIterator

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch.optim as optim

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

device = torch.device('cuda')

## Load Datasets

In [2]:
X_train = pd.read_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/X_train.csv')
y_train = pd.read_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/y_train.csv')

X_test = pd.read_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/X_test.csv')
y_test = pd.read_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/y_test.csv')

X_val = pd.read_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/X_val.csv')
y_val = pd.read_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/y_val.csv')

df_train = X_train
df_train['at_risk'] = y_train

df_train.to_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/train.csv')

df_test = X_test
df_test['at_risk'] = y_test

df_test.to_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/test.csv')

df_val = X_val
df_val['at_risk'] = y_val

df_val.to_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/val.csv')

In [3]:
# Fields

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
tweet_field = Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)
fields = [('at_risk', label_field), ('tweet', tweet_field)]

# TabularDataset

train, valid, test = TabularDataset.splits(path='/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/', train='train.csv', validation='val.csv', test=
                                           'test.csv',
                                           format='CSV', fields=fields, skip_header=True)

# Iterators

train_iter = BucketIterator(train, batch_size=32, sort_key=lambda x: len(x.tweet),
                            device=device, sort=True, sort_within_batch=True)

valid_iter = BucketIterator(valid, batch_size=32, sort_key=lambda x: len(x.tweet),
                            device=device, sort=True, sort_within_batch=True)

test_iter = BucketIterator(test, batch_size=32, sort_key=lambda x: len(x.tweet),
                            device=device, sort=True, sort_within_batch=True)

# Vocabulary

tweet_field.build_vocab(train, min_freq=3)

## LSTM Model

In [5]:
class LSTM(nn.Module):

    def __init__(self, dimension=128):
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding(len(tweet_field.vocab), 300)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=300,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.5)

        self.fc = nn.Linear(2*dimension, 1)

    def forward(self, tweet, tweet_len):

        tweet_emb = self.embedding(tweet)

        packed_input = pack_padded_sequence(tweet_emb, tweet_len, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        out_forward = output[range(len(output)), tweet_len - 1, :self.dimension]
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        tweet_fea = self.drop(out_reduced)

        tweet_fea = self.fc(tweet_fea)
        tweet_fea = torch.squeeze(tweet_fea, 1)
        tweet_out = torch.sigmoid(tweet_fea)

        return tweet_out

## Model Training

In [6]:
# Save and Load Functions

def save_checkpoint(save_path, model, optimizer, valid_loss):

    if save_path == None:
        return
    
    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):

    if save_path == None:
        return
    
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']


### Training function

In [15]:
destination_folder = '/content/drive/MyDrive/Gap Year/SureStart/Makeathon/results'

def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 1,
          eval_every = len(train_iter) // 2,
          file_path = destination_folder,
          best_valid_loss = float("Inf")):
    
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (labels, (tweets, tweets_len)), _ in train_loader:           
            labels = labels.to(device)
            tweets = tweets.to(device)
            #tweets_len = tweets_len.to(device)
            tweets_len = torch.as_tensor(tweets_len, dtype=torch.int64, device='cpu')
            output = model(tweets, tweets_len)

            loss = criterion(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():                    
                  # validation loop
                  for (labels, (tweets, tweets_len)), _ in valid_loader:
                      labels = labels.to(device)
                      tweets = tweets.to(device)
                      tweets_len = tweets_len.to(device)
                      tweets_len = torch.as_tensor(tweets_len, dtype=torch.int64, device='cpu')
                      output = model(tweets, tweets_len)

                      loss = criterion(output, labels)
                      valid_running_loss += loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # resetting running values
                running_loss = 0.0                
                valid_running_loss = 0.0
                model.train()

                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))
                
                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(file_path + '/model.pt', model, optimizer, best_valid_loss)
                    save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    
    save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')


model = LSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

train(model=model, optimizer=optimizer, num_epochs=10)

Epoch [1/10], Step [192/3840], Train Loss: -445413.7342, Valid Loss: -194826.4752
Model saved to ==> /content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/results//model.pt
Model saved to ==> /content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/results//metrics.pt
Epoch [1/10], Step [384/3840], Train Loss: -625540.8203, Valid Loss: -194826.4753
Model saved to ==> /content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/results//model.pt
Model saved to ==> /content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/results//metrics.pt
Epoch [2/10], Step [576/3840], Train Loss: -560948.0868, Valid Loss: -200061.2480
Model saved to ==> /content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/results//model.pt
Model saved to ==> /content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/model/results//metrics.pt
Epoch [2/10], Step [768/3840], Train Loss: -625540.8203, Valid Loss: -