In [1]:
import numpy as np 
import torch 
import pandas as pd
import torch.nn as nn
from model import TextClassifier

In [2]:
df = pd.read_csv('compiled_data.csv')

i = df[df.author == 0].index

df = df.drop(i[50200:])
df = df.drop_duplicates()
df.groupby('author').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,29075,29075,Get here soon,1
1,29003,29003,Cool flex,1


In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

vocab = build_vocab_from_iterator(map(tokenizer, df['message'].values.astype('U')), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

len(vocab.get_itos())





18284

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

vectorizer = CountVectorizer(vocabulary=vocab.get_itos(), tokenizer=tokenizer)
x = torch.tensor(vectorizer.transform(df['message'].values.astype('U')).todense(), dtype=torch.float32)
y = torch.tensor(df['author'].values, dtype=torch.float32)



In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)



In [6]:
from sklearn.metrics import accuracy_score

def TrainModel(model, optimizer, loss_fn, epochs, x_train, y_train):
    for epoch in range(epochs):
        losses = []
        for x, y in zip(x_train, y_train):
            y_preds = model(x)
            # print(y_preds)
            # print(y)
            loss = loss_fn(y_preds.squeeze(), y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        calc_loss_and_accuracy(model, x_test, y_test, loss_fn)

def calc_loss_and_accuracy(model, x_test, y_test, loss_fn):
    with torch.no_grad():
        y_preds, y_actual = [], []
        for x, y in zip(x_test, y_test):
            pred = model(x)
            binary_pred = torch.round(pred)
            y_preds.append(binary_pred)
            y_actual.append(y)
        y_preds = torch.stack(y_preds)
        y_actual = torch.stack(y_actual)
        print("Valid Acc  : {:.3f}".format(accuracy_score(y_actual.cpu().detach().numpy(), y_preds.cpu().detach().numpy())))





In [8]:
from torch.optim import Adam


x_train = x_train.to('cuda')
y_train = y_train.to('cuda')

x_test = x_test.to('cuda')
y_test = y_test.to('cuda')

epochs = 10
lr = 1e-4
model = TextClassifier(len(vocab.get_itos()))
model.to('cuda')
loss_fn = nn.BCELoss().to('cuda')
optimizer = Adam(model.parameters(), lr=lr)

TrainModel(model, optimizer, loss_fn, epochs, x_train, y_train)



Train Loss : 0.598
Valid Acc  : 0.680
Train Loss : 0.531
Valid Acc  : 0.690
Train Loss : 0.508
Valid Acc  : 0.695
Train Loss : 0.491
Valid Acc  : 0.696
Train Loss : 0.474
Valid Acc  : 0.694
Train Loss : 0.456
Valid Acc  : 0.696
Train Loss : 0.433
Valid Acc  : 0.698
Train Loss : 0.411
Valid Acc  : 0.698
Train Loss : 0.389
Valid Acc  : 0.697
Train Loss : 0.366
Valid Acc  : 0.698


In [None]:
y_preds, y_actual = [], []

for x, y in zip(x_test, y_test):
    pred = model(x)
    binary_pred = torch.round(pred)
    y_preds.append(binary_pred)
    y_actual.append(y)


In [None]:
y_preds = torch.stack(y_preds)
y_actual = torch.stack(y_actual)

print("Valid Acc  : {:.3f}".format(accuracy_score(y_actual.cpu().detach().numpy(), y_preds.cpu().detach().numpy())))

Valid Acc  : 0.688


In [None]:
text = "and mf is asking about helldiver?"
print(model(torch.tensor(vectorizer.transform([text]).todense(), dtype=torch.float32).to('cuda')).round())

tensor([[0.]], device='cuda:0', grad_fn=<RoundBackward0>)


In [None]:
import pickle

torch.save(model.state_dict(), 'model.pth')
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab.get_itos(), f)


