In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords 
from collections import Counter
import os

In [5]:
def get_data(files):

    data_path = os.getcwd()+'/Data/'
    combined_df = pd.DataFrame(columns=['text', 'label'])
    df_columns = ['text', 'label']
    
    for x in files:

        df = pd.read_csv(data_path+x)
        df = df[df.columns[:2]]
        df.columns = df_columns
        df['text'] = df['text'].astype(str)

        combined_df = pd.concat([combined_df, df], ignore_index=True)

    return combined_df

In [13]:
import nltk

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [6]:
files = ['dataset1.csv', 'dataset2.csv', 'dataset3.csv', 'dataset4.csv', 'dataset5.csv']

data_df = get_data(files)
data_df

  combined_df = pd.concat([combined_df, df], ignore_index=True)


Unnamed: 0,text,label
0,oh my gosh,1.0
1,"trouble sleeping, confused mind, restless hear...",1.0
2,"All wrong, back off dear, forward doubt. Stay ...",1.0
3,I've shifted my focus to something else but I'...,1.0
4,"I'm restless and restless, it's been a month n...",1.0
...,...,...
60488,posting everyday people stop caring religion ...,0.0
60489,okay definetly need hear guys opinion ive pret...,0.0
60490,cant get dog think ill kill myselfthe last thi...,1.0
60491,whats point princess bridei really think like ...,1.0


In [7]:
data_df['label'].value_counts()

label
0.0    38545
1.0    21943
Name: count, dtype: int64

In [8]:
def even_out_dataframe(df):

    counts = df['label'].value_counts()

    if counts[0] > counts[1]:
        desired_count = counts[1]
        label = 0
        non_label = 1
    else:
        desired_count = counts[0]
        label = 1
        non_label = 0

    df_balanced = pd.concat([df[df['label'] == label].sample(desired_count), df[df['label'] == non_label]], ignore_index=True)
    
    return df_balanced

In [9]:
balanced_df = even_out_dataframe(data_df)
print(balanced_df['label'].value_counts())

data_df = balanced_df

label
0.0    21943
1.0    21943
Name: count, dtype: int64


In [10]:
X, y = data_df['text'].values, data_df['label'].values

test_size = 0.2

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size= test_size)

print(f'shape of train data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')

shape of train data is (35108,)
shape of test data is (8778,)


In [14]:
from utils.common_functions import tockenize, padding_, preprocess_string

DICT_LENGTH = 100000
MAX_STR_LENGTH = 60

x_train,x_test,vocab = tockenize(x_train,x_test, DICT_LENGTH)
print(f'Length of vocabulary is {len(vocab)}')

x_train_pad = padding_(x_train,MAX_STR_LENGTH)
x_test_pad = padding_(x_test,MAX_STR_LENGTH)

35108
Length of vocabulary is 69939


In [15]:
x_train_pad[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     2,  3796,   375, 20562,  5503])

In [19]:
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))

# dataloaders
batch_size = 128

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

RuntimeError: Numpy is not available

In [11]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter._dataset[0]

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample input: \n', sample_y)

Sample input size:  torch.Size([60])
Sample input: 
 tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,   155,
          163,   341,   400,    16,    62,   331,  1442,   155,  8986,   838,
         2282,   402,   209,  3267, 28861,    84,  2210,    24,   163,   402,
          152,  3690,   400,   460,   111,  4886,    33, 16557,    63,   400],
       dtype=torch.int32)
Sample input: 
 tensor(0., dtype=torch.float64)


In [12]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    # Takes in 2 tensors

    preds, labels = preds.cpu().detach().numpy(), labels.cpu().detach().numpy()
    return f1_score(labels, preds, average = 'weighted')

In [13]:
def train_loop(train_loader, model, loss_fn, optimizer, device):
    model.train()

    size = len(train_loader.dataset)
    num_batches = len(train_loader)

    train_loss, train_correct = 0, 0

    for word_embed, labels in train_loader:
        # Transfering images and labels to GPU if available
        word_embed, labels = word_embed.to(device), labels.to(device)
        
        # Forward pass 
        outputs = model(word_embed)
        outputs = outputs.type(torch.float64)

        loss = loss_fn(outputs, labels)
        
        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        predicted = torch.round(outputs)
        
        train_correct += (predicted == labels).type(torch.float).sum().item()

    train_f1 = f1_score_func(predicted, labels)
    train_loss /= num_batches
    train_correct /=size
    
    return train_loss, train_correct, train_f1

def test_loop(test_loader, model, loss_fn, device):
    model.eval()

    size = len(test_loader.dataset)
    num_batches = len(test_loader)
    test_loss, test_correct = 0, 0

    with torch.no_grad():
        for word_embed, labels in test_loader:

            word_embed, labels = word_embed.to(device), labels.to(device)

            outputs = model(word_embed)
            outputs = outputs.type(torch.float64)

            test_loss += loss_fn(outputs, labels).item()

            predicted = torch.round(outputs)
            test_correct += (predicted == labels).type(torch.float).sum().item()

    test_f1 = f1_score_func(predicted, labels)
    test_loss /= num_batches
    test_correct /= size
    
    return test_loss, test_correct, test_f1



In [14]:
from utils.models import SentimentRNN
from utils.early_stopper import EarlyStopper

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

no_layers = 2
vocab_size = len(vocab) + 1 #extra 1 for padding
embedding_dim = 64
hidden_dim = 256
patience = 5

model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim)
early_stopper = EarlyStopper(patience=patience, min_delta=0)

#moving to gpu
model.to(device)
print(model)

SentimentRNN(
  (embedding): Embedding(69741, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc_extra): Linear(in_features=256, out_features=256, bias=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [15]:
#from Utils.common_functions import train_loop, test_loop

# loss and optimization functions
lr=0.001

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

epochs = 50
# train for some number of epochs
tr_acc, te_acc = [], []
tr_loss, te_loss = [], []

for epoch in range(epochs):

    train_loss, train_correct, train_f1 = train_loop(train_loader, model, loss_fn, optimizer, device)
    test_loss, test_correct, test_f1 = test_loop(test_loader, model, loss_fn, device)

    print('Train Acc epoch {} : Acc {} , F1 {} '.format(epoch, train_correct, train_f1))
    print('Test Acc epoch {} : Acc {} , F1 {} '.format(epoch, test_correct, test_f1))

    tr_acc.append(train_correct)
    te_acc.append(test_correct)

    tr_loss.append(train_loss)
    te_loss.append(test_loss)

    if early_stopper.early_stop(test_loss):
        print("Done! Early stopped at {}".format(epoch+1))
        break

Train Acc epoch 0 : Acc 0.829953287000114 , F1 0.8594923594923595 
Test Acc epoch 0 : Acc 0.8904078377762589 , F1 0.878489448352462 
Train Acc epoch 1 : Acc 0.8904238350233565 , F1 0.9722007722007722 
Test Acc epoch 1 : Acc 0.9035087719298246 , F1 0.9325195842608778 
Train Acc epoch 2 : Acc 0.921385439216133 , F1 0.9444444444444444 
Test Acc epoch 2 : Acc 0.9155844155844156 , F1 0.9451905321470538 
Train Acc epoch 3 : Acc 0.9348866355246668 , F1 0.8888888888888888 
Test Acc epoch 3 : Acc 0.918546365914787 , F1 0.9460249723407618 
Train Acc epoch 4 : Acc 0.9481029964680415 , F1 0.9722869722869724 
Test Acc epoch 4 : Acc 0.9260651629072681 , F1 0.9729333729333728 
Train Acc epoch 5 : Acc 0.959240059245756 , F1 0.9168609168609169 
Test Acc epoch 5 : Acc 0.9264069264069265 , F1 0.919037719037719 
Train Acc epoch 6 : Acc 0.9563347385211348 , F1 0.9722869722869724 
Test Acc epoch 6 : Acc 0.922077922077922 , F1 0.8918918918918919 
Train Acc epoch 7 : Acc 0.9650791842315142 , F1 0.944272445820

In [19]:
A = 'All I feel is anxiety'
B = 'naruto is a great anime'
C = 'I dont feel anything anymore'
D = 'Lets have a picnic today!'
E = 'I am wondering why there is so much talk about depression these days'

model.eval()

def predict_text(text):
    word_seq = np.array([vocab[preprocess_string(word)] for word in text.split() 
                        if preprocess_string(word) in vocab.keys()])
    word_seq = np.expand_dims(word_seq,axis=0)
    pad =  torch.from_numpy(padding_(word_seq,MAX_STR_LENGTH))
    inputs = pad.to(device)
    output = model(inputs)
    return(output.item())

print(predict_text(A))
print(predict_text(B))
print(predict_text(C))
print(predict_text(D))
print(predict_text(E))

0.9985145926475525
0.00047090023872442544
0.960693895816803
0.0005473597557283938
0.9996005892753601
