In [21]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import pickle

import sys
sys.path.insert(0, '../preprocessing/') #need this in order to get to the other file in other directory

#can comment out the ones you aren't using to save a little bit of time
import covidPreprocess
from covidPreprocess import getCoronaVocabulary, getCoronaText, get_whole_Corona_dataset
from liarPreprocess import getLiarVocabulary, getLiarText
from fnnPreprocess import getFNNVocabulary, getFNNText

In [13]:
class SimpleNeuralNet(nn.Module):

    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNeuralNet, self).__init__()
        #Written based off of the tutorial at
        #https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/01-basics/feedforward_neural_network/main.py#L37-L49
        self.hidden1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()   
        self.hOutput1 = nn.Linear(hidden_size, num_classes)  
        self.softmax = nn.Softmax(dim = 0)

    def forward(self, x):
        out = self.hidden1(x)
        out = self.relu(out)
        out = self.hOutput1(out)
        out = self.softmax(out)
        return out

In [5]:
def trainSimpleModel(dataset: str, num_epochs = 5, learning_rate = 0.001, print_epoch_mod = 5):
    '''
    gets around 63-71% for corona and Liar datasets, around 80-83% on FNN
    
    used this article for help in writing the tensor parts of code so it works with the model
    https://medium.com/analytics-vidhya/part-1-sentiment-analysis-in-pytorch-82b35edb40b8
    '''
    DEBUG_MODE = False

    torch.manual_seed(1)
    if dataset == 'corona':
        X,Y = getCoronaText() #this function will give us the text array (not document term matrix) and Y
        X_train,Y_train, vectorizer_train = getCoronaVocabulary(True)
    elif dataset == 'liar':
        X,Y = getLiarText()
        X_train,Y_train, vectorizer_train = getLiarVocabulary(True)
    elif dataset == 'fnn':
        X,Y = getFNNText()
        X_train,Y_train, vectorizer_train = getFNNVocabulary(True)
    
    #transform our testing dataset to match the vocabulary for the training dataset
    #transform will return the document-term matrix for X based on training dataset
    x_test = vectorizer_train.transform(X)
    
    #sample test on logistic classifier
    '''classifier = LogisticRegression()
    classifier.fit(X_train,Y_train)
    score = classifier.score(x_test,Y)
    print(score)'''
    
    vocabsize = X_train.shape[1]
    
    
    # transform our training and test data into tensors for the classifier to learn off of
    X_tensor = torch.from_numpy(X_train.todense()).float()
    Y_tensor = torch.from_numpy(np.array(Y_train))
    
    X_test_tensor = torch.from_numpy(x_test.todense()).float()
    Y_test_tensor = torch.from_numpy(np.array(Y))
    
    device = torch.device("cpu")
    # use TensorDataset to be able to use our DataLoader
    train_data = torch.utils.data.TensorDataset(X_tensor, Y_tensor)
    # train_loader = torch.utils.data.DataLoader(train_data,batch_size=16, shuffle=True)
    train_loader = torch.utils.data.DataLoader(train_data,batch_size=16, shuffle=False)
    
    test_data = torch.utils.data.TensorDataset(X_test_tensor, Y_test_tensor)
    # test_loader = torch.utils.data.DataLoader(test_data,batch_size=16, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_data,batch_size=16, shuffle=False)
    
    #initialize our model
    model = SimpleNeuralNet(vocabsize, 200, 2).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
    
    
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for i, (x_batch, labels) in enumerate(train_loader):
    
            # Forward pass
            # The forward process computes the loss of each iteration on each sample
            model.train()
            y_pred = model(x_batch)
            #need to transform labels to long datatype using .long() or it complains it's an int
            loss = criterion(y_pred, labels.long())
    
            # Backward pass, using the optimizer to update the parameters
            optimizer.zero_grad()
            loss.backward()    #compute gradients
            optimizer.step()   #initiate gradient descent
    
     
            # Below, an epoch corresponds to one pass through all of the samples.
            # Each training step corresponds to a parameter update using 
            # a gradient computed on a minibatch of 100 samples
            if DEBUG_MODE:
                if (i + 1) % print_epoch_mod == 0: 
                    # leaving it on 5 for corona dataset, probably want to change to % 50 or % 100
                    # for the other datasets so don't get spammed 
                    print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                        .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))

    return train_loader, test_loader, model

In [6]:
def train_simple_model_with_data(X, Y, num_epochs = 5, learning_rate = 0.001, print_epoch_mod = 5):
    '''
    gets around 63-71% for corona and Liar datasets, around 80-83% on FNN
    
    used this article for help in writing the tensor parts of code so it works with the model
    https://medium.com/analytics-vidhya/part-1-sentiment-analysis-in-pytorch-82b35edb40b8
    '''
    DEBUG_MODE = False

    torch.manual_seed(1)
    
    #sample test on logistic classifier
    '''classifier = LogisticRegression()
    classifier.fit(X_train,Y_train)
    score = classifier.score(x_test,Y)
    print(score)'''
    
    vocabsize = X_train.shape[1]
    
    
    # transform our training and test data into tensors for the classifier to learn off of
    X_tensor = torch.from_numpy(X_train.todense()).float()
    Y_tensor = torch.from_numpy(np.array(Y_train))
    
    X_test_tensor = torch.from_numpy(x_test.todense()).float()
    Y_test_tensor = torch.from_numpy(np.array(Y))
    
    device = torch.device("cpu")
    # use TensorDataset to be able to use our DataLoader
    train_data = torch.utils.data.TensorDataset(X_tensor, Y_tensor)
    # train_loader = torch.utils.data.DataLoader(train_data,batch_size=16, shuffle=True)
    train_loader = torch.utils.data.DataLoader(train_data,batch_size=16, shuffle=False)
    
    test_data = torch.utils.data.TensorDataset(X_test_tensor, Y_test_tensor)
    # test_loader = torch.utils.data.DataLoader(test_data,batch_size=16, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_data,batch_size=16, shuffle=False)
    
    #initialize our model
    model = SimpleNeuralNet(vocabsize, 200, 2).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
    
    
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for i, (x_batch, labels) in enumerate(train_loader):
    
            # Forward pass
            # The forward process computes the loss of each iteration on each sample
            model.train()
            y_pred = model(x_batch)
            #need to transform labels to long datatype using .long() or it complains it's an int
            loss = criterion(y_pred, labels.long())
    
            # Backward pass, using the optimizer to update the parameters
            optimizer.zero_grad()
            loss.backward()    #compute gradients
            optimizer.step()   #initiate gradient descent
    
     
            # Below, an epoch corresponds to one pass through all of the samples.
            # Each training step corresponds to a parameter update using 
            # a gradient computed on a minibatch of 100 samples
            if DEBUG_MODE:
                if (i + 1) % print_epoch_mod == 0: 
                    # leaving it on 5 for corona dataset, probably want to change to % 50 or % 100
                    # for the other datasets so don't get spammed 
                    print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                        .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))

    return train_loader, test_loader, model

In [7]:
def testModel(train_loader, test_loader, model, debug=False):
    # Test the model
    # In the test phase, we don't need to compute gradients (the model has already been learned)
    train_accuracy = 0
    test_accuracy = 0
    k = 5

    with torch.no_grad():
        total = 0
        correct = 0

        for data, labels in train_loader:
            if debug:
                print('data:', data)
                print('data shape:', data.shape) # size of train data set
                print('label:', labels)
                print('label shape:', labels.shape)

            outputs = model(data)
            
            if debug:
                print('outputs:', outputs)
                print('outputs data:', outputs.data)

            _, predicted = torch.max(outputs.data, 1)

            if debug:
                print('predicted:', predicted)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            if debug:
                print('label size:', labels.size(0))
                print('correct labels:', (predicted == labels).sum().item())
                break

        train_accuracy = correct / total

        total = 0
        correct = 0

        for data, labels in test_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        test_accuracy = correct / total

        print("train accuracy: {:.4f}%".format(train_accuracy * 100))
        print("test accuracy: {:.4f}%".format(test_accuracy * 100))
        print("difference in accuracies: {:.4f}%".format(abs(test_accuracy - train_accuracy) * 100))

        return train_accuracy, test_accuracy

In [22]:
def save_model_and_vectorizer():
    X, Y, vectorizer_train = get_whole_Corona_dataset()

    print(type(X))

save_model_and_vectorizer()

KeyError: 1164

In [9]:
train_loader, test_loader, model = trainSimpleModel('corona', num_epochs=10)
testModel(train_loader, test_loader, model, debug=False)

train accuracy: 74.5704%
test accuracy: 67.4685%
difference in accuracies: 7.1019%


(0.7457044673539519, 0.6746849942726232)

In [10]:
train_loader, test_loader, model = trainSimpleModel('corona', num_epochs=20)
testModel(train_loader, test_loader, model, debug=False)

train accuracy: 83.1615%
test accuracy: 74.6850%
difference in accuracies: 8.4765%


(0.8316151202749141, 0.7468499427262314)

In [11]:
train_loader, test_loader, model = trainSimpleModel('corona', num_epochs=30)
testModel(train_loader, test_loader, model, debug=False)

train accuracy: 97.9381%
test accuracy: 83.5052%
difference in accuracies: 14.4330%


(0.979381443298969, 0.8350515463917526)

In [12]:
trainAndTestSimpleModel('corona', num_epochs=30)

NameError: name 'trainAndTestSimpleModel' is not defined

In [9]:
trainAndTestSimpleModel('corona', num_epochs=40)


Extracting tokens....
there are 70 nan titles
there are 9 nan text

Extracting tokens....
there are 12 nan titles
there are 1 nan text
Data shape for text:  (291, 7417)
Epoch [1/40], Step [5/19], Loss: 0.6801
Epoch [1/40], Step [10/19], Loss: 0.6651
Epoch [1/40], Step [15/19], Loss: 0.6412
Epoch [2/40], Step [5/19], Loss: 0.6423
Epoch [2/40], Step [10/19], Loss: 0.6454
Epoch [2/40], Step [15/19], Loss: 0.6420
Epoch [3/40], Step [5/19], Loss: 0.6454
Epoch [3/40], Step [10/19], Loss: 0.6450
Epoch [3/40], Step [15/19], Loss: 0.6443
Epoch [4/40], Step [5/19], Loss: 0.6441
Epoch [4/40], Step [10/19], Loss: 0.6418
Epoch [4/40], Step [15/19], Loss: 0.6399
Epoch [5/40], Step [5/19], Loss: 0.6408
Epoch [5/40], Step [10/19], Loss: 0.6418
Epoch [5/40], Step [15/19], Loss: 0.6444
Epoch [6/40], Step [5/19], Loss: 0.6413
Epoch [6/40], Step [10/19], Loss: 0.6404
Epoch [6/40], Step [15/19], Loss: 0.6418
Epoch [7/40], Step [5/19], Loss: 0.6445
Epoch [7/40], Step [10/19], Loss: 0.6443
Epoch [7/40], Ste

In [10]:
trainAndTestSimpleModel('corona', num_epochs=50)


Extracting tokens....
there are 70 nan titles
there are 9 nan text

Extracting tokens....
there are 12 nan titles
there are 1 nan text
Data shape for text:  (291, 7417)
Epoch [1/50], Step [5/19], Loss: 0.6801
Epoch [1/50], Step [10/19], Loss: 0.6651
Epoch [1/50], Step [15/19], Loss: 0.6412
Epoch [2/50], Step [5/19], Loss: 0.6423
Epoch [2/50], Step [10/19], Loss: 0.6454
Epoch [2/50], Step [15/19], Loss: 0.6420
Epoch [3/50], Step [5/19], Loss: 0.6454
Epoch [3/50], Step [10/19], Loss: 0.6450
Epoch [3/50], Step [15/19], Loss: 0.6443
Epoch [4/50], Step [5/19], Loss: 0.6441
Epoch [4/50], Step [10/19], Loss: 0.6418
Epoch [4/50], Step [15/19], Loss: 0.6399
Epoch [5/50], Step [5/19], Loss: 0.6408
Epoch [5/50], Step [10/19], Loss: 0.6418
Epoch [5/50], Step [15/19], Loss: 0.6444
Epoch [6/50], Step [5/19], Loss: 0.6413
Epoch [6/50], Step [10/19], Loss: 0.6404
Epoch [6/50], Step [15/19], Loss: 0.6418
Epoch [7/50], Step [5/19], Loss: 0.6445
Epoch [7/50], Step [10/19], Loss: 0.6443
Epoch [7/50], Ste

In [11]:
trainAndTestSimpleModel('corona', num_epochs=60)


Extracting tokens....
there are 70 nan titles
there are 9 nan text

Extracting tokens....
there are 12 nan titles
there are 1 nan text
Data shape for text:  (291, 7417)
Epoch [1/60], Step [5/19], Loss: 0.6801
Epoch [1/60], Step [10/19], Loss: 0.6651
Epoch [1/60], Step [15/19], Loss: 0.6412
Epoch [2/60], Step [5/19], Loss: 0.6423
Epoch [2/60], Step [10/19], Loss: 0.6454
Epoch [2/60], Step [15/19], Loss: 0.6420
Epoch [3/60], Step [5/19], Loss: 0.6454
Epoch [3/60], Step [10/19], Loss: 0.6450
Epoch [3/60], Step [15/19], Loss: 0.6443
Epoch [4/60], Step [5/19], Loss: 0.6441
Epoch [4/60], Step [10/19], Loss: 0.6418
Epoch [4/60], Step [15/19], Loss: 0.6399
Epoch [5/60], Step [5/19], Loss: 0.6408
Epoch [5/60], Step [10/19], Loss: 0.6418
Epoch [5/60], Step [15/19], Loss: 0.6444
Epoch [6/60], Step [5/19], Loss: 0.6413
Epoch [6/60], Step [10/19], Loss: 0.6404
Epoch [6/60], Step [15/19], Loss: 0.6418
Epoch [7/60], Step [5/19], Loss: 0.6445
Epoch [7/60], Step [10/19], Loss: 0.6443
Epoch [7/60], Ste

In [12]:
trainAndTestSimpleModel('corona', num_epochs=75)


Extracting tokens....
there are 70 nan titles
there are 9 nan text

Extracting tokens....
there are 12 nan titles
there are 1 nan text
Data shape for text:  (291, 7417)
Epoch [1/75], Step [5/19], Loss: 0.6801
Epoch [1/75], Step [10/19], Loss: 0.6651
Epoch [1/75], Step [15/19], Loss: 0.6412
Epoch [2/75], Step [5/19], Loss: 0.6423
Epoch [2/75], Step [10/19], Loss: 0.6454
Epoch [2/75], Step [15/19], Loss: 0.6420
Epoch [3/75], Step [5/19], Loss: 0.6454
Epoch [3/75], Step [10/19], Loss: 0.6450
Epoch [3/75], Step [15/19], Loss: 0.6443
Epoch [4/75], Step [5/19], Loss: 0.6441
Epoch [4/75], Step [10/19], Loss: 0.6418
Epoch [4/75], Step [15/19], Loss: 0.6399
Epoch [5/75], Step [5/19], Loss: 0.6408
Epoch [5/75], Step [10/19], Loss: 0.6418
Epoch [5/75], Step [15/19], Loss: 0.6444
Epoch [6/75], Step [5/19], Loss: 0.6413
Epoch [6/75], Step [10/19], Loss: 0.6404
Epoch [6/75], Step [15/19], Loss: 0.6418
Epoch [7/75], Step [5/19], Loss: 0.6445
Epoch [7/75], Step [10/19], Loss: 0.6443
Epoch [7/75], Ste

In [13]:
trainAndTestSimpleModel('corona', num_epochs=100)


Extracting tokens....
there are 70 nan titles
there are 9 nan text

Extracting tokens....
there are 12 nan titles
there are 1 nan text
Data shape for text:  (291, 7417)
Epoch [1/100], Step [5/19], Loss: 0.6801
Epoch [1/100], Step [10/19], Loss: 0.6651
Epoch [1/100], Step [15/19], Loss: 0.6412
Epoch [2/100], Step [5/19], Loss: 0.6423
Epoch [2/100], Step [10/19], Loss: 0.6454
Epoch [2/100], Step [15/19], Loss: 0.6420
Epoch [3/100], Step [5/19], Loss: 0.6454
Epoch [3/100], Step [10/19], Loss: 0.6450
Epoch [3/100], Step [15/19], Loss: 0.6443
Epoch [4/100], Step [5/19], Loss: 0.6441
Epoch [4/100], Step [10/19], Loss: 0.6418
Epoch [4/100], Step [15/19], Loss: 0.6399
Epoch [5/100], Step [5/19], Loss: 0.6408
Epoch [5/100], Step [10/19], Loss: 0.6418
Epoch [5/100], Step [15/19], Loss: 0.6444
Epoch [6/100], Step [5/19], Loss: 0.6413
Epoch [6/100], Step [10/19], Loss: 0.6404
Epoch [6/100], Step [15/19], Loss: 0.6418
Epoch [7/100], Step [5/19], Loss: 0.6445
Epoch [7/100], Step [10/19], Loss: 0.64

In [18]:
trainAndTestSimpleModel('liar', num_epochs=5, print_epoch_mod=100)


Extracting tokens....

Extracting tokens....
Data shape for text:  (15052, 4893)
Epoch [1/5], Step [100/941], Loss: 0.7026
Epoch [1/5], Step [200/941], Loss: 0.6900
Epoch [1/5], Step [300/941], Loss: 0.6436
Epoch [1/5], Step [400/941], Loss: 0.6444
Epoch [1/5], Step [500/941], Loss: 0.6436
Epoch [1/5], Step [600/941], Loss: 0.6892
Epoch [1/5], Step [700/941], Loss: 0.7038
Epoch [1/5], Step [800/941], Loss: 0.6828
Epoch [1/5], Step [900/941], Loss: 0.6434
Epoch [2/5], Step [100/941], Loss: 0.7045
Epoch [2/5], Step [200/941], Loss: 0.6598
Epoch [2/5], Step [300/941], Loss: 0.6456
Epoch [2/5], Step [400/941], Loss: 0.6402
Epoch [2/5], Step [500/941], Loss: 0.6967
Epoch [2/5], Step [600/941], Loss: 0.6460
Epoch [2/5], Step [700/941], Loss: 0.7015
Epoch [2/5], Step [800/941], Loss: 0.6424
Epoch [2/5], Step [900/941], Loss: 0.6411
Epoch [3/5], Step [100/941], Loss: 0.6596
Epoch [3/5], Step [200/941], Loss: 0.6411
Epoch [3/5], Step [300/941], Loss: 0.6426
Epoch [3/5], Step [400/941], Loss: 0

In [19]:
trainAndTestSimpleModel('liar', num_epochs=10, print_epoch_mod=100)


Extracting tokens....

Extracting tokens....
Data shape for text:  (15052, 4893)
Epoch [1/10], Step [100/941], Loss: 0.7026
Epoch [1/10], Step [200/941], Loss: 0.6900
Epoch [1/10], Step [300/941], Loss: 0.6436
Epoch [1/10], Step [400/941], Loss: 0.6444
Epoch [1/10], Step [500/941], Loss: 0.6436
Epoch [1/10], Step [600/941], Loss: 0.6892
Epoch [1/10], Step [700/941], Loss: 0.7038
Epoch [1/10], Step [800/941], Loss: 0.6828
Epoch [1/10], Step [900/941], Loss: 0.6434
Epoch [2/10], Step [100/941], Loss: 0.7045
Epoch [2/10], Step [200/941], Loss: 0.6598
Epoch [2/10], Step [300/941], Loss: 0.6456
Epoch [2/10], Step [400/941], Loss: 0.6402
Epoch [2/10], Step [500/941], Loss: 0.6967
Epoch [2/10], Step [600/941], Loss: 0.6460
Epoch [2/10], Step [700/941], Loss: 0.7015
Epoch [2/10], Step [800/941], Loss: 0.6424
Epoch [2/10], Step [900/941], Loss: 0.6411
Epoch [3/10], Step [100/941], Loss: 0.6596
Epoch [3/10], Step [200/941], Loss: 0.6411
Epoch [3/10], Step [300/941], Loss: 0.6426
Epoch [3/10], S