In [13]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import string
import time

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import spacy

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saisi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Input - Utterance or Column '1'
# Emotion(7) - Output 1 and Column '3'
# Sentiment(3) - Output 2 and Column '4'
df1 = pd.read_csv('data1.csv')
df2 = pd.read_csv('data2.csv')
df3 = pd.read_csv('data3.csv')
df4 = pd.read_csv('data4.csv')

frames = [df1,df2,df3,df4]
data = pd.concat(frames) #complete dataset

mat =np.asarray([]) #emo
mat1 =np.asarray([]) #senti
data = np.asarray(data[['1','3','4']])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(data[:,0], data[:,1:3], test_size=0.2, shuffle = True)

X_train_copy1 = X_train.copy()
X_test_copy1 = X_test.copy()

nlp = spacy.load('en_core_web_lg')

total_vectors = len(nlp.vocab.vectors)
print('Total word vectors:', total_vectors)

y_train_emo = y_train[:,0] *1.0
y_train_emo = y_train_emo.astype('int')
y_test_emo = y_test[:,0] *1.0
y_test_emo = y_test_emo.astype('int')

y_train_senti = y_train[:,1] *1.0
y_train_senti = y_train_senti.astype('int')
y_test_senti = y_test[:,1] *1.0
y_test_senti = y_test_senti.astype('int')

Total word vectors: 684830


In [17]:
doc_glove_vectors = np.array([nlp(str(doc)).vector for doc in X_train_copy1])
X_train_glove = np.zeros((doc_glove_vectors.shape[0], 300))
for i in range(doc_glove_vectors.shape[0]):
    if (doc_glove_vectors[i].shape[0] == 300):
        X_train_glove[i,:] = doc_glove_vectors[i][:]
    else:
        print(i)
doc_glove_vectors2 = np.array([nlp(str(doc)).vector for doc in X_test_copy1])
X_test_glove = np.zeros((doc_glove_vectors2.shape[0], 300))
for i in range(doc_glove_vectors2.shape[0]):
    if (doc_glove_vectors2[i].shape[0] == 300):
        X_test_glove[i,:] = doc_glove_vectors2[i][:]
    else:
        print(i)

In [27]:
scaler = MinMaxScaler()
def logistic_regression(X_train, y_train, X_test, y_test):
    print("Multinomial Logistic Regression")
    start = time.time()
    logreg = LogisticRegression(multi_class='multinomial', max_iter = 1e4)
    logreg.fit(X_train, y_train)
    print("Time Taken :", (time.time()-start))
    accuracy_train_data = cross_val_score(logreg, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    accuracy_test_data = cross_val_score(logreg, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    y_pred = logreg.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("F1-Score : ",f1)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.std(accuracy_train_data), np.mean(accuracy_test_data), np.std(accuracy_test_data), f1, time.time()-start

print("FOR EMOTIONS (7): ")
tup = logistic_regression(scaler.fit_transform(X_train_glove), y_train_emo, scaler.fit_transform(X_test_glove), y_test_emo)
d = np.asarray(tup)
print(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = logistic_regression(scaler.fit_transform(X_train_glove), y_train_senti, scaler.fit_transform(X_test_glove), y_test_senti)
d = np.asarray(tup)
print(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Multinomial Logistic Regression
Training Accuracy :  0.5920117503234997
Testing Accuracy :  0.5594458404797042
[[1005   38    3    2   31    3  172]
 [  94  130    2    0    4    0  108]
 [  40    3    6    0    1    0   23]
 [ 120    4    2    9    1    0   64]
 [ 188   18    0    0   74    0  217]
 [  29    3    0    1    1    4   37]
 [  81   16    3    0    3    0  202]]
F1-Score :  0.5215171407731582
Time Taken : 33.82595920562744
(0.5920117503234997, 0.006148802035596928, 0.5594458404797042, 0.007143008751892763, 0.5215171407731582, 33.82595920562744)

FOR SENTIMENTS (3): 
Multinomial Logistic Regression
Training Accuracy :  0.6484584455138398
Testing Accuracy :  0.6079454349647002
[[914  31 309]
 [203 123 340]
 [229  12 581]]
F1-Score :  0.5900802334062728
Time Taken : 19.58925151824951
(0.6484584455138398, 0.008410712348025163, 0.6079454349647002, 0.013639857048245019, 0.5900802334062728, 19.58925151824951)


In [28]:
scaler = MinMaxScaler()
def naive_bayes(X_train, y_train, X_test, y_test):
    start = time.time()
    print("Multinomial Naive Bayes")
    classifier = MultinomialNB();
    classifier.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(classifier, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    
    # predicting test set results
    y_pred = classifier.predict(X_test)
    accuracy_test_data = cross_val_score(classifier, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    
    # making the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print("Time Taken :", (time.time()-start))
    f1 = f1_score(y_test, y_pred, average='micro')
    print("F1-Score : ",f1)
    return np.mean(accuracy_train_data), np.std(accuracy_train_data), np.mean(accuracy_test_data), np.std(accuracy_test_data), f1, time.time()-start


print("FOR EMOTIONS (7): ")
tup = naive_bayes(scaler.fit_transform(X_train_glove), y_train_emo, scaler.fit_transform(X_test_glove), y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = naive_bayes(scaler.fit_transform(X_train_glove), y_train_senti, scaler.fit_transform(X_test_glove), y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Multinomial Naive Bayes
Training Accuracy :  0.4969906734820871
Testing Accuracy :  0.48576908247244494
[[1219   12    0    0   23    0    0]
 [ 262   38    0    0   38    0    0]
 [  70    2    0    0    1    0    0]
 [ 196    1    0    0    3    0    0]
 [ 429    1    0    0   67    0    0]
 [  73    0    0    0    2    0    0]
 [ 287    1    0    0   17    0    0]]
Time Taken : 0.10944962501525879
F1-Score :  0.4828592268417214

FOR SENTIMENTS (3): 
Multinomial Naive Bayes
Training Accuracy :  0.522980553439073
Testing Accuracy :  0.5196887506149203
[[1215   23   16]
 [ 513   98   55]
 [ 691   47   84]]
Time Taken : 0.10347843170166016
F1-Score :  0.5094821298322393


In [21]:
def random_forest(X_train, y_train, X_test, y_test):
    start = time.time()
    print("Random Forest Classifier")
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(rf, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))

    # predicting test set results
    y_pred = rf.predict(X_test)
    accuracy_test_data = cross_val_score(rf, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    # making the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("F1-Score : ",f1)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.std(accuracy_train_data), np.mean(accuracy_test_data), np.std(accuracy_test_data), f1, time.time()-start



print("FOR EMOTIONS (7): ")
tup = random_forest(X_train_glove, y_train_emo, X_test_glove, y_test_emo)
d = np.asarray(tup)
print(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup= random_forest(X_train_glove, y_train_senti, X_test_glove, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)
print(tup)

FOR EMOTIONS (7): 
Random Forest Classifier
Training Accuracy :  0.5719501970511127
Testing Accuracy :  0.5419402230997301
[[1182   24    0    4   34    0   10]
 [ 166  117    0    0   44    0   11]
 [  57    4    1    0    6    0    5]
 [ 159    8    0    9   16    0    8]
 [ 290   11    0    0  181    1   14]
 [  54    3    0    0   10    0    8]
 [ 184   14    0    1   51    0   55]]
F1-Score :  0.563457330415755
Time Taken : 60.72662568092346
(0.5719501970511127, 0.0061161177068875545, 0.5419402230997301, 0.006024677876830581, 0.563457330415755, 60.72662568092346)

FOR SENTIMENTS (3): 
Random Forest Classifier
Training Accuracy :  0.6346892677912359
Testing Accuracy :  0.5988293247178014
[[1108   38  108]
 [ 289  215  162]
 [ 407   86  329]]
F1-Score :  0.6024799416484318
Time Taken : 54.4398512840271
(0.6346892677912359, 0.0035942320133563617, 0.5988293247178014, 0.012945255007082053, 0.6024799416484318, 54.4398512840271)


In [None]:
def sv_classifier(X_train, y_train, X_test, y_test):
    start = time.time()
    print("Support Vector Classifier")
    clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(clf, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    
    # predicting test set results
    y_pred = clf.predict(X_test)
    accuracy_test_data = cross_val_score(clf, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    # making the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("F1-Score : ",f1)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.std(accuracy_train_data), np.mean(accuracy_test_data), np.std(accuracy_test_data), f1, time.time()-start


print("FOR EMOTIONS (7): ")
tup = sv_classifier(X_train_glove, y_train_emo, X_test_glove, y_test_emo)
print(tup)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = sv_classifier(X_train_glove, y_train_senti, X_test_glove, y_test_senti)
d = np.asarray(tup)
print(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Support Vector Classifier
Training Accuracy :  0.6077877692384112
Testing Accuracy :  0.5762155478441227
[[1165   23    0    2   43    0   21]
 [ 138  134    0    1   47    0   18]
 [  51    4    2    2    4    0   10]
 [ 139    8    0   17   18    0   18]
 [ 229   15    0    0  213    0   40]
 [  47    8    0    1    5    0   14]
 [ 138   21    0    2   49    0   95]]
F1-Score :  0.5929978118161926
Time Taken : 100.86223578453064
(0.6077877692384112, 0.0050097755970676386, 0.5762155478441227, 0.011786676730148283, 0.5929978118161926, 100.86223578453064)

FOR SENTIMENTS (3): 
Support Vector Classifier
Training Accuracy :  0.6668797836490599
Testing Accuracy :  0.6269062529084068
[[1100   43  111]
 [ 248  273  145]
 [ 322   78  422]]
F1-Score :  0.6546316557257477
Time Taken : 89.78336358070374
(0.6668797836490599, 0.006719201969387679, 0.6269062529084068, 0.011485915978125406, 0.6546316557257477, 89.7843656539917)


In [7]:
Y_train_senti = np.zeros((y_train_senti.shape[0], 3))
for i in range(y_train_senti.shape[0]):
    if y_train_senti[i] == 0:
        Y_train_senti[i,:] = [1, 0, 0]
    if y_train_senti[i] == 1:
        Y_train_senti[i,:] = [0, 1, 0]
    if y_train_senti[i] == 2:
        Y_train_senti[i,:] = [0, 0, 1]

In [8]:
Y_test_senti = np.zeros((y_test_senti.shape[0], 3))
for i in range(y_test_senti.shape[0]):
    if y_test_senti[i] == 0:
        Y_test_senti[i,:] = [1, 0, 0]
    if y_test_senti[i] == 1:
        Y_test_senti[i,:] = [0, 1, 0]
    if y_test_senti[i] == 2:
        Y_test_senti[i,:] = [0, 0, 1]

In [50]:
print(X_test_glove.shape)

(2742, 300)


In [76]:
import numpy as np
import scipy.io
import torch
import torchvision
from torch import nn, optim
import torch.nn.functional as F  # a lower level (compared to torch.nn) interface
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from time import time
from torch.utils.data import TensorDataset, DataLoader


torch.manual_seed(123)
device = torch.device("cpu")

train_x, train_y = X_train_glove.astype(np.float32), Y_train_senti.astype(np.int)
valid_x, valid_y = X_test_glove.astype(np.float32), Y_test_senti.astype(np.int)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.fc1 = nn.Linear(300, 3000)
        self.fc2 = nn.Linear(3000, 2000)
        self.fc3 = nn.Linear(2000, 1000)
        self.fc4 = nn.Linear(1000, 500)
        self.fc5 = nn.Linear(500, 300)
        self.fc6 = nn.Linear(300, 3)
#         self.fc5 = nn.Linear(2000, 1000)
#         self.fc6 = nn.Linear(1000, 500)
#         self.fc7 = nn.Linear(500, 3)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc3(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc4(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc5(x))
        x = F.dropout(x, training=self.training)
        x = self.fc6(x)
        return x
        

model = Net()
optimizer = optim.SGD(model.parameters(), lr=0.0005, momentum=0.9)

givendata_train = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
givendata_test = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
trainset_loader = DataLoader(givendata_train, batch_size=64, shuffle=True, num_workers=1)
validset_loader = DataLoader(givendata_test, batch_size=64, shuffle=True, num_workers=1)


def train(max_iters):
    model.train()
    Taccuracies = []
    Tlosses = []
    for itr in range(max_iters):
        correct = 0
        Tloss = 0
        num = 0
        for batch_idx, (data, target) in enumerate(trainset_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            target = torch.max(target, 1)[1]
            loss = nn.functional.cross_entropy(output, target)            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            Tloss = Tloss + loss.item()

            pred = torch.max(output, 1)[1] 
            correct += pred.eq(target).sum().item()
            num = num + 1
        
        Taccuracies.append(100. * correct / (num*64))
        Tlosses.append(Tloss/num)
        if itr % 50 == 0:
            print('Accuracy {:.2f} %'.format(100. * correct / (num*64)))
            print('Loss: {:.6f}'.format(Tloss/num))
            
    return Taccuracies, Tlosses

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [18]:
import numpy as np
import scipy.io
import torch
import torchvision
from torch import nn, optim
import torch.nn.functional as F  # a lower level (compared to torch.nn) interface
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from time import time
from torch.utils.data import TensorDataset, DataLoader
from torchsummary import summary


torch.manual_seed(123)
device = torch.device("cpu")

train_x, train_y = X_train_glove.astype(np.float32), Y_train_senti.astype(np.int)
valid_x, valid_y = X_test_glove.astype(np.float32), Y_test_senti.astype(np.int)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.fc1 = nn.Linear(300, 3000)
        self.fc2 = nn.Linear(3000, 2000)
        self.fc3 = nn.Linear(2000, 1000)
        self.fc4 = nn.Linear(1000, 500)
        self.fc5 = nn.Linear(500, 300)
        self.fc6 = nn.Linear(300, 3)
#         self.fc5 = nn.Linear(2000, 1000)
#         self.fc6 = nn.Linear(1000, 500)
#         self.fc7 = nn.Linear(500, 3)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc3(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc4(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc5(x))
        x = F.dropout(x, training=self.training)
        x = self.fc6(x)
        return x
        

model = Net()
optimizer = optim.SGD(model.parameters(), lr=0.0005, momentum=0.9)
print(summary(model))

givendata_train = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
givendata_test = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
trainset_loader = DataLoader(givendata_train, batch_size=64, shuffle=True, num_workers=1)
validset_loader = DataLoader(givendata_test, batch_size=64, shuffle=True, num_workers=1)

Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            903,000
├─Linear: 1-2                            6,002,000
├─Linear: 1-3                            2,001,000
├─Linear: 1-4                            500,500
├─Linear: 1-5                            150,300
├─Linear: 1-6                            903
Total params: 9,557,703
Trainable params: 9,557,703
Non-trainable params: 0
Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            903,000
├─Linear: 1-2                            6,002,000
├─Linear: 1-3                            2,001,000
├─Linear: 1-4                            500,500
├─Linear: 1-5                            150,300
├─Linear: 1-6                            903
Total params: 9,557,703
Trainable params: 9,557,703
Non-trainable params: 0


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [77]:
Taccuracies, Tlosses = train(300)

Accuracy 42.81 %
Loss: 1.088486
Accuracy 47.00 %
Loss: 1.005750
Accuracy 59.87 %
Loss: 0.849135
Accuracy 61.63 %
Loss: 0.822001
Accuracy 66.51 %
Loss: 0.757556
Accuracy 74.02 %
Loss: 0.615821


In [78]:
model.eval()
Taccuracies = []
Tlosses = []
with torch.no_grad():
    correct = 0
    Tloss = 0
    num = 0
    for batch_idx, (data, target) in enumerate(validset_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        target = torch.max(target, 1)[1]
        loss = nn.functional.cross_entropy(output, target)            
        Tloss = Tloss + loss.item()

        pred = torch.max(output, 1)[1] 
        correct += pred.eq(target).sum().item()
        num = num + 1

    Taccuracies.append(100. * correct / (num*64))
    Tlosses.append(Tloss/num)
    print('Accuracy {:.2f} %'.format(100. * correct / (num*64)))
    print('Loss: {:.6f}'.format(Tloss/num))

Accuracy 62.03 %
Loss: 1.054303


In [22]:
Y_train_emo = np.zeros((y_train_emo.shape[0], 7))
for i in range(y_train_emo.shape[0]):
    if y_train_emo[i] == 0:
        Y_train_emo[i,:] = [1, 0, 0, 0, 0, 0, 0]
    if y_train_emo[i] == 1:
        Y_train_emo[i,:] = [0, 1, 0, 0, 0, 0, 0]
    if y_train_emo[i] == 2:
        Y_train_emo[i,:] = [0, 0, 1, 0, 0, 0, 0]
    if y_train_emo[i] == 3:
        Y_train_emo[i,:] = [0, 0, 0, 1, 0, 0, 0]
    if y_train_emo[i] == 4:
        Y_train_emo[i,:] = [0, 0, 0, 0, 1, 0, 0]
    if y_train_emo[i] == 5:
        Y_train_emo[i,:] = [0, 0, 0, 0, 0, 1, 0]
    if y_train_emo[i] == 6:
        Y_train_emo[i,:] = [0, 0, 0, 0, 0, 0, 1]
        
        
Y_test_emo = np.zeros((y_test_emo.shape[0], 7))
for i in range(y_test_emo.shape[0]):
    if y_test_emo[i] == 0:
        Y_test_emo[i,:] = [1, 0, 0, 0, 0, 0, 0]
    if y_test_emo[i] == 1:
        Y_test_emo[i,:] = [0, 1, 0, 0, 0, 0, 0]
    if y_test_emo[i] == 2:
        Y_test_emo[i,:] = [0, 0, 1, 0, 0, 0, 0]
    if y_test_emo[i] == 3:
        Y_test_emo[i,:] = [0, 0, 0, 1, 0, 0, 0]
    if y_test_emo[i] == 4:
        Y_test_emo[i,:] = [0, 0, 0, 0, 1, 0, 0]
    if y_test_emo[i] == 5:
        Y_test_emo[i,:] = [0, 0, 0, 0, 0, 1, 0]
    if y_test_emo[i] == 6:
        Y_test_emo[i,:] = [0, 0, 0, 0, 0, 0, 1]

In [25]:
import numpy as np
import scipy.io
import torch
import torchvision
from torch import nn, optim
import torch.nn.functional as F  # a lower level (compared to torch.nn) interface
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from time import time
from torch.utils.data import TensorDataset, DataLoader


torch.manual_seed(123)
device = torch.device("cpu")

train_x, train_y = X_train_glove.astype(np.float32), Y_train_emo.astype(np.int)
valid_x, valid_y = X_test_glove.astype(np.float32), Y_test_emo.astype(np.int)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.fc1 = nn.Linear(300, 3000)
        self.fc2 = nn.Linear(3000, 2000)
        self.fc3 = nn.Linear(2000, 1000)
        self.fc4 = nn.Linear(1000, 500)
        self.fc5 = nn.Linear(500, 300)
        self.fc6 = nn.Linear(300, 7)
#         self.fc5 = nn.Linear(2000, 1000)
#         self.fc6 = nn.Linear(1000, 500)
#         self.fc7 = nn.Linear(500, 3)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc3(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc4(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc5(x))
        x = F.dropout(x, training=self.training)
        x = self.fc6(x)
        return x
        

model = Net()
optimizer = optim.SGD(model.parameters(), lr=0.0005, momentum=0.9)

givendata_train = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
givendata_test = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
trainset_loader = DataLoader(givendata_train, batch_size=64, shuffle=True, num_workers=1)
validset_loader = DataLoader(givendata_test, batch_size=64, shuffle=True, num_workers=1)


def train(max_iters):
    model.train()
    Taccuracies = []
    Tlosses = []
    for itr in range(max_iters):
        correct = 0
        Tloss = 0
        num = 0
        for batch_idx, (data, target) in enumerate(trainset_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            target = torch.max(target, 1)[1]
            loss = nn.functional.cross_entropy(output, target)            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            Tloss = Tloss + loss.item()

            pred = torch.max(output, 1)[1] 
            correct += pred.eq(target).sum().item()
            num = num + 1
        
        Taccuracies.append(100. * correct / (num*64))
        Tlosses.append(Tloss/num)
        if itr % 50 == 0:
            print('Accuracy {:.2f} %'.format(100. * correct / (num*64)))
            print('Loss: {:.6f}'.format(Tloss/num))
            
    return Taccuracies, Tlosses

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [26]:
Taccuracies, Tlosses = train(300)

Accuracy 46.02 %
Loss: 1.864360
Accuracy 52.52 %
Loss: 1.395798
Accuracy 54.28 %
Loss: 1.322285
Accuracy 54.38 %
Loss: 1.290819
Accuracy 55.13 %
Loss: 1.239628
Accuracy 59.96 %
Loss: 1.098693


In [27]:
model.eval()
Taccuracies = []
Tlosses = []
with torch.no_grad():
    correct = 0
    Tloss = 0
    num = 0
    for batch_idx, (data, target) in enumerate(validset_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        target = torch.max(target, 1)[1]
        loss = nn.functional.cross_entropy(output, target)            
        Tloss = Tloss + loss.item()

        pred = torch.max(output, 1)[1] 
        correct += pred.eq(target).sum().item()
        num = num + 1

    Taccuracies.append(100. * correct / (num*64))
    Tlosses.append(Tloss/num)
    print('Accuracy {:.2f} %'.format(100. * correct / (num*64)))
    print('Loss: {:.6f}'.format(Tloss/num))

Accuracy 54.98 %
Loss: 1.431657


### Feature Engineering ---- Seems useless for GLOVE vectors

In [23]:
def punctuation_remover(x):
    X_t1 = x
    for i in range(len(x)):
        X_t1[i] = [char for char in x[i] if char not in string.punctuation]
        X_t1[i]=''.join(X_t1[i])
    return X_t1

#All useless punctuations are removed
X_tr1 = punctuation_remover(X_train_copy1)
X_te1 = punctuation_remover(X_test_copy1)

doc_glove_vectors = np.array([nlp(str(doc)).vector for doc in X_tr1])
X_train_glove = np.zeros((doc_glove_vectors.shape[0], 300))
for i in range(doc_glove_vectors.shape[0]):
    if (doc_glove_vectors[i].shape[0] == 300):
        X_train_glove[i,:] = doc_glove_vectors[i][:]
    else:
        print(i)
doc_glove_vectors2 = np.array([nlp(str(doc)).vector for doc in X_te1])
X_test_glove = np.zeros((doc_glove_vectors2.shape[0], 300))
for i in range(doc_glove_vectors2.shape[0]):
    if (doc_glove_vectors2[i].shape[0] == 300):
        X_test_glove[i,:] = doc_glove_vectors2[i][:]
    else:
        print(i)

In [24]:
def logistic_regression(X_train, y_train, X_test, y_test):
    print("Multinomial Logistic Regression")
    start = time.time()
    logreg = LogisticRegression(multi_class='multinomial', max_iter = 1e4)
    logreg.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(logreg, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    accuracy_test_data = cross_val_score(logreg, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    y_pred = logreg.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.mean(accuracy_test_data), time.time()-start

print("FOR EMOTIONS (7): ")
tup = logistic_regression(X_train_glove, y_train_emo, X_test_glove, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = logistic_regression(X_train_glove, y_train_senti, X_test_glove, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Multinomial Logistic Regression
Training Accuracy :  0.5116734234767872
Testing Accuracy :  0.4894207118450268
[[1116   37    1   19   50    3   23]
 [ 196  110    0    9   14    0   15]
 [  54    3    2    3    1    0    4]
 [ 153    8    0   17    5    1   10]
 [ 366   13    1    6   92    1   12]
 [  58    6    0    2    0    6    7]
 [ 258   13    1    4   10    2   30]]
Time Taken : 16.00754189491272

FOR SENTIMENTS (3): 
Multinomial Logistic Regression
Training Accuracy :  0.5519780556431939
Testing Accuracy :  0.5321001688537885
[[975  96 178]
 [362 180 125]
 [433  44 349]]
Time Taken : 9.973003625869751


In [25]:
def random_forest(X_train, y_train, X_test, y_test):
    start = time.time()
    print("Random Forest Classifier")
    rf = RandomForestClassifier(random_state=42, n_estimators  = 180)
    rf.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(rf, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))

    # predicting test set results
    y_pred = rf.predict(X_test)
    accuracy_test_data = cross_val_score(rf, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    # making the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.mean(accuracy_test_data), time.time()-start


print("FOR EMOTIONS (7): ")
tup = random_forest(X_train_glove, y_train_emo, X_test_glove, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup= random_forest(X_train_glove, y_train_senti, X_test_glove, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Random Forest Classifier
Training Accuracy :  0.5100308805551433
Testing Accuracy :  0.48067488333133895
[[1208   12    2    2   20    0    5]
 [ 250   77    1    1    9    1    5]
 [  60    4    1    0    0    0    2]
 [ 180    5    1    6    1    1    0]
 [ 411    7    1    2   69    0    1]
 [  71    2    1    1    1    2    1]
 [ 295    5    0    1    3    0   14]]
Time Taken : 151.55781745910645

FOR SENTIMENTS (3): 
Random Forest Classifier
Training Accuracy :  0.5430423145493596
Testing Accuracy :  0.5120364830547911
[[1092   37  120]
 [ 452  136   79]
 [ 544   22  260]]
Time Taken : 130.1093897819519


In [26]:
def sv_classifier(X_train, y_train, X_test, y_test):
    start = time.time()
    print("Support Vector Classifier")
    clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(clf, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    
    # predicting test set results
    y_pred = clf.predict(X_test)
    accuracy_test_data = cross_val_score(clf, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    # making the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.mean(accuracy_test_data), time.time()-start

print("FOR EMOTIONS (7): ")
tup = sv_classifier(X_train_glove, y_train_emo, X_test_glove, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = sv_classifier(X_train_glove, y_train_senti, X_test_glove, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Support Vector Classifier
Training Accuracy :  0.5186937304866192
Testing Accuracy :  0.49599005491072023
[[1218    9    0    3   17    0    2]
 [ 238   94    0    0    9    0    3]
 [  62    3    0    1    0    0    1]
 [ 171    8    0   13    1    1    0]
 [ 396    7    1    2   84    0    1]
 [  71    4    0    1    0    1    2]
 [ 294    6    0    1    6    0   11]]
Time Taken : 128.2013020515442

FOR SENTIMENTS (3): 
Support Vector Classifier
Training Accuracy :  0.561918027900991
Testing Accuracy :  0.5466913964341271
[[1068   50  131]
 [ 389  173  105]
 [ 460   27  339]]
Time Taken : 115.29567837715149


In [27]:
StopWords = set(stopwords.words('english'))
#print(StopWords)

#train2 and test2 after stopwords removed 
def stopwords_remover(x):
    for i in range(len(x)):
        x[i] = [word for word in x[i].split() if word.lower() not in StopWords]
        
    a = ""
    for i in range(len(x)):
        for word in x[i]:
            #print(word)
            a = a +word +" "
        x[i]=a
        a=""
    return x

X_tr1 = stopwords_remover(X_train_copy1)
X_te1 = stopwords_remover(X_test_copy1)

doc_glove_vectors = np.array([nlp(str(doc)).vector for doc in X_tr1])
X_train_glove = np.zeros((doc_glove_vectors.shape[0], 300))
for i in range(doc_glove_vectors.shape[0]):
    if (doc_glove_vectors[i].shape[0] == 300):
        X_train_glove[i,:] = doc_glove_vectors[i][:]
    else:
        print(i)
doc_glove_vectors2 = np.array([nlp(str(doc)).vector for doc in X_te1])
X_test_glove = np.zeros((doc_glove_vectors2.shape[0], 300))
for i in range(doc_glove_vectors2.shape[0]):
    if (doc_glove_vectors2[i].shape[0] == 300):
        X_test_glove[i,:] = doc_glove_vectors2[i][:]
    else:
        print(i)

In [28]:
def logistic_regression(X_train, y_train, X_test, y_test):
    print("Multinomial Logistic Regression")
    start = time.time()
    logreg = LogisticRegression(multi_class='multinomial', max_iter = 1e4)
    logreg.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(logreg, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    accuracy_test_data = cross_val_score(logreg, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    y_pred = logreg.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.mean(accuracy_test_data), time.time()-start

print("FOR EMOTIONS (7): ")
tup = logistic_regression(X_train_glove, y_train_emo, X_test_glove, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = logistic_regression(X_train_glove, y_train_senti, X_test_glove, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Multinomial Logistic Regression
Training Accuracy :  0.5041042165737424
Testing Accuracy :  0.45697419329105343
[[1115   36    2   19   44    4   29]
 [ 213   81    1    8   18    1   22]
 [  56    3    0    3    0    0    5]
 [ 151    6    0   17    6    3   11]
 [ 365   15    1    7   96    1    6]
 [  55    6    1    3    0    7    7]
 [ 242   11    3    6   18    5   33]]
Time Taken : 14.356157779693604

FOR SENTIMENTS (3): 
Multinomial Logistic Regression
Training Accuracy :  0.543223382927613
Testing Accuracy :  0.5233523460040154
[[1003   87  159]
 [ 386  183   98]
 [ 448   60  318]]
Time Taken : 8.695868253707886


In [29]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

porter_stemmer = PorterStemmer()

def stem_sentences(x):
    sen_list = []
    for sentence in x:
        tokens = sentence.split()
#         print(tokens)
        stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
#         print(stemmed_tokens)
        stemmed_string =  ' '.join(stemmed_tokens)
        sen_list.append(stemmed_string)
    sen_list_arr = np.asarray(sen_list)
    return sen_list_arr

X_tr1 = stem_sentences(X_train_copy1)
X_te1 = stem_sentences(X_test_copy1)

doc_glove_vectors = np.array([nlp(str(doc)).vector for doc in X_tr1])
X_train_glove = np.zeros((doc_glove_vectors.shape[0], 300))
for i in range(doc_glove_vectors.shape[0]):
    if (doc_glove_vectors[i].shape[0] == 300):
        X_train_glove[i,:] = doc_glove_vectors[i][:]
    else:
        print(i)
doc_glove_vectors2 = np.array([nlp(str(doc)).vector for doc in X_te1])
X_test_glove = np.zeros((doc_glove_vectors2.shape[0], 300))
for i in range(doc_glove_vectors2.shape[0]):
    if (doc_glove_vectors2[i].shape[0] == 300):
        X_test_glove[i,:] = doc_glove_vectors2[i][:]
    else:
        print(i)

In [30]:
def logistic_regression(X_train, y_train, X_test, y_test):
    print("Multinomial Logistic Regression")
    start = time.time()
    logreg = LogisticRegression(multi_class='multinomial', max_iter = 1e4)
    logreg.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(logreg, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    accuracy_test_data = cross_val_score(logreg, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    y_pred = logreg.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.mean(accuracy_test_data), time.time()-start

print("FOR EMOTIONS (7): ")
tup = logistic_regression(X_train_glove, y_train_emo, X_test_glove, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = logistic_regression(X_train_glove, y_train_senti, X_test_glove, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Multinomial Logistic Regression
Training Accuracy :  0.5017330771107706
Testing Accuracy :  0.45369085131559705
[[1112   28    3   22   54    4   26]
 [ 227   80    1    7   14    1   14]
 [  55    4    0    3    0    0    5]
 [ 157    7    1   16    6    5    2]
 [ 373   15    1    8   83    2    9]
 [  57    7    0    2    1    7    5]
 [ 261    8    2    5   15    2   25]]
Time Taken : 13.163366556167603

FOR SENTIMENTS (3): 
Multinomial Logistic Regression
Training Accuracy :  0.5399411652473416
Testing Accuracy :  0.4817790807440203
[[998  91 160]
 [400 176  91]
 [487  54 285]]
Time Taken : 7.963275194168091


In [31]:
X_tr1 = stem_sentences(stopwords_remover(punctuation_remover(X_train_copy1)))
X_te1 = stem_sentences(stopwords_remover(punctuation_remover(X_test_copy1)))

doc_glove_vectors = np.array([nlp(str(doc)).vector for doc in X_tr1])
X_train_glove = np.zeros((doc_glove_vectors.shape[0], 300))
for i in range(doc_glove_vectors.shape[0]):
    if (doc_glove_vectors[i].shape[0] == 300):
        X_train_glove[i,:] = doc_glove_vectors[i][:]
    else:
        print(i)
doc_glove_vectors2 = np.array([nlp(str(doc)).vector for doc in X_te1])
X_test_glove = np.zeros((doc_glove_vectors2.shape[0], 300))
for i in range(doc_glove_vectors2.shape[0]):
    if (doc_glove_vectors2[i].shape[0] == 300):
        X_test_glove[i,:] = doc_glove_vectors2[i][:]
    else:
        print(i)

In [32]:
def logistic_regression(X_train, y_train, X_test, y_test):
    print("Multinomial Logistic Regression")
    start = time.time()
    logreg = LogisticRegression(multi_class='multinomial', max_iter = 1e4)
    logreg.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(logreg, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    accuracy_test_data = cross_val_score(logreg, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    y_pred = logreg.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.mean(accuracy_test_data), time.time()-start

print("FOR EMOTIONS (7): ")
tup = logistic_regression(X_train_glove, y_train_emo, X_test_glove, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = logistic_regression(X_train_glove, y_train_senti, X_test_glove, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Multinomial Logistic Regression
Training Accuracy :  0.5017330771107706
Testing Accuracy :  0.45369085131559705
[[1112   28    3   22   54    4   26]
 [ 227   80    1    7   14    1   14]
 [  55    4    0    3    0    0    5]
 [ 157    7    1   16    6    5    2]
 [ 373   15    1    8   83    2    9]
 [  57    7    0    2    1    7    5]
 [ 261    8    2    5   15    2   25]]
Time Taken : 13.201356887817383

FOR SENTIMENTS (3): 
Multinomial Logistic Regression
Training Accuracy :  0.5399411652473416
Testing Accuracy :  0.4817790807440203
[[998  91 160]
 [400 176  91]
 [487  54 285]]
Time Taken : 7.921770334243774


In [33]:
def sv_classifier(X_train, y_train, X_test, y_test):
    start = time.time()
    print("Support Vector Classifier")
    clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(clf, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    
    # predicting test set results
    y_pred = clf.predict(X_test)
    accuracy_test_data = cross_val_score(clf, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    # making the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.mean(accuracy_test_data), time.time()-start

print("FOR EMOTIONS (7): ")
tup = sv_classifier(X_train_glove, y_train_emo, X_test_glove, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = sv_classifier(X_train_glove, y_train_senti, X_test_glove, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Support Vector Classifier
Training Accuracy :  0.5169609859164883
Testing Accuracy :  0.48067355377394866
[[1206    7    1    7   22    1    5]
 [ 268   58    0    4    8    0    6]
 [  62    3    0    2    0    0    0]
 [ 168    6    0   14    3    3    0]
 [ 406    8    0    5   71    1    0]
 [  69    3    0    0    1    2    4]
 [ 291    7    0    1    8    1   10]]
Time Taken : 133.3147828578949

FOR SENTIMENTS (3): 
Support Vector Classifier
Training Accuracy :  0.550245976154342
Testing Accuracy :  0.5215222102562057
[[1077   41  131]
 [ 426  152   89]
 [ 520   31  275]]
Time Taken : 104.24002289772034
