In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import string
import time

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import f1_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import spacy

In [2]:
mat =np.asarray([]) #emo
mat1 =np.asarray([]) #senti
data = pd.read_csv('audio_6373_features.csv')
data = np.asarray(data)

In [3]:
output = data[:,4:6] #emotions #sentiments

In [4]:
inp = data[:,13:] #input-features

In [5]:
inp.shape

(13837, 6373)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(inp, output, test_size=0.2, shuffle = True)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

X_train_copy1 = X_train.copy()
X_test_copy1 = X_test.copy()

# emotion (7)
y_train_emo = y_train[:,0] *1.0
y_train_emo = y_train_emo.astype('float')
y_test_emo = y_test[:,0] *1.0
y_test_emo = y_test_emo.astype('float')

# sentiment(3)
y_train_senti = y_train[:,1] *1.0
y_train_senti = y_train_senti.astype('float')
y_test_senti = y_test[:,1] *1.0
y_test_senti = y_test_senti.astype('float')

In [9]:
def random_forest(X_train, y_train, X_test, y_test):
    start = time.time()
    print("Random Forest Classifier")
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(rf, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))

    # predicting test set results
    y_pred = rf.predict(X_test)
    accuracy_test_data = cross_val_score(rf, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    # making the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("F1-Score : ",f1)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.std(accuracy_train_data), np.mean(accuracy_test_data), np.std(accuracy_test_data), f1, time.time()-start


print("FOR EMOTIONS (7): ")
tup = random_forest(X_train, y_train_emo, X_test, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup= random_forest(X_train, y_train_senti, X_test, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Random Forest Classifier
Training Accuracy :  0.4742073507495129
Testing Accuracy :  0.4877204091891292
[[1031   91    0   15  140    0   60]
 [ 213   39    0    4   37    0   20]
 [  45    6    0    1    3    0    9]
 [ 142   22    0    7   17    0   10]
 [ 344   27    0    3   49    0   40]
 [  57    6    0    1    7    0    8]
 [ 193   30    0    4   35    0   52]]
F1-Score :  0.42557803468208094
Time Taken : 359.60492420196533

FOR SENTIMENTS (3): 
Random Forest Classifier
Training Accuracy :  0.5070915845474164
Testing Accuracy :  0.507587102839125
[[567  88 682]
 [213  62 341]
 [253  51 511]]
F1-Score :  0.41184971098265893
Time Taken : 298.87330412864685


In [10]:
mat =np.asarray([]) #emo
mat1 =np.asarray([]) #senti
data = pd.read_csv('audio_6373_features.csv')

In [11]:
df1 = pd.read_csv('data1.csv')
df2 = pd.read_csv('data2.csv')
df3 = pd.read_csv('data3.csv')
df4 = pd.read_csv('data4.csv')

frames = [df1,df2,df3,df4]
data_t = pd.concat(frames) #complete dataset

mat =np.asarray([]) #emo
mat1 =np.asarray([]) #senti
data_text = np.asarray(data_t[['1','3','4']])

In [12]:
data = data.rename(columns={"string": "id"})

In [13]:
data_t = data_t.rename(columns={"11": "id"})

In [14]:
mergeddata = data_t.merge(data, how='inner', on='id')

In [15]:
mergeddata = np.array(mergeddata)

In [16]:
nlp = spacy.load('en_core_web_lg')

total_vectors = len(nlp.vocab.vectors)
print('Total word vectors:', total_vectors)
doc_glove_vectors = np.array([nlp(str(doc)).vector for doc in mergeddata[:,2]])
X_glove = np.zeros((doc_glove_vectors.shape[0], 300))
for i in range(doc_glove_vectors.shape[0]):
    if (doc_glove_vectors[i].shape[0] == 300):
        X_glove[i,:] = doc_glove_vectors[i][:]
    else:
        print(i)

Total word vectors: 684830


In [17]:
from sklearn.decomposition import PCA

In [18]:
pca = PCA(n_components=300, svd_solver='full')
X_pca = pca.fit_transform(mergeddata[:,-6373:].copy())
print(sum(pca.explained_variance_ratio_))

0.9999999999999996


In [19]:
feat_combined = np.concatenate((scaler.fit_transform(X_glove), scaler.fit_transform(X_pca)), axis = 1)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(feat_combined, mergeddata[:,4:6], test_size=0.2, shuffle = True)
# emotion (7)
y_train_emo = y_train[:,0] *1.0
y_train_emo = y_train_emo.astype('float')
y_test_emo = y_test[:,0] *1.0
y_test_emo = y_test_emo.astype('float')

# sentiment(3)
y_train_senti = y_train[:,1] *1.0
y_train_senti = y_train_senti.astype('float')
y_test_senti = y_test[:,1] *1.0
y_test_senti = y_test_senti.astype('float')

In [21]:
def random_forest(X_train, y_train, X_test, y_test):
    start = time.time()
    print("Random Forest Classifier")
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(rf, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))

    # predicting test set results
    y_pred = rf.predict(X_test)
    accuracy_test_data = cross_val_score(rf, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    # making the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("F1-Score : ",f1)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.std(accuracy_train_data), np.mean(accuracy_test_data), np.std(accuracy_test_data), f1, time.time()-start


print("FOR EMOTIONS (7): ")
tup = random_forest(X_train, y_train_emo, X_test, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup= random_forest(X_train, y_train_senti, X_test, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Random Forest Classifier
Training Accuracy :  0.5638271999529756
Testing Accuracy :  0.548413314967261
[[1257   16    0    0   26    0    5]
 [ 154   95    0    0   41    0   20]
 [  56    3    3    0    8    0    2]
 [ 177    5    0    6    9    0    7]
 [ 253    9    1    0  175    0   16]
 [  63    5    0    0    6    1    5]
 [ 223   19    0    1   55    0   46]]
F1-Score :  0.571893063583815
Time Taken : 93.20202088356018

FOR SENTIMENTS (3): 
Random Forest Classifier
Training Accuracy :  0.6171286040319359
Testing Accuracy :  0.6072907214341206
[[1193   39   72]
 [ 249  213  150]
 [ 460   62  330]]
F1-Score :  0.6271676300578035
Time Taken : 79.05316805839539


In [32]:
def sv_classifier(X_train, y_train, X_test, y_test):
    start = time.time()
    print("Support Vector Classifier")
    clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    accuracy_train_data = cross_val_score(clf, X_train,y_train,cv=5)
    print("Training Accuracy : ",np.mean(accuracy_train_data))
    
    # predicting test set results
    y_pred = clf.predict(X_test)
    accuracy_test_data = cross_val_score(clf, X_test,y_test,cv=5)
    print("Testing Accuracy : ",np.mean(accuracy_test_data))
    # making the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("F1-Score : ",f1)
    print(cm)
    print("Time Taken :", (time.time()-start))
    return np.mean(accuracy_train_data), np.std(accuracy_train_data), np.mean(accuracy_test_data), np.std(accuracy_test_data), f1, time.time()-start

print("FOR EMOTIONS (7): ")
tup = sv_classifier(X_train, y_train_emo, X_test, y_test_emo)
d = np.asarray(tup)
mat = np.append(mat,d)
print("\nFOR SENTIMENTS (3): ")
tup = sv_classifier(X_train, y_train_senti, X_test, y_test_senti)
d = np.asarray(tup)
mat1 = np.append(mat1,d)

FOR EMOTIONS (7): 
Support Vector Classifier
Training Accuracy :  0.5904780856815949
Testing Accuracy :  0.5567172168872119
F1-Score :  0.5863439306358381
[[1199   16    0    1   36    0   21]
 [ 120  130    0    0   39    0   24]
 [  54    5    1    0    5    0   14]
 [ 176   10    0   10   10    0   15]
 [ 239   22    0    0  199    0   37]
 [  46    7    0    0    3    1    8]
 [ 157   28    0    2   50    0   83]]
Time Taken : 231.63975143432617

FOR SENTIMENTS (3): 
Support Vector Classifier
Training Accuracy :  0.6527238037857107
Testing Accuracy :  0.6062116058780136
F1-Score :  0.6488439306358381
[[1133   35  105]
 [ 238  259  146]
 [ 375   73  404]]
Time Taken : 193.7278118133545
