In [2]:
#import libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.ensemble import StackingClassifier
from sklearn.metrics.cluster import contingency_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [3]:
#open the data
train = pd.read_csv('EmotionTrain.csv', encoding = 'utf-8')
test = pd.read_csv('EmotionTest.csv', encoding = 'utf-8')

In [4]:
#retrieve the input and target
with open('EmotionTrainText.pickle', 'rb') as f:
    X_train = pickle.load(f)
y_train = train['Emotions']
with open('EmotionTestText.pickle', 'rb') as f:
    X_test = pickle.load(f)
y_test = test['Emotions']

In [5]:
#open the lexicon
with open('emotionScores.pickle', 'rb') as f:
    scores = pickle.load(f)

In [6]:
#count maximum number of words in X_train
count = []
for i in X_train.index:
    count.append(len(X_train[i]))
maximum = max(count)

In [7]:
#function to transform the text into a vector of size 3: [Positive, Negative, Objective]
def transform(x):
    score = []
    for i in x.index:
        s = np.zeros((maximum, 5))
        for j in range(len(x[i])):
            if x[i][j] in scores['Armenian'].values:
                ind = np.where(scores['Armenian'] == x[i][j])[0]
                afraid, angry, happy, inspired, sad = 0, 0, 0, 0, 0
                for k in range(len(ind)):
                    afraid += scores['Afraid'][ind[k]]
                    angry += scores['Angry'][ind[k]]
                    happy += scores['Happy'][ind[k]]
                    inspired += scores['Inspired'][ind[k]]
                    sad += scores['Sad'][ind[k]]
                s[j][0] = afraid / len(ind)
                s[j][1] = angry / len(ind)
                s[j][2] = happy / len(ind)
                s[j][3] = inspired / len(ind)
                s[j][4] = sad / len(ind)
        score.append(s)
    return score

In [8]:
train_score = transform(X_train)
test_score = transform(X_test)

In [9]:
#open Armenian glove
words = []
embedding_index = {}

with open(r'glove.hy.300.txt', 'r', encoding = 'utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        words.append(word)
        coefs = np.asarray(values[1:], 'float32')
        embedding_index[word] = coefs

In [10]:
t = Tokenizer()
t.fit_on_texts(words)
vocab_size = len(t.word_index) + 1
encoded_docs1 = t.texts_to_sequences(X_train)
encoded_X_train = pad_sequences(encoded_docs1, maxlen = maximum, padding = 'post')
encoded_docs2 = t.texts_to_sequences(X_test)
encoded_X_test = pad_sequences(encoded_docs2, maxlen = maximum, padding = 'post')

In [11]:
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None and len(embedding_vector) == 300:
        embedding_matrix[i][:300] = embedding_vector

In [12]:
training_matrix_1 = np.zeros((len(X_train), maximum, 300))
training_matrix_2 = np.zeros((len(X_train), maximum, 5))
testing_matrix_1 = np.zeros((len(X_test), maximum, 300))
testing_matrix_2 = np.zeros((len(X_test), maximum, 5))

for i in range(len(X_train)):
    for j in range(maximum):
        training_matrix_1[i][j] = embedding_matrix[encoded_X_train[i][j]]
    for k in range(maximum):
        training_matrix_2[i][k] = train_score[i][k]
train_1 = training_matrix_1.reshape(len(X_train), maximum * 300)
train_2 = training_matrix_2.reshape(len(X_train), maximum * 5)
train = np.concatenate((train_1, train_2), axis = 1)

for i in range(len(X_test)):
    for j in range(maximum):
        testing_matrix_1[i][j] = embedding_matrix[encoded_X_test[i][j]]
    for k in range(maximum):
        testing_matrix_2[i][k] = test_score[i][k]
test_1 = testing_matrix_1.reshape(len(X_test), maximum * 300)
test_2 = testing_matrix_2.reshape(len(X_test), maximum * 5)
test = np.concatenate((test_1, test_2), axis = 1)

In [13]:
def search_embedding(clf, parameters):
    classifier = RandomizedSearchCV(clf, parameters, cv = 5)
    clf_search = classifier.fit(train[:, :maximum * 300], list(y_train))
    return clf_search.best_estimator_

In [14]:
def search_emotion(clf, parameters):
    classifier = RandomizedSearchCV(clf, parameters, cv = 5)
    clf_search = classifier.fit(train[:, maximum * 300:], list(y_train))
    return clf_search.best_estimator_

In [15]:
training = pd.DataFrame(train, index = y_train.index)

In [16]:
def train_cv_embedding(clf):
    kf = StratifiedKFold(10)
    for m, _ in kf.split(training[range(maximum * 300)], y_train):
        clf.fit(training[range(maximum * 300)].loc[training[range(maximum * 300)].index.intersection(m)], y_train.loc[y_train.index.intersection(m)])
    y = clf.predict(test[:, range(maximum * 300)])
    print(y)
    print('Accuracy:', accuracy_score(y_test, y))
    print('F-measure:', f1_score(y_test, y, average = None))
    print('Recall:', recall_score(y_test, y, average = None))
    print('Precision:', precision_score(y_test, y, average = None))
    print('Contingency:', contingency_matrix(y_test, y))
    return clf

In [17]:
def train_cv_emotion(clf):
    kf = StratifiedKFold(10)
    for m, _ in kf.split(training[range(maximum * 300, maximum * 305)], y_train):
        clf.fit(training[range(maximum * 300, maximum * 305)].loc[training[range(maximum * 300, maximum * 305)].index.intersection(m)], y_train.loc[y_train.index.intersection(m)])
    y = clf.predict(test[:, range(maximum * 300, maximum * 305)])
    print(y)
    print('Accuracy:', accuracy_score(y_test, y))
    print('F-measure:', f1_score(y_test, y, average = None))
    print('Recall:', recall_score(y_test, y, average = None))
    print('Precision:', precision_score(y_test, y, average = None))
    print('Contingency:', contingency_matrix(y_test, y))
    return clf

In [18]:
sv_parameters = {'tol': [0.0001, 0.001, 0.01, 0.1, 1],
                 'C': np.logspace(-4, 4, 20),
                 'kernel': ['poly', 'rbf', 'sigmoid']}
lr_parameters = {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                 'C': np.logspace(-4, 4, 20),
                 'tol': np.linspace(0, 10, 100)}
sv_embedding = SVC()
lr_emotion = LogisticRegression()

In [19]:
sv_embedding_best = search_embedding(sv_embedding, sv_parameters)
print(sv_embedding_best)

SVC(C=3792.690190732246, tol=0.0001)


In [20]:
lr_emotion_best = search_emotion(lr_emotion, lr_parameters)
print(lr_emotion_best)

LogisticRegression(C=4.281332398719396, tol=5.555555555555555)


35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------------------

In [21]:
sv_embedding_trained = train_cv_embedding(sv_embedding_best)

['joy' 'joy' 'anger' 'joy' 'joy' 'anger' 'anger' 'fear' 'anger' 'sadness'
 'joy' 'joy' 'joy' 'joy' 'anger' 'joy' 'anger' 'joy' 'joy' 'anger' 'anger'
 'anger' 'sadness' 'anger' 'joy' 'joy' 'anger' 'anger' 'anger' 'anger'
 'anger' 'anger' 'joy' 'joy' 'anger' 'joy' 'sadness' 'anger' 'sadness'
 'joy' 'sadness' 'joy' 'joy' 'anger' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy'
 'anger' 'joy' 'fear' 'joy' 'surprise' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy'
 'joy' 'anger' 'anger' 'anger' 'joy' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy'
 'joy' 'anger' 'joy' 'anger' 'joy' 'anger' 'anger' 'anger' 'anger' 'joy'
 'joy' 'anger' 'fear' 'anger' 'joy' 'joy' 'anger' 'anger' 'joy' 'joy'
 'joy' 'joy' 'joy' 'anger' 'joy' 'joy' 'anger' 'fear' 'joy' 'anger' 'joy'
 'joy' 'joy' 'joy' 'anger' 'anger' 'joy' 'anger' 'anger' 'joy' 'joy' 'joy'
 'joy' 'anger' 'sadness' 'joy' 'anger' 'anger' 'joy' 'sadness' 'joy' 'joy'
 'anger' 'joy' 'sadness' 'joy' 'joy' 'joy' 'joy' 'joy' 'anger' 'anger'
 'anger' 'fear' 'joy' 'anger' 'anger' 'joy' 'a

In [22]:
lr_emotion_trained = train_cv_emotion(lr_emotion_best)

['anger' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy'
 'joy' 'anger' 'joy' 'joy' 'anger' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy'
 'joy' 'joy' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy'
 'anger' 'anger' 'anger' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy'
 'anger' 'joy' 'joy' 'joy' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy' 'joy'
 'joy' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy'
 'joy' 'anger' 'joy' 'anger' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy'
 'joy' 'joy' 'joy' 'anger' 'joy' 'anger' 'joy' 'joy' 'joy' 'joy' 'joy'
 'joy' 'anger' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy'
 'joy' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy'
 'joy' 'anger' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'anger'
 'joy' 'joy' 'joy' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy' 'anger' 'joy'
 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'anger' 'joy' 'joy' 'joy'
 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy' 'joy

  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
estimators = [('sv', sv_embedding_trained), ('lr', lr_emotion_trained)]
final = StackingClassifier(estimators)
final.fit(training, y_train)
y_pred = final.predict(test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [24]:
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average = None)
rec = recall_score(y_test, y_pred, average = None)
pre = precision_score(y_test, y_pred, average = None)
cont = contingency_matrix(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
print('Accuracy:', acc)
print('F-measure:', f1)
print('Recall:', rec)
print('Precision:', pre)
print('Contingency:', cont)

Accuracy: 0.6353790613718412
F-measure: [0.56291391 0.24242424 0.7819063  0.29565217 0.        ]
Recall: [0.63909774 0.18604651 0.82312925 0.21794872 0.        ]
Precision: [0.50295858 0.34782609 0.74461538 0.45945946 0.        ]
Contingency: [[ 85   5  36   7]
 [ 23   8   7   5]
 [ 40   4 242   8]
 [ 19   6  36  17]
 [  2   0   4   0]]


In [26]:
y_test.values

array(['anger', 'joy', 'anger', 'joy', 'fear', 'anger', 'anger', 'anger',
       'anger', 'sadness', 'sadness', 'sadness', 'joy', 'joy', 'anger',
       'joy', 'sadness', 'joy', 'joy', 'anger', 'anger', 'sadness', 'joy',
       'sadness', 'anger', 'joy', 'sadness', 'anger', 'joy', 'sadness',
       'joy', 'fear', 'joy', 'joy', 'joy', 'sadness', 'joy', 'joy',
       'sadness', 'joy', 'anger', 'joy', 'joy', 'anger', 'sadness', 'joy',
       'anger', 'sadness', 'joy', 'joy', 'anger', 'joy', 'fear', 'joy',
       'surprise', 'joy', 'joy', 'anger', 'joy', 'joy', 'joy', 'fear',
       'joy', 'joy', 'surprise', 'joy', 'joy', 'joy', 'fear', 'joy',
       'fear', 'joy', 'sadness', 'joy', 'anger', 'anger', 'joy', 'anger',
       'joy', 'joy', 'fear', 'joy', 'surprise', 'anger', 'anger', 'anger',
       'joy', 'joy', 'surprise', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy',
       'fear', 'joy', 'sadness', 'joy', 'joy', 'joy', 'anger', 'joy',
       'anger', 'joy', 'fear', 'anger', 'fear', 'anger', '

In [27]:
y_pred

array(['joy', 'joy', 'anger', 'joy', 'anger', 'anger', 'anger', 'fear',
       'anger', 'sadness', 'joy', 'joy', 'joy', 'joy', 'anger', 'joy',
       'anger', 'joy', 'joy', 'anger', 'anger', 'anger', 'joy', 'anger',
       'joy', 'joy', 'anger', 'fear', 'anger', 'sadness', 'anger',
       'sadness', 'joy', 'joy', 'anger', 'anger', 'sadness', 'anger',
       'joy', 'joy', 'sadness', 'joy', 'joy', 'anger', 'joy', 'joy',
       'anger', 'joy', 'joy', 'joy', 'anger', 'joy', 'fear', 'joy', 'joy',
       'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'anger', 'joy', 'joy',
       'anger', 'joy', 'joy', 'joy', 'anger', 'joy', 'joy', 'joy', 'joy',
       'anger', 'anger', 'anger', 'joy', 'anger', 'joy', 'anger', 'anger',
       'joy', 'joy', 'anger', 'joy', 'anger', 'joy', 'anger', 'joy',
       'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'fear', 'joy', 'joy',
       'anger', 'joy', 'joy', 'anger', 'joy', 'anger', 'joy', 'joy',
       'anger', 'anger', 'anger', 'sadness', 'anger', 'joy', 'joy', 'joy'