In [1]:
#import libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.ensemble import StackingClassifier
from sklearn.metrics.cluster import contingency_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
#open the data
train = pd.read_csv('SentimentTrain_B.csv', encoding = 'utf-8')
test = pd.read_csv('SentimentTest_B.csv', encoding = 'utf-8')

In [3]:
#retrieve the input and target
with open('SentimentTrainTextB.pickle', 'rb') as f:
    X_train = pickle.load(f)
y_train = train['Column3']
with open('SentimentTestTextB.pickle', 'rb') as f:
    X_test = pickle.load(f)
y_test = test['Column3']

In [4]:
#open the lexicons
positives = pd.read_csv('positive_words_hy.txt', header = None)
negatives = pd.read_csv('negative_words_hy.txt', header = None)

In [5]:
positives = positives.rename(columns = {0: 'Lemma'})
negatives = negatives.rename(columns = {0: 'Lemma'})

In [6]:
pos_positive = []
pos_negative = []
for i in range(len(positives)):
    pos_positive.append(1)
    pos_negative.append(0)
positives['Positive'] = pos_positive
positives['Negative'] = pos_negative

neg_positive = []
neg_negative = []
for i in range(len(negatives)):
    neg_positive.append(0)
    neg_negative.append(1)
negatives['Positive'] = neg_positive
negatives['Negative'] = neg_negative

In [7]:
scores = pd.concat([positives, negatives], ignore_index = True)
scores.head()

Unnamed: 0,Lemma,Positive,Negative
0,առաջին,1,0
1,մեծ,1,0
2,ավելի,1,0
3,ինչպես,1,0
4,հայտնի,1,0


In [8]:
#count maximum number of words in X_train
count = []
for i in X_train.index:
    count.append(len(X_train[i]))
maximum = max(count)

In [9]:
#function to transform the text into a vector of size 2: [Positive, Negative]
def transform(x):
    score = []
    for i in x.index:
        s = np.zeros((maximum, 2))
        for j in range(len(x[i])):
            if x[i][j] in scores['Lemma'].values:
                ind = np.where(scores['Lemma'] == x[i][j])[0]
                pos, neg = 0, 0
                for k in range(len(ind)):
                    pos += scores['Positive'][ind[k]]
                    neg += scores['Negative'][ind[k]]
                s[j][0] = pos / len(ind)
                s[j][1] = neg / len(ind)
        score.append(s)
    return score

In [10]:
train_score = transform(X_train)
test_score = transform(X_test)

In [11]:
#open Armenian glove
words = []
embedding_index = {}

with open(r'glove.hy.300.txt', 'r', encoding = 'utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        words.append(word)
        coefs = np.asarray(values[1:], 'float32')
        embedding_index[word] = coefs

In [12]:
t = Tokenizer()
t.fit_on_texts(words)
vocab_size = len(t.word_index) + 1
encoded_docs1 = t.texts_to_sequences(X_train)
encoded_X_train = pad_sequences(encoded_docs1, maxlen = maximum, padding = 'post')
encoded_docs2 = t.texts_to_sequences(X_test)
encoded_X_test = pad_sequences(encoded_docs2, maxlen = maximum, padding = 'post')

In [13]:
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None and len(embedding_vector) == 300:
        embedding_matrix[i][:300] = embedding_vector

In [14]:
training_matrix_1 = np.zeros((len(X_train), maximum, 300))
training_matrix_2 = np.zeros((len(X_train), maximum, 2))
testing_matrix_1 = np.zeros((len(X_test), maximum, 300))
testing_matrix_2 = np.zeros((len(X_test), maximum, 2))

for i in range(len(X_train)):
    for j in range(maximum):
        training_matrix_1[i][j] = embedding_matrix[encoded_X_train[i][j]]
    for k in range(maximum):
        training_matrix_2[i][k] = train_score[i][k]
train_1 = training_matrix_1.reshape(len(X_train), maximum * 300)
train_2 = training_matrix_2.reshape(len(X_train), maximum * 2)
train = np.concatenate((train_1, train_2), axis = 1)

for i in range(len(X_test)):
    for j in range(maximum):
        testing_matrix_1[i][j] = embedding_matrix[encoded_X_test[i][j]]
    for k in range(maximum):
        testing_matrix_2[i][k] = test_score[i][k]
test_1 = testing_matrix_1.reshape(len(X_test), maximum * 300)
test_2 = testing_matrix_2.reshape(len(X_test), maximum * 2)
test = np.concatenate((test_1, test_2), axis = 1)

In [15]:
def search_embedding(clf, parameters):
    classifier = RandomizedSearchCV(clf, parameters, cv = 5)
    clf_search = classifier.fit(train[:, :maximum * 300], list(y_train))
    return clf_search.best_estimator_

In [16]:
def search_sentiment(clf, parameters):
    classifier = RandomizedSearchCV(clf, parameters, cv = 5)
    clf_search = classifier.fit(train[:, maximum * 300:], list(y_train))
    return clf_search.best_estimator_

In [17]:
training = pd.DataFrame(train, index = y_train.index)

In [18]:
def train_cv_embedding(clf):
    kf = StratifiedKFold(10)
    for m, _ in kf.split(training[range(maximum * 300)], y_train):
        clf.fit(training[range(maximum * 300)].loc[training[range(maximum * 300)].index.intersection(m)], y_train.loc[y_train.index.intersection(m)])
    y = clf.predict(test[:, range(maximum * 300)])
    print(y)
    print('Accuracy:', accuracy_score(y_test, y))
    print('F-measure:', f1_score(y_test, y, pos_label = 'positive'))
    print('Recall:', recall_score(y_test, y, pos_label = 'positive'))
    print('Precision:', precision_score(y_test, y, pos_label = 'positive'))
    print('Contingency:', contingency_matrix(y_test, y))
    return clf

In [19]:
def train_cv_sentiment(clf):
    kf = StratifiedKFold(10)
    for m, _ in kf.split(training[range(maximum * 300, maximum * 302)], y_train):
        clf.fit(training[range(maximum * 300, maximum * 302)].loc[training[range(maximum * 300, maximum * 302)].index.intersection(m)], y_train.loc[y_train.index.intersection(m)])
    y = clf.predict(test[:, range(maximum * 300, maximum * 302)])
    print(y)
    print('Accuracy:', accuracy_score(y_test, y))
    print('F-measure:', f1_score(y_test, y, pos_label = 'positive'))
    print('Recall:', recall_score(y_test, y, pos_label = 'positive'))
    print('Precision:', precision_score(y_test, y, pos_label = 'positive'))
    print('Contingency:', contingency_matrix(y_test, y))
    return clf

In [20]:
#sv_parameters = {'tol': [0.0001, 0.001, 0.01, 0.1, 1],
#                 'C': np.logspace(-4, 4, 20),
#                 'kernel': ['poly', 'rbf', 'sigmoid']}
lr_parameters = {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                 'C': np.logspace(-4, 4, 20),
                 'tol': np.linspace(0, 10, 100)}
sv_embedding = SVC(C=545.5594781168514, tol=0.1)
lr_sentiment = LogisticRegression()
sv_embedding_best = sv_embedding
lr_sentiment_best = search_sentiment(lr_sentiment, lr_parameters)
print(sv_embedding_best)
print(lr_sentiment_best)

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.

SVC(C=545.5594781168514, tol=0.1)
LogisticRegression(C=206.913808111479, penalty='none', tol=2.525252525252525)


In [21]:
sv_embedding_trained = train_cv_embedding(sv_embedding)

['positive' 'negative' 'positive' ... 'positive' 'positive' 'positive']
Accuracy: 0.7755102040816326
F-measure: 0.8674698795180724
Recall: 0.9550173010380623
Precision: 0.7946257197696737
Contingency: [[ 46 214]
 [ 39 828]]


In [22]:
lr_sentiment_trained = train_cv_sentiment(lr_sentiment)

['positive' 'positive' 'positive' ... 'positive' 'positive' 'positive']
Accuracy: 0.7630878438331854
F-measure: 0.8638449770525242
Recall: 0.9769319492502884
Precision: 0.7742230347349177
Contingency: [[ 13 247]
 [ 20 847]]


In [23]:
estimators = [('sv', sv_embedding_trained), ('lr', lr_sentiment_trained)]
final = StackingClassifier(estimators)
final.fit(training, y_train)
y_pred = final.predict(test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [24]:
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label = 'positive')
rec = recall_score(y_test, y_pred, pos_label = 'positive')
pre = precision_score(y_test, y_pred, pos_label = 'positive')
cont = contingency_matrix(y_test, y_pred)

In [25]:
print('Accuracy:', acc)
print('F-measure:', f1)
print('Recall:', rec)
print('Precision:', pre)
print('Contingency:', cont)

Accuracy: 0.7728482697426797
F-measure: 0.8674948240165631
Recall: 0.9665513264129181
Precision: 0.7868544600938967
Contingency: [[ 33 227]
 [ 29 838]]


In [26]:
list(y_test.values)

['negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',

In [27]:
list(y_pred)

['positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',