# Multiclass Semi Supervised SVM

This implement QNS3VM by Giseke https://github.com/NekoYIQI/QNS3VM/blob/master/qns3vm.py

for a 3 classes problem using a one vs rest approach

In order for the model to work the class labels are as follows:

- -1 : __unlabelled__
- 0 : __neutral__
- 1 : __positive__
- 2 : __negative__

In [87]:
import pandas as pd
import numpy as np
import string
import nltk
import spacy
import warnings
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)  
#warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [88]:
np.random.seed(123)

In [89]:
# df = pd.read_json('Data.json')
df = pd.read_json('Data_rescored.json')
df.loc[df.SENTIMENT==-0.5,'SENTIMENT']=-1
df.loc[df.SENTIMENT==0.5,'SENTIMENT']=1
df.loc[(df.SENTIMENT==-1), 'SENTIMENT'] = 2
df.loc[pd.isna(df.SENTIMENT), 'SENTIMENT'] = -1
df.count()

ARTICLE      1244
BODY         1244
DATE         1244
HEADLINE     1244
SENTIMENT    1244
dtype: int64

In [90]:
unlab = df[df.SENTIMENT==-1]
lab = df[df.SENTIMENT!=-1]

Split the dataset into training and testing

In [91]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(lab, test_size=0.202, stratify=lab.SENTIMENT, random_state=123)
print('Number of training Unlabelled instances' + ' ' + str(unlab.BODY.count()) )
print('Number of training Labelled instances' + ' ' + str(train.BODY.count()) )
print('Number of test Labelled instances' + ' ' +  str(test.BODY.count()) )
train = pd.concat([train, unlab])
LU_ratio = train.BODY.count()/unlab.BODY.count() 

Number of training Unlabelled instances 901
Number of training Labelled instances 273
Number of test Labelled instances 70


## Scoring Metrics

In [92]:
from preprocessing import PREPROCESSING
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score

In [93]:
def scores(Y_test, yhat ):
    acc = round(accuracy_score(Y_test, yhat) , 4)
    f = round(f1_score(Y_test, yhat, pos_label=1, average="macro", zero_division=0) , 4)
    rec = round(recall_score(Y_test, yhat, pos_label=1, average="macro", zero_division=0) , 4)
    prec = round(precision_score(Y_test, yhat, pos_label=1, average="macro", zero_division=0) , 4)
    acc2 = round(balanced_accuracy_score(Y_test, yhat) , 4)
    scoring_temp = np.array([[acc, f, rec, prec,acc2]])
    cols = ['Accuracy', 'F-Score', 'Recall', 'Precision', 'Balanced Accuracy']
    scoring= pd.DataFrame(scoring_temp, columns=cols)
    return scoring

## Function to fit the model

Loading the model based on QNS3VM

In [94]:
from OnevsRest import ThreeClass_S3VM

In [95]:
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import vstack
from scipy.sparse import csc_matrix

In [10]:
skf = StratifiedKFold(n_splits=3) 

In [11]:
def gridCV_S3VM(X_lab, X_unlab, Y_lab, lam_range = None, lamU_range = None, sigma_range=None, Sparse = True, k='Linear' ):
    
        CROSS_VAL_RES = []
        for lam in lam_range:
            for lamU in lamU_range:

                for train_index, test_index in skf.split(X_lab, Y_lab):
                          # creating train and test among labelled instance
                    X_train_l,  X_test = X_lab[train_index], X_lab[test_index]   
                    y_train_l, y_test = Y_lab[train_index], Y_lab[test_index]

                    if Sparse:
                        X_train = vstack([X_train_l, X_unlab])   # adding the unlabelled instances
                        X_train = csc_matrix(X_train)
                        y_train = np.concatenate((y_train_l, Y_cross_unlab), axis=None)
                    else:
                        X_train = np.vstack([X_train_l, X_cross_unlab])   # adding the unlabelled instances
                        y_train = np.concatenate((y_train_l, Y_cross_unlab), axis=None)

                    if k=='RBF':
                        assert sigma_range!=None
                        for sigma in sigma_range:
                            model = ThreeClass_S3VM(lamU=lamU, lam=lam, sigma=sigma,kernel='RBF')
                            yhat = model.fit_predict(X_train, y_train, X_test)
                            score = balanced_accuracy_score(y_test, yhat['Test Predictions'])
                            CROSS_VAL_RES.append([lam, lamU, sigma, score])

                    else:
                        model= ThreeClass_S3VM(lamU=lamU, lam=lam)
                        yhat = model.fit_predict(X_train, y_train, X_test)
                        score = balanced_accuracy_score(y_test, yhat['Test Predictions'])
                        CROSS_VAL_RES.append([lam, lamU, score])

            if k=='RBF':           
                final = pd.DataFrame(CROSS_VAL_RES, columns=['Lam', 'LamU', 'Sigma', 'Score'])
            else:
                final = pd.DataFrame(CROSS_VAL_RES, columns=['Lam', 'LamU', 'Score'])

        return final

In [72]:
lams =[0.01, 0.1, 1]#, 10, 100]
lamUs = [0.01, 0.1, 1]#, 10, 100]
sigmas = [ 1 ]

## BoW approach

For BoW vectorization model we save the output as sparse matrix

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

### CountVectorizer

In [14]:
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
count_vect = CountVectorizer(tokenizer=PREPROCESSING.process_stop, min_df=0.01, ngram_range= (1,3), max_features=1000)
X_train_cv = count_vect.fit_transform(train.ARTICLE)
X_test_cv = count_vect.transform(test.ARTICLE)

Adjusting the data format

In [15]:
X_train_cv = X_train_cv.tocsc()
X_test_cv = X_test_cv.tocsc()
Y_train = train.SENTIMENT.tolist()
Y_test = np.array(test.SENTIMENT.values)
Y_test = Y_test.astype('int')

Selecting the model according to cross validation scores

In [16]:
Y_cross = np.array(Y_train) 

In [17]:
# Adjusting the data for cross validation
idx_lab = np.where(Y_cross!=-1)
idx_unlab =  np.where(Y_cross==-1)

Y_cross_lab = Y_cross[idx_lab]
Y_cross_unlab = Y_cross[idx_unlab]

X_cross_lab = X_train_cv[idx_lab]
X_cross_unlab = X_train_cv[idx_unlab]

In [18]:
cv_CV = gridCV_S3VM(X_cross_lab, X_cross_unlab, Y_cross_lab, lam_range = lams, lamU_range = lamUs)

In [19]:
cv_results = cv_CV.groupby(['Lam','LamU'], as_index=False).agg({'Score':['mean','std']})
cv_ordered = cv_results.sort_values(by=[('Score', 'mean')], ascending=False)
cv_ordered.head(10)

Unnamed: 0_level_0,Lam,LamU,Score,Score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std
12,1.0,1.0,0.543401,0.065399
11,1.0,0.1,0.536049,0.018321
5,0.1,0.01,0.535866,0.011962
0,0.01,0.01,0.526126,0.031172
7,0.1,1.0,0.518774,0.046278
13,1.0,10.0,0.514091,0.046092
14,1.0,100.0,0.511807,0.063054
10,1.0,0.01,0.511655,0.033374
6,0.1,0.1,0.51112,0.037111
1,0.01,0.1,0.486219,0.041222


Applying the chosen model

In [20]:
lam_opt = cv_ordered.loc[0,'Lam']
lamU_opt = cv_ordered.loc[0,'LamU']

In [21]:
model_cv = ThreeClass_S3VM(lamU=lamU_opt , lam=lam_opt)
yhat_cv = model_cv.fit_predict(X_train_cv, Y_train, X_test_cv)

Training and Testing Confusion Matrix

In [22]:
pd.crosstab(np.array(Y_train), yhat_cv['Train Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,1.0,2.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1.0,449,186,266,901
0.0,193,3,0,196
1.0,5,30,1,36
2.0,0,0,41,41
All,647,219,308,1174


In [107]:
pd.crosstab(Y_test, yhat_cv['Test Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

True,0,1,2,All
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,31,2,2,35
1.0,10,7,1,18
2.0,9,0,8,17
All,50,9,11,70


In [24]:
print(classification_report(Y_test, yhat_cv['Test Predictions'], digits = 3))
print('balanced accuracy', round(balanced_accuracy_score(Y_test, yhat_cv['Test Predictions']), 3))

              precision    recall  f1-score   support

           0      0.886     0.620     0.729        50
           1      0.389     0.778     0.519         9
           2      0.471     0.727     0.571        11

    accuracy                          0.657        70
   macro avg      0.582     0.708     0.606        70
weighted avg      0.757     0.657     0.677        70

balanced accuracy 0.708


### TF-IDF 

#### Linear S3VM

In [71]:
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
tfidf_vect = CountVectorizer(tokenizer=PREPROCESSING.process_stop, min_df=0.01, ngram_range= (1,3), max_features=1000)
X_train_tfidf = tfidf_vect.fit_transform(train.BODY)
X_test_tfidf = tfidf_vect.transform(test.BODY)

In [26]:
X_train_tfidf = X_train_tfidf.tocsc()
X_test_tfidf = X_test_tfidf.tocsc()
Y_train = train.SENTIMENT.tolist()
Y_test = np.array(test.SENTIMENT.values)
Y_test = Y_test.astype('int')

In [27]:
Y_cross = np.array(Y_train) 

In [28]:
idx_lab = np.where(Y_cross!=-1)
idx_unlab =  np.where(Y_cross==-1)

Y_cross_lab = Y_cross[idx_lab]
Y_cross_unlab = Y_cross[idx_unlab]

X_cross_lab = X_train_tfidf[idx_lab]
X_cross_unlab = X_train_tfidf[idx_unlab]

In [29]:
tfidf_CV_lin = gridCV_S3VM(X_cross_lab, X_cross_unlab, Y_cross_lab, lams, lamUs)

In [30]:
tfidf_results_lin = tfidf_CV_lin.groupby(['Lam','LamU'], as_index=False).agg({'Score':['mean','std']})
tfidf_ordered_lin = tfidf_results_lin.sort_values(by=[('Score', 'mean')], ascending=False)
tfidf_ordered_lin.head(10)

Unnamed: 0_level_0,Lam,LamU,Score,Score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std
11,1.0,0.1,0.532656,0.015495
12,1.0,1.0,0.527752,0.018957
5,0.1,0.01,0.526892,0.038703
7,0.1,1.0,0.519812,0.041878
14,1.0,100.0,0.519434,0.044848
10,1.0,0.01,0.517368,0.058064
13,1.0,10.0,0.515666,0.030889
6,0.1,0.1,0.510021,0.04897
2,0.01,1.0,0.49601,0.056752
9,0.1,100.0,0.489094,0.019634


In [31]:
lam_opt_lin = tfidf_ordered_lin.loc[0,'Lam']
lamU_opt_lin = tfidf_ordered_lin.loc[0,'LamU']

In [32]:
model_tfidf_lin = ThreeClass_S3VM(lamU=lamU_opt_lin , lam=lam_opt_lin)
yhat_tfidf_lin = model_tfidf_lin.fit_predict(X_train_tfidf, Y_train, X_test_tfidf)

Training and Testing Confusion Matrix

In [33]:
pd.crosstab(np.array(Y_train), yhat_tfidf_lin['Train Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,1.0,2.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1.0,467,172,262,901
0.0,192,1,3,196
1.0,3,33,0,36
2.0,1,0,40,41
All,663,206,305,1174


In [106]:
pd.crosstab(Y_test, yhat_tfidf_lin['Test Predictions'], rownames=['True'], colnames=['Predicted'], margins=True).T

True,0,1,2,All
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,36,3,2,41
1.0,6,6,3,15
2.0,8,0,6,14
All,50,9,11,70


In [108]:
print(classification_report(Y_test, yhat_tfidf_lin['Test Predictions'],zero_division=0, digits = 3))
print('balanced accuracy', round(balanced_accuracy_score(Y_test, yhat_tfidf_lin['Test Predictions']),3))

              precision    recall  f1-score   support

           0      0.878     0.720     0.791        50
           1      0.400     0.667     0.500         9
           2      0.429     0.545     0.480        11

    accuracy                          0.686        70
   macro avg      0.569     0.644     0.590        70
weighted avg      0.746     0.686     0.705        70

balanced accuracy 0.644


#### RBF S3VM

In [97]:
tfidf_vect = TfidfVectorizer(tokenizer=PREPROCESSING.process_lemmatizer, ngram_range=(1,2), min_df=0.01, max_features=500)
X_train_tfidf = tfidf_vect.fit_transform(train.ARTICLE)
X_test_tfidf = tfidf_vect.transform(test.ARTICLE)

In [98]:
X_train_tfidf = X_train_tfidf.toarray()
X_test_tfidf = X_test_tfidf.toarray()
Y_train = train.SENTIMENT.tolist()
Y_test = np.array(test.SENTIMENT.values)
Y_test = Y_test.astype('int')

In [99]:
Y_cross = np.array(Y_train) 

In [100]:
idx_lab = np.where(Y_cross!=-1)
idx_unlab =  np.where(Y_cross==-1)

Y_cross_lab = Y_cross[idx_lab]
Y_cross_unlab = Y_cross[idx_unlab]

X_cross_lab = X_train_tfidf[idx_lab]
X_cross_unlab = X_train_tfidf[idx_unlab]

In [101]:
tfidf_CV_rbf = gridCV_S3VM(X_cross_lab, X_cross_unlab, Y_cross_lab, lams, lamUs, sigmas, k='RBF', Sparse=False)

In [102]:
tfidf_results_rbf = tfidf_CV_rbf.groupby(['Lam','LamU', 'Sigma'], as_index=False).agg({'Score':['mean','std']})
tfidf_ordered_rbf = tfidf_results_rbf.sort_values(by=[('Score', 'mean')], ascending=False)
tfidf_ordered_rbf.head(10)

Unnamed: 0_level_0,Lam,LamU,Sigma,Score,Score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std
2,0.01,1.0,1,0.33956,0.010786
0,0.01,0.01,1,0.333333,0.0
1,0.01,0.1,1,0.333333,0.0
3,0.1,0.01,1,0.333333,0.0
4,0.1,0.1,1,0.333333,0.0
5,0.1,1.0,1,0.333333,0.0
6,1.0,0.01,1,0.333333,0.0
7,1.0,0.1,1,0.333333,0.0
8,1.0,1.0,1,0.333333,0.0


In [103]:
lam_opt_rbf = tfidf_ordered_rbf.loc[0,'Lam']
lamU_opt_rbf = tfidf_ordered_rbf.loc[0,'LamU']
sigma_opt_rbf = tfidf_ordered_rbf.loc[0,'Sigma']

In [104]:
model_tfidf_rbf = ThreeClass_S3VM(lamU=lamU_opt_rbf , lam=lam_opt_rbf, sigma= sigma_opt_rbf)
yhat_tfidf_rbf = model_tfidf_rbf.fit_predict(X_train_tfidf, Y_train, X_test_tfidf)

In [44]:
pd.crosstab(np.array(Y_train), yhat_tfidf_rbf['Train Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.0,901,901
0.0,196,196
1.0,36,36
2.0,41,41
All,1174,1174


In [45]:
pd.crosstab(Y_test, yhat_tfidf_rbf['Test Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,50,50
1,9,9
2,11,11
All,70,70


In [46]:
print(classification_report(Y_test, yhat_tfidf_rbf['Test Predictions'],zero_division=0))
print('balanced accuracy', round(balanced_accuracy_score(Y_test, yhat_tfidf_rbf['Test Predictions']),3))

              precision    recall  f1-score   support

           0       0.71      1.00      0.83        50
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00        11

    accuracy                           0.71        70
   macro avg       0.24      0.33      0.28        70
weighted avg       0.51      0.71      0.60        70

balanced accuracy 0.333


## Embeddings

### Word2Vec

In [47]:
nlp = spacy.load("en_core_web_lg")

In [48]:
X_train_w2v = []
for doc in train.BODY:
    doc = nlp(doc)
    X_train_w2v.append(doc.vector)

#pd.DataFrame(X_train_w2v).head()

In [49]:
X_test_w2v = []
for doc in test.BODY :
    doc = nlp(doc)
    X_test_w2v.append(doc.vector)

In [50]:
X_train_w2v = np.vstack(X_train_w2v)
X_test_w2v = np.vstack(X_test_w2v)
Y_train = train.SENTIMENT.tolist()
Y_test = np.array(test.SENTIMENT.values)
Y_test = Y_test.astype('int')

In [51]:
Y_cross = np.array(Y_train) 

In [52]:
idx_lab = np.where(Y_cross!=-1)
idx_unlab =  np.where(Y_cross==-1)

Y_cross_lab = Y_cross[idx_lab]
Y_cross_unlab = Y_cross[idx_unlab]

X_cross_lab = X_train_w2v[idx_lab]
X_cross_unlab = X_train_w2v[idx_unlab]

#### Linear S3VM

In [53]:
w2v_CV_lin = gridCV_S3VM(X_cross_lab, X_cross_unlab, Y_cross_lab, lams, lamUs,  Sparse=False)

In [54]:
w2v_results_lin = w2v_CV_lin.groupby(['Lam','LamU'], as_index=False).agg({'Score':['mean','std']})
w2v_ordered_lin = w2v_results_lin.sort_values(by=[('Score', 'mean')], ascending=False)
w2v_ordered_lin.head(10)

Unnamed: 0_level_0,Lam,LamU,Score,Score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std
1,0.01,0.1,0.33956,0.010786
2,0.01,1.0,0.334458,0.015426
0,0.01,0.01,0.333333,0.0
13,1.0,10.0,0.333333,0.0
23,100.0,10.0,0.333333,0.0
22,100.0,1.0,0.333333,0.0
21,100.0,0.1,0.333333,0.0
20,100.0,0.01,0.333333,0.0
19,10.0,100.0,0.333333,0.0
18,10.0,10.0,0.333333,0.0


In [55]:
lam_opt_lin = w2v_ordered_lin.loc[0,'Lam']
lamU_opt_lin = w2v_ordered_lin.loc[0,'LamU']

In [56]:
model_w2v_lin = ThreeClass_S3VM(lamU=lamU_opt_lin, lam=lam_opt_lin)
yhat_w2v_lin = model_w2v_lin.fit_predict(X_train_w2v, Y_train, X_test_w2v)

In [57]:
pd.crosstab(np.array(Y_train), yhat_w2v_lin['Train Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.0,901,901
0.0,196,196
1.0,36,36
2.0,41,41
All,1174,1174


In [58]:
pd.crosstab(Y_test, yhat_w2v_lin['Test Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,50,50
1,9,9
2,11,11
All,70,70


In [59]:
print(classification_report(Y_test, yhat_w2v_lin['Test Predictions'], zero_division=0))
print('balanced accuracy', round(balanced_accuracy_score(Y_test, yhat_w2v_lin['Test Predictions']),3))

              precision    recall  f1-score   support

           0       0.71      1.00      0.83        50
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00        11

    accuracy                           0.71        70
   macro avg       0.24      0.33      0.28        70
weighted avg       0.51      0.71      0.60        70

balanced accuracy 0.333


#### RBF S3VM

In [60]:
w2v_CV_rbf = gridCV_S3VM(X_cross_lab, X_cross_unlab, Y_cross_lab, lams, lamUs, sigmas, Sparse=False, k='RBF')

In [61]:
w2v_results_rbf = w2v_CV_rbf.groupby(['Lam','LamU', 'Sigma'], as_index=False).agg({'Score':['mean','std']})
w2v_ordered_rbf = w2v_results_rbf.sort_values(by=[('Score', 'mean')], ascending=False)
w2v_ordered_rbf.head(10)

Unnamed: 0_level_0,Lam,LamU,Sigma,Score,Score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std
1,0.01,0.1,1,0.33956,0.010786
2,0.01,1.0,1,0.334458,0.015426
0,0.01,0.01,1,0.333333,0.0
13,1.0,10.0,1,0.333333,0.0
23,100.0,10.0,1,0.333333,0.0
22,100.0,1.0,1,0.333333,0.0
21,100.0,0.1,1,0.333333,0.0
20,100.0,0.01,1,0.333333,0.0
19,10.0,100.0,1,0.333333,0.0
18,10.0,10.0,1,0.333333,0.0


In [62]:
lam_opt_rbf = w2v_ordered_rbf.loc[0,'Lam']
lamU_opt_rbf = w2v_ordered_rbf.loc[0,'LamU']
sigma_opt_rbf = w2v_ordered_rbf.loc[0,'Sigma']

In [63]:
model_w2v_rbf = ThreeClass_S3VM(lamU=lamU_opt_rbf, lam=lam_opt_rbf, sigma = sigma_opt_rbf,  kernel='RBF')
yhat_w2v_rbf = model_w2v_rbf.fit_predict(X_train_w2v, Y_train, X_test_w2v)

Training and Testing Confusion Matrices

In [64]:
pd.crosstab(np.array(Y_train), yhat_w2v_rbf['Train Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.0,901,901
0.0,196,196
1.0,36,36
2.0,41,41
All,1174,1174


In [65]:
pd.crosstab(Y_test, yhat_w2v_rbf['Test Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,50,50
1,9,9
2,11,11
All,70,70


In [66]:
print(classification_report(Y_test, yhat_w2v_rbf['Test Predictions'], zero_division=0))
print('balanced accuracy', round(balanced_accuracy_score(Y_test, yhat_w2v_rbf['Test Predictions']),3))

              precision    recall  f1-score   support

           0       0.71      1.00      0.83        50
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00        11

    accuracy                           0.71        70
   macro avg       0.24      0.33      0.28        70
weighted avg       0.51      0.71      0.60        70

balanced accuracy 0.333


## Results Tables

### Testing Results

In [67]:
cv_res = scores(Y_test, yhat_cv['Test Predictions'])
tfidf_res_lin = scores(Y_test, yhat_tfidf_lin['Test Predictions'])
tfidf_res_rbf = scores(Y_test, yhat_tfidf_rbf['Test Predictions'])
w2v_res_lin = scores(Y_test, yhat_w2v_lin['Test Predictions'])
w2v_res_rbf = scores(Y_test, yhat_w2v_rbf['Test Predictions'])

In [68]:
import tabulate

In [69]:
names=[['CountVectorizer'], ['Linear TF-IDF'], ['RBF TF-IDF'], ['Linear Word2Vec'], ['RBF Word2Vec']]
fields = ['method', 'accuracy', 'fscore', 'recall', 'precision', 'balanced_accuracy']
table=[]
table.append(names[0] + cv_res.values.tolist()[0])
table.append(names[1] + tfidf_res_lin.values.tolist()[0])
table.append(names[2] + tfidf_res_rbf.values.tolist()[0])
table.append(names[3] + w2v_res_lin.values.tolist()[0])
table.append(names[4] + w2v_res_rbf.values.tolist()[0])
table.sort(key= lambda x: x[5], reverse=True)
print(tabulate.tabulate(table, headers=fields))

method             accuracy    fscore    recall    precision    balanced_accuracy
---------------  ----------  --------  --------  -----------  -------------------
CountVectorizer      0.6571    0.6065    0.7084       0.5817               0.7084
Linear TF-IDF        0.6857    0.5904    0.644        0.5689               0.644
RBF TF-IDF           0.7143    0.2778    0.3333       0.2381               0.3333
Linear Word2Vec      0.7143    0.2778    0.3333       0.2381               0.3333
RBF Word2Vec         0.7143    0.2778    0.3333       0.2381               0.3333


In [70]:
a

NameError: name 'a' is not defined

In [None]:
pd.set_option('mode.chained_assignment', None)
train.loc[:,'SENTIMENT'] = yhat_tfidf['Train Predictions']
test.loc[:,'SENTIMENT'] = yhat_tfidf['Test Predictions']

In [None]:
predictions_semi = pd.concat([train,test])
predictions_semi.loc[predictions_semi.SENTIMENT == 2,'SENTIMENT'] = -1

In [None]:
predictions_semi.to_csv('pred_SEMI.csv')

## Transduction

In [None]:
tfidf_vect = TfidfVectorizer(tokenizer=PREPROCESSING.process_lemmatizer, ngram_range=(1,2), min_df=0.05, max_features=1000)
X = tfidf_vect.fit_transform(df.ARTICLE)
X = X.tocsc()

In [None]:
Y_train = df.SENTIMENT.tolist()

In [None]:
model_transd = ThreeClass_TSVM(lamU=0.001 , lam=0.001)
yhat_transd = model_tfidf.fit_predict(X, Y_train, X_test_tfidf)

In [None]:
pd.crosstab(np.array(Y_train), yhat_transd['Train Predictions'], rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
df.loc[:,'SENTIMENT'] = yhat_transd['Train Predictions']
pred_transd = df
pred_transd.loc[pred_transd.SENTIMENT == 2,'SENTIMENT'] = -1

In [None]:
pred_transd.to_csv('pred_TRANSD.csv')