In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from os.path import isfile

import config
import utils
import data_path
import data_loader

from classifier import Classifier
from feature_generator import generate_features

In [2]:
embedding_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'embedding_fasttext.csv'), index_col=0)
psychological_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'psychological_features.csv'), index_col=0)
structural_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'structural_features.csv'), index_col=0)
syntactics_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'syntactics_features.csv'), index_col=0)
text_dependent_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'text_dependent_features.csv'), index_col=0)
word_dependent_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'word_dependent_features.csv'), index_col=0)


In [3]:
features_list = [embedding_features, psychological_features, structural_features, syntactics_features, text_dependent_features, word_dependent_features]

features = features_list.pop()
for feature in features_list:
      features = pd.merge(features, feature, left_on=['label', 'number'], right_on=['label', 'number'])

features

Unnamed: 0,WD_F7,WD_F8,WD_F9,WD_F10,WD_F11,WD_F12,WD_F13,WD_F14,WD_F15,WD_F16,...,SYN_F22,SYN_F23,SYN_F24,TD_F1,TD_F2,TD_F49,TD_F3,TD_F4,TD_F5,TD_F6
0,55,4.454545,0.763636,0.600000,0.400000,33,12,119.008264,0.012121,0.285714,...,0.0,0.000000,0.000000,245,0.102041,25,1,0.179592,0.0,0.012245
1,18,5.111111,0.888889,0.611111,0.388889,14,4,123.456790,0.013072,0.250000,...,0.0,0.000000,0.000000,92,0.217391,20,2,0.163043,0.0,0.000000
2,53,4.433962,0.830189,0.660377,0.339623,37,10,78.319687,0.007983,0.227273,...,0.0,0.012766,0.004255,235,0.110638,26,3,0.195745,0.0,0.012766
3,20,4.000000,0.950000,0.800000,0.200000,18,2,50.000000,0.005263,0.105263,...,0.0,0.000000,0.000000,80,0.250000,20,1,0.212500,0.0,0.000000
4,143,3.867133,0.601399,0.608392,0.391608,66,16,165.289256,0.016645,0.186047,...,0.0,0.000000,0.000000,553,0.048825,27,16,0.249548,0.0,0.018083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,22,3.909091,0.909091,0.681818,0.318182,19,0,123.966942,0.012987,0.000000,...,0.0,0.000000,0.000000,86,0.209302,18,1,0.174419,0.0,0.023256
1996,9,3.444444,1.000000,0.444444,0.555556,9,0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,31,0.354839,11,1,0.193548,0.0,0.032258
1997,58,4.896552,0.810345,0.672414,0.327586,38,14,77.288942,0.007864,0.297872,...,0.0,0.000000,0.000000,284,0.098592,28,2,0.197183,0.0,0.000000
1998,63,4.015873,0.698413,0.587302,0.412698,33,14,161.249685,0.016385,0.318182,...,0.0,0.000000,0.000000,253,0.086957,22,3,0.324111,0.0,0.015810


In [4]:
embedding_features_test = pd.read_csv(os.path.join(data_path.DATA_PATH, 'test', 'embedding_fasttext.csv'), index_col=0)
psychological_features_test = pd.read_csv(os.path.join(data_path.DATA_PATH, 'test', 'psychological_features.csv'), index_col=0)
structural_features_test = pd.read_csv(os.path.join(data_path.DATA_PATH, 'test', 'structural_features.csv'), index_col=0)
syntactics_features_test = pd.read_csv(os.path.join(data_path.DATA_PATH, 'test', 'syntactics_features.csv'), index_col=0)
text_dependent_features_test = pd.read_csv(os.path.join(data_path.DATA_PATH, 'test', 'text_dependent_features.csv'), index_col=0)
word_dependent_features_test = pd.read_csv(os.path.join(data_path.DATA_PATH, 'test', 'word_dependent_features.csv'), index_col=0)

features_list_test = [embedding_features_test, psychological_features_test, structural_features_test, syntactics_features_test, text_dependent_features_test, word_dependent_features_test]

features_test = features_list_test.pop()
for feature in features_list_test:
      features_test = pd.merge(features_test, feature, left_on=['label', 'number'], right_on=['label', 'number'])

features_test

Unnamed: 0,WD_F7,WD_F8,WD_F9,WD_F10,WD_F11,WD_F12,WD_F13,WD_F14,WD_F15,WD_F16,...,SYN_F22,SYN_F23,SYN_F24,TD_F1,TD_F2,TD_F49,TD_F3,TD_F4,TD_F5,TD_F6
0,18,3.611111,0.833333,0.500000,0.500000,13,2,246.913580,0.026144,0.133333,...,0.030769,0.0,0.0,65,0.276923,18,1,0.215385,0.0,0.000000
1,51,3.901961,0.745098,0.588235,0.411765,33,4,276.816609,0.028235,0.105263,...,0.010050,0.0,0.0,199,0.125628,25,0,0.180905,0.0,0.030151
2,102,4.156863,0.745098,0.598039,0.401961,61,22,105.728566,0.010678,0.289474,...,0.000000,0.0,0.0,424,0.068396,29,7,0.214623,0.0,0.009434
3,152,3.875000,0.611842,0.572368,0.427632,72,14,147.160665,0.014814,0.150538,...,0.000000,0.0,0.0,589,0.047538,28,7,0.207131,0.0,0.018676
4,177,4.237288,0.593220,0.576271,0.423729,72,32,104.695330,0.010529,0.304762,...,0.000000,0.0,0.0,750,0.042667,32,16,0.181333,0.0,0.016000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,94,3.734043,0.702128,0.531915,0.468085,51,20,131.281123,0.013269,0.303030,...,0.000000,0.0,0.0,351,0.076923,27,9,0.196581,0.0,0.022792
374,169,4.100592,0.674556,0.585799,0.414201,94,18,122.544729,0.012327,0.157895,...,0.000000,0.0,0.0,693,0.041847,29,18,0.190476,0.0,0.017316
375,7,3.142857,1.000000,0.428571,0.571429,7,0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,22,0.454545,10,0,0.181818,0.0,0.000000
376,26,3.807692,0.807692,0.692308,0.307692,16,10,147.928994,0.015385,0.476190,...,0.000000,0.0,0.0,99,0.161616,16,3,0.171717,0.0,0.010101


In [5]:
female = features[features['label']==0]
male = features[features['label']==1]

In [6]:

df_train = female.iloc[list(range(0,100))].append(male.iloc[list(range(0,100))])
df_test = female.iloc[list(range(100,1000))].append(male.iloc[list(range(100,1000))])

X_TEST = features_test.drop(['label', 'number'], axis=1)
Y_TEST = features_test['label']
scaler = StandardScaler()
X_TEST = pd.DataFrame(scaler.fit_transform(X_TEST), columns=X_TEST.columns) 

classifier = Classifier('SVM' , kernel='rbf', gamma='scale')

for i in range(0,60):
    
    y = df_train['label']
    X = df_train.drop(['label', 'number'], axis=1)
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns) 
    
    y_test = df_test['label']
    X_test = df_test.drop(['label' , 'number'], axis=1)
    scaler = StandardScaler()
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns) 

    #classification               
    classifier.fit(X, y)    

    test_predictions_svm = classifier.predict_label(X_TEST)
    train_predictions_svm = classifier.predict_label(X)
    print( 'step ' , i , ' : - train acc=' ,"{:.2f}".format(accuracy_score(train_predictions_svm,y)),
            'test acc=' ,"{:.2f}".format(accuracy_score(Y_TEST, test_predictions_svm)),
             '      train length=' , df_train.shape[0])
 
    preds = classifier.predict(X_test)
#     diff_preds = [(1 - preds[index][i]) for index, i in enumerate(y_test)]
    diff_preds = [abs(preds[index][0] - preds[index][1]) for index, i in enumerate(y_test)]

    sort_indexes = np.argsort(diff_preds)
    c = []
    one = 0 
    zero = 0
    y_test = list(y_test)
    for idx in sort_indexes:
        if one==10 and zero==10:
            break
        if y_test[idx]==1 and one<10:
            one+=1
            c+=[idx]
        if y_test[idx]==0 and zero<10:
            zero+=1
            c+=[idx]
        else:
            continue
    add_list = [list(df_test.index)[i] for i in c]    
#     add_list = [list(df_test.index)[i] for i in sort_indexes[-10:]]

    df_train = df_train.append(features.loc[add_list])
    df_test = df_test.drop(add_list)
    

step  0  : - train acc= 0.91 test acc= 0.56       train length= 200
step  1  : - train acc= 0.92 test acc= 0.58       train length= 220
step  2  : - train acc= 0.92 test acc= 0.59       train length= 240
step  3  : - train acc= 0.93 test acc= 0.60       train length= 260
step  4  : - train acc= 0.93 test acc= 0.57       train length= 280
step  5  : - train acc= 0.93 test acc= 0.56       train length= 300
step  6  : - train acc= 0.91 test acc= 0.55       train length= 320
step  7  : - train acc= 0.93 test acc= 0.56       train length= 340
step  8  : - train acc= 0.93 test acc= 0.55       train length= 360
step  9  : - train acc= 0.92 test acc= 0.54       train length= 380
step  10  : - train acc= 0.93 test acc= 0.53       train length= 400
step  11  : - train acc= 0.93 test acc= 0.56       train length= 420
step  12  : - train acc= 0.93 test acc= 0.56       train length= 440
step  13  : - train acc= 0.93 test acc= 0.56       train length= 460
step  14  : - train acc= 0.93 test acc= 0.56