In [11]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from os.path import isfile

import config
import utils
import data_path
import data_loader

from classifier import Classifier
from feature_generator import generate_features

In [2]:
embedding_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'embedding_fasttext.csv'), index_col=0)
print(embedding_features.head())

         F0        F1        F2        F3        F4        F5        F6  \
0 -0.016724  0.017616  0.009455 -0.013369  0.017276  0.076201  0.014863   
1 -0.058125  0.009000  0.011841 -0.012871  0.055241  0.062717  0.029732   
2  0.000008  0.030201 -0.010074 -0.028357  0.013431  0.055704  0.027906   
3  0.023368  0.074310  0.004460 -0.041999  0.041976  0.022968  0.044264   
4 -0.014248  0.029315  0.000213 -0.018357  0.036656  0.043824  0.034683   

         F7        F8        F9  ...      F292      F293      F294      F295  \
0  0.031501 -0.027082  0.349909  ...  0.027879 -0.021417 -0.000200 -0.005095   
1 -0.027903 -0.049152  0.414287  ... -0.016553 -0.012303  0.000886  0.011415   
2  0.017172 -0.062071  0.361386  ...  0.002865 -0.020125  0.004039  0.034927   
3  0.077894 -0.065665  0.312161  ... -0.000737 -0.029549 -0.004705  0.038683   
4 -0.010164 -0.040589  0.398027  ... -0.002552 -0.018372  0.010154  0.023136   

       F296      F297      F298      F299  number  label  
0 -0.0061

In [3]:
psychological_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'psychological_features.csv'), index_col=0)
print(psychological_features.head())

   PSY_F36  PSY_F37  PSY_F38  PSY_F39  PSY_F47  PSY_F48  number  label
0       25       16        0        0        0        0     289      0
1        2        5        0        1        0        2     504      0
2       19       16        0        0        0        0     262      0
3        7        9        0        0        0        0     276      0
4       33       44        0        4        0        0     510      0


In [4]:
structural_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'structural_features.csv'), index_col=0)
print(structural_features.head())

   STR_F25  STR_F26  STR_F27  STR_F28  STR_F29  number  label
0        1        1     55.0      0.0    122.5     289      0
1        0        1     18.0      0.0     92.0     504      0
2        1        1     53.0      0.0    117.5     262      0
3        0        1     20.0      0.0     80.0     276      0
4        1       10     14.3      0.0    276.5     510      0


In [5]:
syntactics_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'syntactics_features.csv'), index_col=0)
print(syntactics_features.head())

   SYN_F18   SYN_F19  SYN_F20  SYN_F21  SYN_F22   SYN_F23   SYN_F24  number  \
0      0.0  0.000000      0.0      0.0      0.0  0.000000  0.000000     289   
1      0.0  0.000000      0.0      0.0      0.0  0.000000  0.000000     504   
2      0.0  0.000000      0.0      0.0      0.0  0.012766  0.004255     262   
3      0.0  0.000000      0.0      0.0      0.0  0.000000  0.000000     276   
4      0.0  0.018083      0.0      0.0      0.0  0.000000  0.000000     510   

   label  
0      0  
1      0  
2      0  
3      0  
4      0  


In [6]:
text_dependent_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'text_dependent_features.csv'), index_col=0)
print(text_dependent_features.head())

   TD_F1     TD_F2  TD_F49  TD_F3     TD_F4  TD_F5     TD_F6  number  label
0    245  0.102041      25      1  0.179592    0.0  0.012245     289      0
1     92  0.217391      20      2  0.163043    0.0  0.000000     504      0
2    235  0.110638      26      3  0.195745    0.0  0.012766     262      0
3     80  0.250000      20      1  0.212500    0.0  0.000000     276      0
4    553  0.048825      27     16  0.249548    0.0  0.018083     510      0


In [7]:
word_dependent_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'word_dependent_features.csv'), index_col=0)
print(word_dependent_features.head())

   WD_F7     WD_F8     WD_F9    WD_F10    WD_F11  WD_F12  WD_F13      WD_F14  \
0     55  4.454545  0.763636  0.600000  0.400000      33      12  119.008264   
1     18  5.111111  0.888889  0.611111  0.388889      14       4  123.456790   
2     53  4.433962  0.830189  0.660377  0.339623      37      10   78.319687   
3     20  4.000000  0.950000  0.800000  0.200000      18       2   50.000000   
4    143  3.867133  0.601399  0.608392  0.391608      66      16  165.289256   

     WD_F15    WD_F16       WD_F17    WD_F50  number  label  
0  0.012121  0.285714  1870.088820  3.635431     289      0  
1  0.013072  0.250000  2312.297406  2.736339     504      0  
2  0.007983  0.227273  2495.612060  3.715138     262      0  
3  0.005263  0.105263  5691.891320  2.926418     276      0  
4  0.016645  0.186047  2134.023191  4.147394     510      0  


In [8]:
features_list = [embedding_features, psychological_features, structural_features, syntactics_features, text_dependent_features, word_dependent_features]

features = features_list.pop()
for feature in features_list:
      features = pd.merge(features, feature, left_on=['label', 'number'], right_on=['label', 'number'])

print(features.head())
print(features.shape)

   WD_F7     WD_F8     WD_F9    WD_F10    WD_F11  WD_F12  WD_F13      WD_F14  \
0     55  4.454545  0.763636  0.600000  0.400000      33      12  119.008264   
1     18  5.111111  0.888889  0.611111  0.388889      14       4  123.456790   
2     53  4.433962  0.830189  0.660377  0.339623      37      10   78.319687   
3     20  4.000000  0.950000  0.800000  0.200000      18       2   50.000000   
4    143  3.867133  0.601399  0.608392  0.391608      66      16  165.289256   

     WD_F15    WD_F16  ...  SYN_F22   SYN_F23   SYN_F24  TD_F1     TD_F2  \
0  0.012121  0.285714  ...      0.0  0.000000  0.000000    245  0.102041   
1  0.013072  0.250000  ...      0.0  0.000000  0.000000     92  0.217391   
2  0.007983  0.227273  ...      0.0  0.012766  0.004255    235  0.110638   
3  0.005263  0.105263  ...      0.0  0.000000  0.000000     80  0.250000   
4  0.016645  0.186047  ...      0.0  0.000000  0.000000    553  0.048825   

   TD_F49  TD_F3     TD_F4  TD_F5     TD_F6  
0      25      1

In [19]:
features = features.sample(frac=1)
features

Unnamed: 0,WD_F7,WD_F8,WD_F9,WD_F10,WD_F11,WD_F12,WD_F13,WD_F14,WD_F15,WD_F16,...,SYN_F22,SYN_F23,SYN_F24,TD_F1,TD_F2,TD_F49,TD_F3,TD_F4,TD_F5,TD_F6
1231,37,3.621622,0.783784,0.486486,0.513514,25,6,262.965668,0.027027,0.206897,...,0.0,0.0,0.0,134,0.171642,23,2,0.223881,0.0,0.000000
649,105,4.066667,0.685714,0.657143,0.342857,55,20,117.913832,0.011905,0.277778,...,0.0,0.0,0.0,427,0.060890,26,8,0.201405,0.0,0.014052
107,59,4.101695,0.745763,0.661017,0.338983,37,8,195.346165,0.019871,0.181818,...,0.0,0.0,0.0,242,0.095041,23,3,0.202479,0.0,0.004132
778,265,4.705660,0.724528,0.694340,0.305660,159,44,51.833393,0.005203,0.229167,...,0.0,0.0,0.0,1247,0.026464,33,10,0.255814,0.0,0.009623
1963,123,4.000000,0.609756,0.552846,0.447154,55,26,185.075021,0.018659,0.346667,...,0.0,0.0,0.0,492,0.048780,24,15,0.235772,0.0,0.012195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,60,4.133333,0.816667,0.650000,0.350000,41,12,83.333333,0.008475,0.244898,...,0.0,0.0,0.0,248,0.100806,25,9,0.298387,0.0,0.000000
1782,55,4.509091,0.836364,0.654545,0.345455,42,4,132.231405,0.013468,0.086957,...,0.0,0.0,0.0,248,0.108871,27,9,0.149194,0.0,0.028226
771,30,3.966667,0.833333,0.566667,0.433333,21,6,133.333333,0.013793,0.240000,...,0.0,0.0,0.0,119,0.159664,19,3,0.201681,0.0,0.000000
477,57,4.350877,0.789474,0.596491,0.403509,35,18,92.336103,0.009398,0.400000,...,0.0,0.0,0.0,248,0.104839,26,8,0.201613,0.0,0.000000


In [118]:

df_train = features.iloc[list(range(0,100))]
df_test = features.iloc[list(range(100,2000))]

for i in range(0,100):
    
    y = df_train['label']
    X = df_train.drop(['label', 'number'], axis=1)
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns) 

    y_test = df_test['label']
    X_test = df_test.drop(['label' , 'number'], axis=1)
    scaler = StandardScaler()
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns) 
    
    #feature importance
    classifier = Classifier('Random Forest', n_estimator=300)
    classifier.fit(X, y)
    feature_importance = classifier.model.feature_importances_              

    selected_features = feature_importance > 0.005            
    X = X.T[selected_features].T 

    #classification               
    classifier = Classifier('SVM' , kernel='rbf', gamma='scale')
    classifier.fit(X, y)    

    #test
    X_test = X_test.T[selected_features].T

    test_predictions_svm = classifier.predict(X_test)
    test_prediction_labels_svm = np.array(test_predictions_svm[:,1] > 0.005, dtype=int)
    print( 'step ' , i , ' : ' , accuracy_score(y_test, test_prediction_labels_svm))

    preds = classifier.predict(X_test)
    diff_preds = [abs(1 - preds[index][i]) for index, i in enumerate(y_test)]
    sort_indexes = np.argsort(diff_preds)
#     sort_indexes = sorted(range(len(diff_preds)), key=lambda k: diff_preds[k])
#     print(sort_indexes[-10:]) 
    add_list = [list(df_test.index)[i] for i in sort_indexes[-10:]]
#     print(add_list)
    df_train = df_train.append(features.loc[add_list])
    df_test = df_test.drop(add_list)
    

step  0  :  0.4994736842105263
step  1  :  0.49947089947089945
step  2  :  0.502127659574468
step  3  :  0.5
step  4  :  0.49838709677419357
step  5  :  0.5010810810810811
step  6  :  0.4983695652173913
step  7  :  0.5010928961748634
step  8  :  0.4989010989010989
step  9  :  0.5016574585635359
step  10  :  0.4988888888888889
step  11  :  0.5016759776536313
step  12  :  0.49887640449438203
step  13  :  0.5
step  14  :  0.5005681818181819
step  15  :  0.5011428571428571
step  16  :  0.4982758620689655
step  17  :  0.4976878612716763
step  18  :  0.5005813953488372
step  19  :  0.4988304093567251
step  20  :  0.5017647058823529
step  21  :  0.5005917159763313
step  22  :  0.5005952380952381
step  23  :  0.5023952095808383
step  24  :  0.49939759036144576
step  25  :  0.5018181818181818
step  26  :  0.49878048780487805
step  27  :  0.498159509202454
step  28  :  0.5012345679012346
step  29  :  0.49875776397515525
step  30  :  0.501875
step  31  :  0.4987421383647799
step  32  :  0.4987341