In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from os.path import isfile

import config
import utils
import data_path
import data_loader

from classifier import Classifier
from feature_generator import generate_features

In [2]:
embedding_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'embedding_fasttext.csv'), index_col=0)
print(embedding_features.head())

         F0        F1        F2        F3        F4        F5        F6  \
0 -0.016724  0.017616  0.009455 -0.013369  0.017276  0.076201  0.014863   
1 -0.058125  0.009000  0.011841 -0.012871  0.055241  0.062717  0.029732   
2  0.000008  0.030201 -0.010074 -0.028357  0.013431  0.055704  0.027906   
3  0.023368  0.074310  0.004460 -0.041999  0.041976  0.022968  0.044264   
4 -0.014248  0.029315  0.000213 -0.018357  0.036656  0.043824  0.034683   

         F7        F8        F9  ...      F292      F293      F294      F295  \
0  0.031501 -0.027082  0.349909  ...  0.027879 -0.021417 -0.000200 -0.005095   
1 -0.027903 -0.049152  0.414287  ... -0.016553 -0.012303  0.000886  0.011415   
2  0.017172 -0.062071  0.361386  ...  0.002865 -0.020125  0.004039  0.034927   
3  0.077894 -0.065665  0.312161  ... -0.000737 -0.029549 -0.004705  0.038683   
4 -0.010164 -0.040589  0.398027  ... -0.002552 -0.018372  0.010154  0.023136   

       F296      F297      F298      F299  number  label  
0 -0.0061

In [3]:
psychological_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'psychological_features.csv'), index_col=0)
print(psychological_features.head())

   PSY_F36  PSY_F37  PSY_F38  PSY_F39  PSY_F47  PSY_F48  number  label
0       25       16        0        0        0        0     289      0
1        2        5        0        1        0        2     504      0
2       19       16        0        0        0        0     262      0
3        7        9        0        0        0        0     276      0
4       33       44        0        4        0        0     510      0


In [4]:
structural_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'structural_features.csv'), index_col=0)
print(structural_features.head())

   STR_F25  STR_F26  STR_F27  STR_F28  STR_F29  number  label
0        1        1     55.0      0.0    122.5     289      0
1        0        1     18.0      0.0     92.0     504      0
2        1        1     53.0      0.0    117.5     262      0
3        0        1     20.0      0.0     80.0     276      0
4        1       10     14.3      0.0    276.5     510      0


In [5]:
syntactics_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'syntactics_features.csv'), index_col=0)
print(syntactics_features.head())

   SYN_F18   SYN_F19  SYN_F20  SYN_F21  SYN_F22   SYN_F23   SYN_F24  number  \
0      0.0  0.000000      0.0      0.0      0.0  0.000000  0.000000     289   
1      0.0  0.000000      0.0      0.0      0.0  0.000000  0.000000     504   
2      0.0  0.000000      0.0      0.0      0.0  0.012766  0.004255     262   
3      0.0  0.000000      0.0      0.0      0.0  0.000000  0.000000     276   
4      0.0  0.018083      0.0      0.0      0.0  0.000000  0.000000     510   

   label  
0      0  
1      0  
2      0  
3      0  
4      0  


In [6]:
text_dependent_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'text_dependent_features.csv'), index_col=0)
print(text_dependent_features.head())

   TD_F1     TD_F2  TD_F49  TD_F3     TD_F4  TD_F5     TD_F6  number  label
0    245  0.102041      25      1  0.179592    0.0  0.012245     289      0
1     92  0.217391      20      2  0.163043    0.0  0.000000     504      0
2    235  0.110638      26      3  0.195745    0.0  0.012766     262      0
3     80  0.250000      20      1  0.212500    0.0  0.000000     276      0
4    553  0.048825      27     16  0.249548    0.0  0.018083     510      0


In [7]:
word_dependent_features = pd.read_csv(os.path.join(data_path.DATA_PATH, 'train', 'word_dependent_features.csv'), index_col=0)
print(word_dependent_features.head())

   WD_F7     WD_F8     WD_F9    WD_F10    WD_F11  WD_F12  WD_F13      WD_F14  \
0     55  4.454545  0.763636  0.600000  0.400000      33      12  119.008264   
1     18  5.111111  0.888889  0.611111  0.388889      14       4  123.456790   
2     53  4.433962  0.830189  0.660377  0.339623      37      10   78.319687   
3     20  4.000000  0.950000  0.800000  0.200000      18       2   50.000000   
4    143  3.867133  0.601399  0.608392  0.391608      66      16  165.289256   

     WD_F15    WD_F16       WD_F17    WD_F50  number  label  
0  0.012121  0.285714  1870.088820  3.635431     289      0  
1  0.013072  0.250000  2312.297406  2.736339     504      0  
2  0.007983  0.227273  2495.612060  3.715138     262      0  
3  0.005263  0.105263  5691.891320  2.926418     276      0  
4  0.016645  0.186047  2134.023191  4.147394     510      0  


In [16]:
features_list = [embedding_features, psychological_features, structural_features, syntactics_features, text_dependent_features, word_dependent_features]

features = features_list.pop()
for feature in features_list:
      features = pd.merge(features, feature, left_on=['label', 'number'], right_on=['label', 'number'])

print(features.head())
print(features.shape)

   WD_F7     WD_F8     WD_F9    WD_F10    WD_F11  WD_F12  WD_F13      WD_F14  \
0     55  4.454545  0.763636  0.600000  0.400000      33      12  119.008264   
1     18  5.111111  0.888889  0.611111  0.388889      14       4  123.456790   
2     53  4.433962  0.830189  0.660377  0.339623      37      10   78.319687   
3     20  4.000000  0.950000  0.800000  0.200000      18       2   50.000000   
4    143  3.867133  0.601399  0.608392  0.391608      66      16  165.289256   

     WD_F15    WD_F16  ...  SYN_F22   SYN_F23   SYN_F24  TD_F1     TD_F2  \
0  0.012121  0.285714  ...      0.0  0.000000  0.000000    245  0.102041   
1  0.013072  0.250000  ...      0.0  0.000000  0.000000     92  0.217391   
2  0.007983  0.227273  ...      0.0  0.012766  0.004255    235  0.110638   
3  0.005263  0.105263  ...      0.0  0.000000  0.000000     80  0.250000   
4  0.016645  0.186047  ...      0.0  0.000000  0.000000    553  0.048825   

   TD_F49  TD_F3     TD_F4  TD_F5     TD_F6  
0      25      1

In [17]:
features

Unnamed: 0,WD_F7,WD_F8,WD_F9,WD_F10,WD_F11,WD_F12,WD_F13,WD_F14,WD_F15,WD_F16,...,SYN_F22,SYN_F23,SYN_F24,TD_F1,TD_F2,TD_F49,TD_F3,TD_F4,TD_F5,TD_F6
0,55,4.454545,0.763636,0.600000,0.400000,33,12,119.008264,0.012121,0.285714,...,0.0,0.000000,0.000000,245,0.102041,25,1,0.179592,0.0,0.012245
1,18,5.111111,0.888889,0.611111,0.388889,14,4,123.456790,0.013072,0.250000,...,0.0,0.000000,0.000000,92,0.217391,20,2,0.163043,0.0,0.000000
2,53,4.433962,0.830189,0.660377,0.339623,37,10,78.319687,0.007983,0.227273,...,0.0,0.012766,0.004255,235,0.110638,26,3,0.195745,0.0,0.012766
3,20,4.000000,0.950000,0.800000,0.200000,18,2,50.000000,0.005263,0.105263,...,0.0,0.000000,0.000000,80,0.250000,20,1,0.212500,0.0,0.000000
4,143,3.867133,0.601399,0.608392,0.391608,66,16,165.289256,0.016645,0.186047,...,0.0,0.000000,0.000000,553,0.048825,27,16,0.249548,0.0,0.018083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,22,3.909091,0.909091,0.681818,0.318182,19,0,123.966942,0.012987,0.000000,...,0.0,0.000000,0.000000,86,0.209302,18,1,0.174419,0.0,0.023256
1996,9,3.444444,1.000000,0.444444,0.555556,9,0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,31,0.354839,11,1,0.193548,0.0,0.032258
1997,58,4.896552,0.810345,0.672414,0.327586,38,14,77.288942,0.007864,0.297872,...,0.0,0.000000,0.000000,284,0.098592,28,2,0.197183,0.0,0.000000
1998,63,4.015873,0.698413,0.587302,0.412698,33,14,161.249685,0.016385,0.318182,...,0.0,0.000000,0.000000,253,0.086957,22,3,0.324111,0.0,0.015810


In [18]:
female = features[features['label']==0]
male = features[features['label']==1]


In [20]:
df_train = female.iloc[list(range(0,50))].append(male.iloc[list(range(0,50))])
df_train

Unnamed: 0,WD_F7,WD_F8,WD_F9,WD_F10,WD_F11,WD_F12,WD_F13,WD_F14,WD_F15,WD_F16,...,SYN_F22,SYN_F23,SYN_F24,TD_F1,TD_F2,TD_F49,TD_F3,TD_F4,TD_F5,TD_F6
0,55,4.454545,0.763636,0.600000,0.400000,33,12,119.008264,0.012121,0.285714,...,0.0,0.000000,0.000000,245,0.102041,25,1,0.179592,0.0,0.012245
1,18,5.111111,0.888889,0.611111,0.388889,14,4,123.456790,0.013072,0.250000,...,0.0,0.000000,0.000000,92,0.217391,20,2,0.163043,0.0,0.000000
2,53,4.433962,0.830189,0.660377,0.339623,37,10,78.319687,0.007983,0.227273,...,0.0,0.012766,0.004255,235,0.110638,26,3,0.195745,0.0,0.012766
3,20,4.000000,0.950000,0.800000,0.200000,18,2,50.000000,0.005263,0.105263,...,0.0,0.000000,0.000000,80,0.250000,20,1,0.212500,0.0,0.000000
4,143,3.867133,0.601399,0.608392,0.391608,66,16,165.289256,0.016645,0.186047,...,0.0,0.000000,0.000000,553,0.048825,27,16,0.249548,0.0,0.018083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,72,4.055556,0.736111,0.611111,0.388889,43,10,123.456790,0.012520,0.188679,...,0.0,0.000000,0.000000,292,0.089041,26,4,0.208904,0.0,0.000000
1046,214,4.331776,0.668224,0.672897,0.327103,117,28,131.889248,0.013251,0.195804,...,0.0,0.000000,0.000000,927,0.034520,32,12,0.183387,0.0,0.029126
1047,135,3.925926,0.637037,0.533333,0.466667,69,12,144.855967,0.014594,0.139535,...,0.0,0.000000,0.000000,530,0.054717,29,21,0.194340,0.0,0.016981
1048,153,4.333333,0.660131,0.627451,0.372549,72,26,71.767269,0.007224,0.257426,...,0.0,0.000000,0.000000,663,0.046757,31,10,0.217195,0.0,0.001508


In [44]:

df_train = female.iloc[list(range(0,50))].append(male.iloc[list(range(0,50))])
df_test = female.iloc[list(range(50,1000))].append(male.iloc[list(range(50,1000))])

# df_test_female = female.iloc[list(range(50,1000))]
# df_test_male = male.iloc[list(range(50,1000))]


for i in range(0,94):
    
    y = df_train['label']
    X = df_train.drop(['label', 'number'], axis=1)
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns) 
    
#     df_test = df_test_female.append(df_test_male)
    y_test = df_test['label']
    X_test = df_test.drop(['label' , 'number'], axis=1)
    scaler = StandardScaler()
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns) 
    
    #feature importance
    classifier = Classifier('Random Forest', n_estimator=300)
    classifier.fit(X, y)
    feature_importance = classifier.model.feature_importances_              

    selected_features = feature_importance > 0.005            
    X = X.T[selected_features].T 

    #classification               
    classifier = Classifier('SVM' , kernel='rbf', gamma='scale')
    classifier.fit(X, y)    

    #test
    X_test = X_test.T[selected_features].T

    test_predictions_svm = classifier.predict(X_test)
    test_prediction_labels_svm = np.array(test_predictions_svm[:,1] > 0.005, dtype=int)
    train_predictions_svm = classifier.predict(X)
    train_prediction_labels_svm = np.array(train_predictions_svm[:,1] > 0.005, dtype=int)
    print( 'step ' , i , ' : - train acc=' ,"{:.2f}".format(accuracy_score(train_prediction_labels_svm,y)),
            'test acc=' ,"{:.2f}".format(accuracy_score(y_test, test_prediction_labels_svm)),
             '      train length=' , df_train.shape[0])
 
    preds = classifier.predict(X_test)
    diff_preds = [(1 - preds[index][i]) for index, i in enumerate(y_test)]
    sort_indexes = np.argsort(diff_preds)
#     sort_indexes = sorted(range(len(diff_preds)), key=lambda k: diff_preds[k])
#     print(sort_indexes[-10:]) 
    c = []
    one = 0 
    zero = 0
    y_test = list(y_test)
    for idx in sort_indexes[::-1]:
#         print(idx , y_test[idx])
        if one==10 and zero==10:
            break
        if y_test[idx]==1 and one<10:
            one+=1
            c+=[idx]
        if y_test[idx]==0 and zero<10:
            zero+=1
            c+=[idx]
        else:
            continue
    add_list = [list(df_test.index)[i] for i in c]
    print(list(features['label'].loc[add_list]))
    df_train = df_train.append(features.loc[add_list])
    df_test = df_test.drop(add_list)
    

step  0  : - train acc= 0.50 test acc= 0.50       train length= 100
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
step  1  : - train acc= 0.50 test acc= 0.50       train length= 120
[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
step  2  : - train acc= 0.50 test acc= 0.50       train length= 140
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
step  3  : - train acc= 0.50 test acc= 0.50       train length= 160
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
step  4  : - train acc= 0.50 test acc= 0.50       train length= 180
[1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
step  5  : - train acc= 0.50 test acc= 0.50       train length= 200
[0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0]
step  6  : - train acc= 0.50 test acc= 0.50       train length= 220
[1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]
step  7  : - train acc= 0.50 test acc= 0.50       train length= 240
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0

step  63  : - train acc= 0.50 test acc= 0.50       train length= 1360
[0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0]
step  64  : - train acc= 0.50 test acc= 0.50       train length= 1380
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
step  65  : - train acc= 0.50 test acc= 0.50       train length= 1400
[1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
step  66  : - train acc= 0.50 test acc= 0.50       train length= 1420
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
step  67  : - train acc= 0.50 test acc= 0.50       train length= 1440
[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0]
step  68  : - train acc= 0.50 test acc= 0.50       train length= 1460
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
step  69  : - train acc= 0.50 test acc= 0.50       train length= 1480
[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]
step  70  : - train acc= 0.50 test acc= 0.50       train length= 1500
[0, 1, 1, 1, 