In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Let pandas render up to 410 columns before it starts eliding wide frames
pd.set_option('display.max_columns', 410)

In [2]:
# Read the train and test meta tables from the local data directory
TRAIN_META, TEST_META = map(pd.read_csv, ('data/train_meta.csv', 'data/test_meta.csv'))

In [3]:
# Load Data
# The raw CSVs are re-read here so this cell can be re-run on its own: the
# preprocessing below rebinds TRAIN_META / TEST_META.
TRAIN_META = pd.read_csv('data/train_meta.csv')
TEST_META = pd.read_csv('data/test_meta.csv')

# The NUM_<suffix> features sum the columns Model0_<suffix> .. Model8_<suffix>.
N_MODELS = 9
NUM_SUFFIXES = ('2month', 'month', 'week', 'retained')


def preprocess_meta(raw_meta):
    """Build the encoded feature frame for one raw meta table.

    Keeps the ``label`` column, one-hot encodes every column from position 2
    onwards, and appends one ``NUM_<suffix>`` column per entry of
    ``NUM_SUFFIXES`` holding the row-wise sum of ``Model0_<suffix>`` ..
    ``Model8_<suffix>``.

    Parameters
    ----------
    raw_meta : pandas.DataFrame
        Frame as read from the CSV; must contain a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw_meta`` itself is left untouched.

    Raises
    ------
    KeyError
        If ``label`` or any ``Model<i>_<suffix>`` column is missing.
    """
    # The dummy dtype is pinned explicitly: pandas >= 2.0 emits bool dummies by
    # default, and adding bool Series is a logical OR, which would turn the
    # NUM_* sums into True/False flags.
    dummies = pd.get_dummies(raw_meta.iloc[:, 2:], dtype=np.uint8)
    encoded = pd.concat([raw_meta['label'], dummies], axis=1)
    for suffix in NUM_SUFFIXES:
        model_cols = [f'Model{i}_{suffix}' for i in range(N_MODELS)]
        # skipna=False keeps NaN propagation identical to chaining `+`.
        encoded[f'NUM_{suffix}'] = encoded[model_cols].sum(axis=1, skipna=False)
    return encoded


# Preprocessing train dataset
TRAIN_META = preprocess_meta(TRAIN_META)

# Preprocessing test dataset
TEST_META = preprocess_meta(TEST_META)

# The two frames are one-hot encoded independently, so a category seen in only
# one of them would give different column sets (and NaNs once the frames are
# stacked). Align the test columns to the train columns; a dummy column that is
# absent from the test data is filled with 0.
TEST_META = TEST_META.reindex(columns=TRAIN_META.columns, fill_value=0)

In [4]:
# Remember where the train rows end so the stacked frame can be split back
# without hard-coding the dataset sizes.
n_train = len(TRAIN_META)

# Stack train on top of test; ignore_index gives one continuous 0..N-1 index.
FULL = pd.concat([TRAIN_META, TEST_META], ignore_index=True)
X = FULL.drop(['label'], axis=1)
X = X.values

# A float n_components asks PCA to keep as many components as are needed to
# explain that share of the variance. The projection is fitted on the train
# and test rows together.
pca = PCA(0.98)
pca_result = pca.fit_transform(X)

# Append the components as extra columns (named 0, 1, 2, ...) next to the
# existing features, then cut the frame back into its train and test parts.
# NOTE: TRAIN_META / TEST_META are rebound here, so running this cell a second
# time would feed the appended components back into PCA; re-run the
# preprocessing cell first.
FULL = pd.concat([FULL, pd.DataFrame(pca_result)], axis=1)
TRAIN_META = FULL.iloc[:n_train, :]
TEST_META = FULL.iloc[n_train:, :]

In [7]:
# Feature matrices for train and test: every column except the target, as arrays
X, X_test = (frame.drop(columns=['label']).values for frame in (TRAIN_META, TEST_META))

# Target vector for the training rows
y = TRAIN_META.loc[:, 'label'].values

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, random_state=1, shuffle=True)

In [8]:
# Cross-validate a 150-tree random forest over the folds from kf (macro F1)
model = RandomForestClassifier(n_estimators=150, random_state=1)
fold_scores = []
for fold_no, (fit_idx, val_idx) in enumerate(kf.split(X), start=1):
    # Refit on this fold's training rows, then score the held-out rows
    model.fit(X[fit_idx], y[fit_idx])
    val_pred = model.predict(X[val_idx])
    fold_f1 = f1_score(y[val_idx], val_pred, average='macro')
    print(fold_no, "fold f1: ", fold_f1)
    fold_scores.append(fold_f1)

# Summarise the per-fold scores
fold_scores = np.asarray(fold_scores)
print("average f1 : ", fold_scores.mean())
print("std f1 : ", fold_scores.std())

1 fold f1:  0.7470136097880241
2 fold f1:  0.7377908042082493
3 fold f1:  0.746262331267326
4 fold f1:  0.7430938856882866
5 fold f1:  0.7370559173979307
average f1 :  0.7422433096699634
std f1 :  0.004156094822063659


In [9]:
# Cross-validate a 200-tree random forest over the folds from kf (macro F1)
model = RandomForestClassifier(n_estimators=200, random_state=1)
fold_scores = []
for fold_no, (fit_idx, val_idx) in enumerate(kf.split(X), start=1):
    # Refit on this fold's training rows, then score the held-out rows
    model.fit(X[fit_idx], y[fit_idx])
    val_pred = model.predict(X[val_idx])
    fold_f1 = f1_score(y[val_idx], val_pred, average='macro')
    print(fold_no, "fold f1: ", fold_f1)
    fold_scores.append(fold_f1)

# Summarise the per-fold scores
fold_scores = np.asarray(fold_scores)
print("average f1 : ", fold_scores.mean())
print("std f1 : ", fold_scores.std())

1 fold f1:  0.7473662647567574
2 fold f1:  0.737557239910768
3 fold f1:  0.745997596956999
4 fold f1:  0.7431398139411813
5 fold f1:  0.7376096507245291
average f1 :  0.742334113258047
std f1 :  0.004111736380553144


In [10]:
# Cross-validate a 150-tree extra-trees classifier over the folds from kf (macro F1)
model = ExtraTreesClassifier(n_estimators=150, random_state=1)
fold_scores = []
for fold_no, (fit_idx, val_idx) in enumerate(kf.split(X), start=1):
    # Refit on this fold's training rows, then score the held-out rows
    model.fit(X[fit_idx], y[fit_idx])
    val_pred = model.predict(X[val_idx])
    fold_f1 = f1_score(y[val_idx], val_pred, average='macro')
    print(fold_no, "fold f1: ", fold_f1)
    fold_scores.append(fold_f1)

# Summarise the per-fold scores
fold_scores = np.asarray(fold_scores)
print("average f1 : ", fold_scores.mean())
print("std f1 : ", fold_scores.std())

1 fold f1:  0.7471094065354127
2 fold f1:  0.7372297201263883
3 fold f1:  0.7456525371495972
4 fold f1:  0.7422935647465756
5 fold f1:  0.7377495674121302
average f1 :  0.7420069591940208
std f1 :  0.00400885889346186


In [11]:
# Cross-validate an SVC with default settings over the folds from kf (macro F1)
model = SVC()
fold_scores = []
for fold_no, (fit_idx, val_idx) in enumerate(kf.split(X), start=1):
    # Refit on this fold's training rows, then score the held-out rows
    model.fit(X[fit_idx], y[fit_idx])
    val_pred = model.predict(X[val_idx])
    fold_f1 = f1_score(y[val_idx], val_pred, average='macro')
    print(fold_no, "fold f1: ", fold_f1)
    fold_scores.append(fold_f1)

# Summarise the per-fold scores
fold_scores = np.asarray(fold_scores)
print("average f1 : ", fold_scores.mean())
print("std f1 : ", fold_scores.std())

1 fold f1:  0.7512543375316441
2 fold f1:  0.7431723296274357
3 fold f1:  0.7475411512965358
4 fold f1:  0.7445979392683681
5 fold f1:  0.7427679240422658
average f1 :  0.7458667363532498
std f1 :  0.003171820399869984
