In [5]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Let wide frames (up to 410 columns) render in full instead of being elided.
pd.set_option('display.max_columns', 410)

In [2]:
path = 'data/final_data_rev/'

# Target column followed by the five categorical base-model columns.
META_COLUMNS = ['label', 'Model0', 'Model1', 'Model2', 'Model3', 'Model4']
MODEL_COLUMNS = META_COLUMNS[1:]
# Order fixes the order of the NUM_* columns appended after the dummies.
PERIODS = ['2month', 'month', 'week', 'retained']


def build_meta_features(data):
    """Build the stacking feature frame from a raw meta DataFrame.

    The result holds, in this order: the untouched ``label`` column, the
    one-hot encoding of the ``Model0``..``Model4`` columns, and one
    ``NUM_<period>`` column per entry of ``PERIODS`` counting how many of the
    five model columns carry that period value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns listed in ``META_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        New frame; ``data`` itself is not modified.
    """
    meta = data[META_COLUMNS]
    # Cast to int so the per-period totals below are real counts even on
    # pandas versions where get_dummies yields boolean columns (adding
    # booleans would give a logical OR, never a value above 1).
    dummies = pd.get_dummies(meta[MODEL_COLUMNS]).astype(int)
    features = pd.concat([meta['label'], dummies], axis=1)
    for period in PERIODS:
        period_columns = [model + '_' + period for model in MODEL_COLUMNS]
        # A period that never occurs for some model has no dummy column;
        # treat it as all zeros instead of failing with a KeyError.
        features['NUM_' + period] = (
            dummies.reindex(columns=period_columns, fill_value=0).sum(axis=1)
        )
    return features


# Load Data
TRAIN_DATA = pd.read_csv(path + 'train_meta.csv')
TEST_DATA = pd.read_csv(path + 'test_meta.csv')

# Preprocessing train dataset
TRAIN_META = build_meta_features(TRAIN_DATA)

# Preprocessing test dataset
# The test frame is forced onto the train column layout so both feature
# matrices have identical columns in identical order, even if a category is
# present in only one of the two files.
TEST_META = build_meta_features(TEST_DATA).reindex(columns=TRAIN_META.columns, fill_value=0)

In [6]:
# Training matrices: every column except the target is a feature.
y = TRAIN_META['label'].values
X = TRAIN_META.drop(columns=['label']).values

# Test feature matrix, built the same way from the test frame.
X_test = TEST_META.drop(columns=['label']).values

# Shuffled 5-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(random_state=1, shuffle=True, n_splits=5)

In [None]:
# 5-fold cross-validation of a default SVC, scored with macro-averaged F1.
model = SVC()
accuracy = []  # holds per-fold macro F1 values despite the name
count = 1
for train_index, test_index in kf.split(X):
    # These split variables stay in the namespace after the loop; later cells
    # read X_train / X_val / y_train / y_val.
    X_train, y_train = X[train_index], y[train_index]
    X_val, y_val = X[test_index], y[test_index]
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    f1 = f1_score(y_val, val_pred, average='macro')
    print(count, "fold f1: ", f1)
    accuracy.append(f1)
    count += 1
fold_scores = np.array(accuracy)
print("average f1 : ", fold_scores.mean())
print("std f1 : ", fold_scores.std())

1 fold f1:  0.7487918524413306


In [41]:
# Disabled experiment: the cell body is a bare string literal, so none of the
# code inside it runs. It fits a default SVC on X_train/y_train and reports a
# macro-averaged F1.
# NOTE(review): the snippet scores y_test against predictions made on X_val;
# elsewhere in this notebook y_test is only assigned as the predictions for
# X_test, so this was presumably meant to be y_val -- confirm before re-enabling.
'''
model = SVC()
model.fit(X_train, y_train)
f1_score(y_test, model.predict(X_val), average = 'macro')
'''

Unnamed: 0,Model0_2month,Model0_month,Model0_retained,Model0_week,Model1_2month,Model1_month,Model1_retained,Model1_week,Model2_2month,Model2_month,Model2_retained,Model2_week,Model3_2month,Model3_month,Model3_retained,Model3_week,Model4_2month,Model4_month,Model4_retained,Model4_week,NUM_2month,NUM_month,NUM_week,NUM_retained
0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,5
1,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,5,0,0,0
2,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,5
3,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,5,0
4,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,5


In [6]:
# Disabled experiment: the cell body is a bare string literal, so none of the
# code inside it runs. It fits a 300-tree random forest on X_train/y_train and
# prints macro-averaged F1 on the training and validation splits.
# NOTE(review): X_train/X_val are not defined in this cell; presumably they are
# the split left behind by the last cross-validation fold -- confirm.
'''
model = RandomForestClassifier(n_estimators=300)
model.fit(X_train, y_train)

print("train score : ", f1_score(y_train, model.predict(X_train), average = 'macro'))
print("test score : ", f1_score(y_val, model.predict(X_val), average = 'macro'))
'''

train score :  0.7462200599918095
test score :  0.7462431668744021


In [14]:
# Fit a 300-tree extra-trees ensemble and report macro-averaged F1 on the
# training and validation splits.
# NOTE(review): X_train/X_val/y_train/y_val are not defined in this cell;
# presumably they are the split left behind by the last cross-validation
# fold -- confirm the cell order before trusting these scores.
# random_state is pinned (same seed as the KFold splitter) so that repeated
# runs build the same trees and therefore give the same scores and predictions.
model = ExtraTreesClassifier(n_estimators=300, random_state=1)
model.fit(X_train, y_train)

print("train score : ", f1_score(y_train, model.predict(X_train), average = 'macro'))
print("test score : ", f1_score(y_val, model.predict(X_val), average = 'macro'))

train score :  0.7462219483846372
test score :  0.7459335147888104


In [15]:
# Disabled experiment: the cell body is a bare string literal, so none of the
# code inside it runs. It fits a default decision tree on X_train/y_train and
# prints macro-averaged F1 on the training and validation splits.
'''
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

print("train score : ", f1_score(y_train, model.predict(X_train), average = 'macro'))
print("test score : ", f1_score(y_val, model.predict(X_val), average = 'macro'))

'''

train score :  0.7462899638937354
test score :  0.7464703256031181


In [19]:
# Disabled experiment: the cell body is a bare string literal, so none of the
# code inside it runs. It fits a default logistic regression on
# X_train/y_train and prints macro-averaged F1 on the training and validation
# splits.
'''
model = LogisticRegression()
model.fit(X_train, y_train)

print("train score : ", f1_score(y_train, model.predict(X_train), average = 'macro'))
print("test score : ", f1_score(y_val, model.predict(X_val), average = 'macro'))
'''

train score :  0.7420102386712704
test score :  0.7456198207792556


In [21]:
# Refit the current model on every training row, then score it on those same
# rows (macro-averaged F1); this is a training score, not a held-out estimate.
full_train_pred = model.fit(X, y).predict(X)
print("train score : ", f1_score(y, full_train_pred, average='macro'))

train score :  0.7451346467087362


In [11]:
# Predict labels for the test feature matrix and write the submission file.
y_test = model.predict(X_test)
# Overwrites the 'label' column that was loaded from test_meta.csv.
TEST_DATA['label'] = y_test
# `encoding` must be a codec name (or None); the previous value False is not a
# valid encoding, so the intended UTF-8 output is requested explicitly.
# NOTE(review): the file name says "RF", but which estimator `model` refers to
# depends on which cell was run last -- confirm it matches.
TEST_DATA[['acc_id', 'label']].to_csv(path+'result0907_stacking_RF.csv', index = False, encoding = 'utf-8')

In [24]:
# Show the fitted model's feature importances in rows of four.
# Assuming the model was fitted on the features built in the preprocessing
# cell, rows 0-4 are Model0..Model4 with columns (2month, month, retained,
# week), while the last row holds the NUM_* totals in the order (2month,
# month, week, retained) -- note the different column order in that row.
np.reshape(model.feature_importances_, (-1, 4))

array([[0.02667654, 0.02861547, 0.07817276, 0.13042366],
       [0.01806407, 0.01884893, 0.07149054, 0.09570259],
       [0.00829071, 0.01199422, 0.03425666, 0.0792889 ],
       [0.00538444, 0.00604851, 0.05439276, 0.03808288],
       [0.01168186, 0.00997444, 0.05747004, 0.05688546],
       [0.02368627, 0.0180026 , 0.06863335, 0.04793236]])