In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

In [2]:
# Working directory at the time this cell runs.
# NOTE(review): `current` is not used by any cell visible in this notebook.
current = os.getcwd()

# ---------------------------------------------------------------------------
# Load the machine-learning train / test tables.
# The first CSV column is used as the DataFrame index.
# ---------------------------------------------------------------------------
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)

# ---------------------------------------------------------------------------
# Result tables: independent copies holding only the 'data_num' and 'label'
# columns. Later cells append one prediction column per model.
# ---------------------------------------------------------------------------
ml_train_results = train.loc[:, ['data_num', 'label']].copy()
ml_test_results = test.loc[:, ['data_num', 'label']].copy()

# Fixed hyper-parameters for each model, keyed by a short model name.
# NOTE(review): the name suggests these came out of a grid search -- confirm
# where the values were tuned.
grid_para = dict(
    svm=dict(C=130.0, gamma=0.2, kernel='rbf'),
    rf=dict(max_depth=16, n_estimators=95, random_state=42),
    xgb=dict(max_depth=8, n_estimators=115, random_state=42),
)

In [3]:
# Columns used as model inputs, in the order they are fed to every model.
feature_col = [
    'week',
    'part1_patient_mean', 'part1_slope', 'part1_std',
    'policy1', 'Delta1', 'Omicron1', 'policy2',
]

# Standardize the features. The scaler statistics are learned from the
# training table only and then re-used unchanged for the test table.
scaler = StandardScaler()
scaler.fit(train[feature_col])

X_scaled = scaler.transform(train[feature_col])
X_test_scaled = scaler.transform(test[feature_col])

# Targets are taken from the 'label' column of each table.
y_train, y_test = train['label'], test['label']

In [4]:
grid_svm = grid_para['svm']

# SVM classifier built from the stored hyper-parameters.
# probability=True is required for the predict_proba call further down.
# random_state pins the data shuffling SVC performs while fitting those
# probability estimates; without it the probabilities written to
# data/svm_proba.csv differ from run to run. The seed falls back to 42, the
# same value the 'rf' and 'xgb' entries of grid_para use, unless the 'svm'
# entry provides its own 'random_state'.
svm_model = SVC(C=grid_svm['C'],
                gamma=grid_svm['gamma'],
                kernel=grid_svm['kernel'],
                probability=True,
                random_state=grid_svm.get('random_state', 42))
svm_model.fit(X_scaled, y_train)

# Hard class predictions on the (scaled) train and test features.
svm_pred_train = svm_model.predict(X_scaled)
svm_pred_test = svm_model.predict(X_test_scaled)

print('train acc : ', accuracy_score(y_train, svm_pred_train))
print('test acc : ', accuracy_score(y_test, svm_pred_test))
print(classification_report(y_test, svm_pred_test))

# 10-fold cross-validation on the training data only.
# NOTE(review): X_scaled was standardized with statistics computed from the
# whole training table, so each held-out fold has influenced the scaling;
# the scores may be slightly optimistic.
svm_scores = cross_val_score(svm_model,           # estimator (cloned and refit per fold)
                             X_scaled,            # training features
                             y_train,             # training labels
                             scoring='accuracy',  # evaluation metric
                             cv=10)               # number of folds

print('10-fold cross validation :', svm_scores)
print('cross validation mean :', np.mean(svm_scores))

# Class-probability estimates for the test set, written as a header-less CSV.
proba = svm_model.predict_proba(X_test_scaled)
np.savetxt('data/svm_proba.csv', proba, delimiter=",")

# Store the SVM predictions in the shared result tables.
ml_train_results['svm'] = svm_pred_train
ml_test_results['svm'] = svm_pred_test



train acc :  0.9835082458770614
test acc :  0.9370629370629371
              precision    recall  f1-score   support

         0.0       0.96      0.95      0.95        95
         1.0       0.89      0.95      0.92        95
         2.0       0.97      0.92      0.94        96

    accuracy                           0.94       286
   macro avg       0.94      0.94      0.94       286
weighted avg       0.94      0.94      0.94       286

10-fold cross validation : [0.94029851 1.         0.92537313 0.89552239 0.94029851 0.89552239
 0.88059701 0.89393939 0.90909091 0.83333333]
cross validation mean : 0.9113975576662143


In [5]:
grid_rf = grid_para['rf']

# Random forest configured from the stored hyper-parameters.
rf_model = RandomForestClassifier(
    random_state=grid_rf['random_state'],
    max_depth=grid_rf['max_depth'],
    n_estimators=grid_rf['n_estimators'],
)
rf_model.fit(X_scaled, y_train)

# Hard class predictions on the (scaled) train and test features.
rf_pred_train, rf_pred_test = (rf_model.predict(features)
                               for features in (X_scaled, X_test_scaled))

# Accuracy on both splits, plus the per-class report for the test split.
print('train acc : ', accuracy_score(y_train, rf_pred_train))
print('test acc : ', accuracy_score(y_test, rf_pred_test))
print(classification_report(y_test, rf_pred_test))

# 10-fold cross-validated accuracy, computed on the training data only
# (features = X_scaled, labels = y_train).
rf_scores = cross_val_score(rf_model, X_scaled, y_train,
                            scoring='accuracy', cv=10)

print('10-fold cross validation :', rf_scores)
print('cross validation mean :', rf_scores.mean())

# Class-probability estimates for the test set, written as a header-less CSV.
proba = rf_model.predict_proba(X_test_scaled)
np.savetxt('data/rf_proba.csv', proba, delimiter=",")

# Store the random-forest predictions in the shared result tables.
ml_train_results['rf'] = rf_pred_train
ml_test_results['rf'] = rf_pred_test


train acc :  1.0
test acc :  0.9475524475524476
              precision    recall  f1-score   support

         0.0       0.96      0.97      0.96        95
         1.0       0.91      0.94      0.92        95
         2.0       0.98      0.94      0.96        96

    accuracy                           0.95       286
   macro avg       0.95      0.95      0.95       286
weighted avg       0.95      0.95      0.95       286

10-fold cross validation : [0.95522388 0.97014925 0.97014925 0.94029851 0.92537313 0.91044776
 0.92537313 0.92424242 0.87878788 0.89393939]
cross validation mean : 0.9293984622342831


In [6]:
grid_xgb = grid_para['xgb']

# Gradient-boosted trees configured from the stored hyper-parameters.
xgb_model = XGBClassifier(
    random_state=grid_xgb['random_state'],
    max_depth=grid_xgb['max_depth'],
    n_estimators=grid_xgb['n_estimators'],
)
xgb_model.fit(X_scaled, y_train)

# Hard class predictions on the (scaled) train and test features.
xgb_pred_train, xgb_pred_test = (xgb_model.predict(features)
                                 for features in (X_scaled, X_test_scaled))

# Accuracy on both splits, plus the per-class report for the test split.
print('train acc : ', accuracy_score(y_train, xgb_pred_train))
print('test acc : ', accuracy_score(y_test, xgb_pred_test))
print(classification_report(y_test, xgb_pred_test))

# 10-fold cross-validated accuracy, computed on the training data only
# (features = X_scaled, labels = y_train).
xgb_scores = cross_val_score(xgb_model, X_scaled, y_train,
                             scoring='accuracy', cv=10)

print('10-fold cross validation :', xgb_scores)
print('cross validation mean :', xgb_scores.mean())

# Class-probability estimates for the test set, written as a header-less CSV.
proba = xgb_model.predict_proba(X_test_scaled)
np.savetxt('data/xgb_proba.csv', proba, delimiter=",")

# Store the XGBoost predictions in the shared result tables.
ml_train_results['xgb'] = xgb_pred_train
ml_test_results['xgb'] = xgb_pred_test


train acc :  1.0
test acc :  0.965034965034965
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.97        95
         1.0       0.94      0.96      0.95        95
         2.0       0.99      0.96      0.97        96

    accuracy                           0.97       286
   macro avg       0.97      0.97      0.97       286
weighted avg       0.97      0.97      0.97       286

10-fold cross validation : [0.92537313 1.         0.95522388 0.92537313 0.92537313 0.94029851
 0.94029851 0.96969697 0.90909091 0.92424242]
cross validation mean : 0.9414970601537765


In [7]:
# Persist the result tables (identifier, label and one prediction column per
# model) for the train and test splits.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; exist_ok=True keeps the cell re-runnable.
os.makedirs('result', exist_ok=True)
ml_train_results.to_csv('result/ml_train_results.csv')
ml_test_results.to_csv('result/ml_test_results.csv')

In [11]:
# Put the feature importances of the two fitted tree-based models side by
# side, one row per entry of feature_col, and save them as CSV.
# NOTE(review): this cell's execution count (11) does not follow the previous
# cell's (7); re-run with "Restart & Run All" to confirm it does not rely on
# state from cells that are no longer in the notebook.
feature_importance_dict = {'feature': feature_col,
                           'RF': rf_model.feature_importances_,
                           'XGB': xgb_model.feature_importances_}
feature_importance_df = pd.DataFrame(feature_importance_dict)

# The frame was originally bound to a misspelled name; keep that name as an
# alias so any code still referring to it continues to work.
featrue_importance_df = feature_importance_df

feature_importance_df.to_csv('data/feature_importance.csv')