# Stochastic Gradient Descent

### Data Loading

In [1]:
import pandas as pd
import numpy as np

trn_imputed_dum_X = pd.read_csv('./Dataset/trn_imputed_dum_X.csv')
tst_imputed_dum_X = pd.read_csv('./Dataset/tst_imputed_dum_X.csv')
trn_origin_Y      = pd.read_csv('./Dataset/trn_origin_Y.csv')
tst_origin_X      = pd.read_csv('./Dataset/tst_origin_X.csv')

print(trn_imputed_dum_X.shape)
print(trn_origin_Y.shape)

(79853, 17)
(79853, 1)


### 1) Hyperparameter optimization

In [4]:
#from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

param = {'alpha':[0.00001, 0.0001, 0.0005, 0.001, 0.005, 0.01]}

def clf_SGD(trn_X, trn_Y, svc_param):
    clf_SVC = linear_model.SGDClassifier()
    clf = GridSearchCV(clf_SVC, param, n_jobs=6, cv=5, scoring='roc_auc')
    clf.fit(trn_X, trn_Y)
    return clf  

### 2) Training Model

In [6]:
clf_model = clf_SGD(trn_imputed_dum_X, trn_origin_Y.values.ravel(), param)





In [8]:
clf_model.grid_scores_



[mean: 0.49399, std: 0.00376, params: {'alpha': 1e-05},
 mean: 0.49399, std: 0.00376, params: {'alpha': 0.0001},
 mean: 0.49400, std: 0.00376, params: {'alpha': 0.0005},
 mean: 0.49401, std: 0.00373, params: {'alpha': 0.001},
 mean: 0.49401, std: 0.00374, params: {'alpha': 0.005},
 mean: 0.49422, std: 0.00393, params: {'alpha': 0.01}]


### 3) DO PREDICTION

In [6]:
clf_yhat = clf_model.best_estimator_.predict(tst_imputed_dum_X)
clf_prob = clf_model.best_estimator_.predict_proba(tst_imputed_dum_X)
clf_prob_renewal = clf_prob[:,1]
display(clf_prob_renewal)

AttributeError: predict_proba is not available when  probability=False

### 4) Export to output file

In [None]:
tst_output = pd.DataFrame(tst_origin_X[['id', 'premium']])

clf_prob_df = pd.DataFrame(clf_prob_renewal.reshape(len(clf_prob_renewal),1), columns=['renewal'])
pd_incentives = pd.DataFrame(np.zeros((len(clf_prob_renewal), 1)), columns=['incentives'], dtype=np.float32)
pd_improve = pd.DataFrame(np.zeros((len(clf_prob_renewal), 1)), columns=['improvement'], dtype=np.float32)

tst_output = pd.concat([tst_out, clf_prob_df, pd_improve, pd_incentives], axis=1)
#display(tst_output)

tst_output.to_csv('./Dataset/tst_renewal_predicted_SVM.csv', index=False)