In [118]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, RepeatedKFold
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import pickle
import warnings
# Silence every warning for the rest of the session: no category, module or
# message filter is given, so the 'ignore' action applies to all of them.
warnings.filterwarnings('ignore')

In [119]:
# Read the dataset from CSV. The first row supplies the column names and
# index_col=False keeps pandas from using any column as the row index.
df = pd.read_csv(
    "../data/hepatotoxicity_ALT_KlekFP_preprocessing_and_analyst.csv",
    header=0,
    index_col=False,
)

In [120]:
# Feature column names: every column of df except 'ALT'.
# list.remove raises ValueError if 'ALT' is missing from the frame.
columns = df.columns.tolist()
columns.remove('ALT')

In [121]:
# Convert to NumPy arrays: y holds the 'ALT' column, X holds all the columns
# listed in `columns` (everything except 'ALT').
y = df.loc[:, 'ALT'].values
X = df.loc[:, columns].values

In [122]:
# Show the target array (values of the 'ALT' column).
print(y)

[3.48431229 3.61630876 1.07158362 1.15057203 3.78645978 3.76607195
 3.58351894 3.12676054 3.29953373 3.5807373  3.81330703 3.33576958
 3.68887945 4.38202663 5.6094718  3.33220451 3.65325228 5.51141058
 5.56452041 4.04305127 3.27336401 3.21887582 3.4339872  3.8918203
 3.4657359  3.33220451 3.80666249 3.52636052 3.52636052 3.76120012
 3.71357207 4.21950771 3.70376807 4.11316632 3.40119738 0.81977983
 0.41210965 0.63657683 4.19870458 3.49042852 4.09434456 3.4657359
 4.11087386 3.78418963 5.39362755 3.12236492 3.25809654 3.8918203
 3.44680789 3.68887945 2.99573227 3.21887582 4.15888308 3.8918203
 4.31615389 3.49650756 3.53805656 3.60277676 3.5085559  4.01998015
 3.6323091  3.78872479 3.07269331 3.60004824 3.55820113 3.4339872
 3.58351894 3.91202301 3.36729583 4.04305127 3.55534806 3.8918203
 4.15888308 3.73528583 3.7658405  3.67122452 3.21887582 3.43075618
 3.40119738 2.99573227 3.81330703 3.39114705 3.5695327  3.14845336
 4.15888308 3.36729583 2.83321334 3.8918203  2.99573227 4.15888308
 

In [123]:
# Show the feature matrix (all columns except 'ALT').
print(X)

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [124]:
# Candidate regressors, each paired with the hyper-parameter grid searched for it.
# LinearRegression gets an empty grid, i.e. nothing is tuned for it.
# NOTE(review): the recorded cv_results output shows identical Lasso scores for
# different random_state values, so that grid axis may be redundant — confirm.
search_space = [
    (
        SGDRegressor(),
        {
            'alpha': [0.001, 0.0001, 0.00001],
            'learning_rate': ['constant', 'optimal', 'invscaling'],
            'random_state': [12345, 123, 666],
        },
    ),
    (
        SVR(),
        {
            'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
            'C': [0.001, 0.01, 0.1, 1, 5, 10],
            'degree': [3, 8],
            'coef0': [0.01, 10, 0.5],
            'gamma': ('auto', 'scale'),
        },
    ),
    (
        LinearRegression(),
        {},
    ),
    (
        Lasso(),
        {
            'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
            'random_state': [12345, 123, 666],
        },
    ),
]

# Split the pairs into the two parallel lists that are later zipped together.
models = [estimator for estimator, _ in search_space]
params = [grid for _, grid in search_space]

In [125]:
# Nested cross-validation over the candidate models.
#   * Outer loop: 5 shuffled folds. The held-out part of each fold is used only
#     to score the model that won the inner search on that fold.
#   * Inner loop: for every (model, grid) pair, GridSearchCV tunes the grid on
#     the outer-training part using 5-fold CV repeated twice, scored by R2.
# Produces:
#   best_results -- one dict per outer fold: the winning fitted estimator, its
#                   parameters, and MSE / MAE / R2 on the held-out part.
#   cv_results   -- all GridSearchCV.cv_results_ tables stacked into a single
#                   DataFrame, with the model's string form in column "Model".
SEED = 2652124

kfold = KFold(n_splits=5, random_state=SEED, shuffle=True)
# With an integer random_state every split() call regenerates the same folds,
# so one splitter can be shared by all grid searches instead of being rebuilt.
repeatedKfold = RepeatedKFold(n_splits=5, n_repeats=2, random_state=SEED)

best_results = []
cv_frames = []  # per-search result tables; concatenated once after the loops

for train_index, test_index in kfold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    results = []
    for model, grid_params in zip(models, params):
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=grid_params,
            scoring='r2',
            cv=repeatedKfold,
            refit=True,  # best_estimator_ is refit on all of X_train
        )
        grid_search.fit(X_train, y_train)
        # Keep the real (un-negated) mean inner-CV R2 so the column name is truthful.
        results.append(
            (grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_)
        )

        cv_result = pd.DataFrame(grid_search.cv_results_)
        cv_result["Model"] = str(model)
        cv_frames.append(cv_result)

    # The model with the highest mean inner-CV R2 wins this outer fold.
    res = pd.DataFrame(results, columns=['Model', 'Params', 'r2'])
    res = res.sort_values(by='r2', ascending=False).reset_index(drop=True)
    best = res.iloc[0]

    # No extra fit needed: refit=True already trained the winner on X_train.
    reg = best['Model']
    Y_pred = reg.predict(X_test)

    mse = mean_squared_error(y_test, Y_pred)
    r2 = r2_score(y_test, Y_pred)
    mae = mean_absolute_error(y_test, Y_pred)

    best_results.append({'Model': reg, 'Params': str(best['Params']), 'MSE': mse, 'MAE': mae, 'R2': r2})

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call; build the combined table with a single concat instead.
cv_results = pd.concat(cv_frames, ignore_index=True) if cv_frames else pd.DataFrame()

In [126]:
# Display the combined grid-search results collected from every outer fold and model.
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_learning_rate,param_random_state,params,split0_test_score,split1_test_score,...,split9_test_score,mean_test_score,std_test_score,rank_test_score,Model,param_C,param_coef0,param_degree,param_gamma,param_kernel
0,0.001495,0.000501,0.000306,0.000468,0.001,constant,12345,"{'alpha': 0.001, 'learning_rate': 'constant', ...",-1.322940e-01,-5.376607e+00,...,8.819878e-01,-8.908223e-01,1.913230e+00,16,SGDRegressor(),,,,,
1,0.001599,0.000499,0.000203,0.000407,0.001,constant,123,"{'alpha': 0.001, 'learning_rate': 'constant', ...",-1.318519e-01,-4.963241e+00,...,8.250044e-01,-6.628415e-01,1.681750e+00,1,SGDRegressor(),,,,,
2,0.000850,0.000780,0.000301,0.000460,0.001,constant,666,"{'alpha': 0.001, 'learning_rate': 'constant', ...",-7.755095e-02,-5.309698e+00,...,8.944490e-01,-7.881012e-01,1.879138e+00,13,SGDRegressor(),,,,,
3,0.003131,0.006262,0.000000,0.000000,0.001,optimal,12345,"{'alpha': 0.001, 'learning_rate': 'optimal', '...",-3.343598e+26,-6.226876e+26,...,-1.290745e+26,-4.224252e+26,4.047689e+26,20,SGDRegressor(),,,,,
4,0.003128,0.006255,0.000000,0.000000,0.001,optimal,123,"{'alpha': 0.001, 'learning_rate': 'optimal', '...",-1.960848e+26,-7.203636e+26,...,-6.296035e+26,-3.891503e+26,3.587583e+26,19,SGDRegressor(),,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1680,0.001599,0.000492,0.000400,0.000490,0.5,,123,"{'alpha': 0.5, 'random_state': 123}",-1.414056e-01,-6.923646e-02,...,-6.324794e-02,-7.971798e-02,4.815597e-02,16,Lasso(),,,,,
1681,0.001637,0.000531,0.000564,0.000472,0.5,,666,"{'alpha': 0.5, 'random_state': 666}",-1.414056e-01,-6.923646e-02,...,-6.324794e-02,-7.971798e-02,4.815597e-02,16,Lasso(),,,,,
1682,0.001700,0.000641,0.000500,0.000500,1,,12345,"{'alpha': 1, 'random_state': 12345}",-1.414056e-01,-6.923646e-02,...,-6.324794e-02,-7.971798e-02,4.815597e-02,16,Lasso(),,,,,
1683,0.001500,0.000499,0.000600,0.000490,1,,123,"{'alpha': 1, 'random_state': 123}",-1.414056e-01,-6.923646e-02,...,-6.324794e-02,-7.971798e-02,4.815597e-02,16,Lasso(),,,,,


In [127]:
# Rank the per-fold winners by their held-out R2, best first, and renumber rows from 0.
best_model = (
    pd.DataFrame(best_results)
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
)
best_model

Unnamed: 0,Model,Params,MSE,MAE,R2
0,"Lasso(alpha=0.01, random_state=12345)","{'alpha': 0.01, 'random_state': 12345}",0.112513,0.283451,0.867253
1,"SVR(C=10, coef0=0.01, gamma='auto', kernel='li...","{'C': 10, 'coef0': 0.01, 'degree': 3, 'gamma':...",0.160354,0.299413,0.622838
2,"SVR(C=10, coef0=0.01, gamma='auto')","{'C': 10, 'coef0': 0.01, 'degree': 3, 'gamma':...",0.19223,0.276311,0.572242
3,"SVR(C=0.1, coef0=0.01, gamma='auto', kernel='l...","{'C': 0.1, 'coef0': 0.01, 'degree': 3, 'gamma'...",0.331422,0.370356,0.538584
4,"Lasso(alpha=0.01, random_state=12345)","{'alpha': 0.01, 'random_state': 12345}",0.473591,0.399823,0.335749


In [128]:
# Take the estimator from the top-ranked row of best_model and serialise it
# to disk with the highest pickle protocol available.
model = best_model.iloc[0]['Model']
with open('../data/best_model.pickle', 'wb') as out_file:
    pickle.dump(model, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [129]:
# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)

# print(a == b)