In [171]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, RepeatedKFold
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import pickle
import warnings
warnings.filterwarnings('ignore')
import lime

### Read data

In [172]:
# Load the preprocessed dataset; the first row of the file supplies the column
# names and no column is used as the row index.
df = pd.read_csv(
    "../data/hepatotoxicity_ALT_KlekFP_preprocessing_and_analyst.csv",
    index_col=False,
    header=0,
)

In [173]:
# Feature column names: every column of df except the 'ALT' target.
columns = df.columns.tolist()
columns.remove('ALT')  # raises ValueError if there is no 'ALT' column

In [174]:
# Split the frame into a NumPy feature matrix (X) and the 'ALT' target vector (y).
X, y = df[columns].values, df['ALT'].values

In [175]:
# Sanity check: dump the target vector taken from the 'ALT' column.
# NOTE(review): the printed values look log-transformed - TODO confirm against
# the preprocessing step that produced the CSV.
print(y)

[3.51452607 3.64283552 1.36609165 1.42551507 3.80888225 3.78895101
 3.61091791 3.16968558 3.33576958 3.60821155 3.83514196 3.37073817
 3.71357207 4.39444915 5.61312811 3.36729583 3.67882912 5.51544285
 5.5683445  4.06044301 3.31054301 3.25809654 3.4657359  3.91202301
 3.49650756 3.36729583 3.8286414  3.55534806 3.55534806 3.78418963
 3.73766962 4.2341065  3.72810017 4.1293899  3.4339872  1.18478998
 0.92028275 1.0612565  4.21360798 3.5204608  4.11087386 3.49650756
 4.12713439 3.80666249 5.3981627  3.16547505 3.29583687 3.91202301
 3.47815842 3.71357207 3.04452244 3.25809654 4.17438727 3.91202301
 4.32941668 3.52636052 3.56671182 3.62966009 3.53805656 4.03777421
 3.65842025 3.81109709 3.11794991 3.62700405 3.58629287 3.4657359
 3.61091791 3.93182563 3.40119738 4.06044301 3.58351894 3.91202301
 4.17438727 3.75887183 3.78872479 3.69635147 3.25809654 3.46260601
 3.4339872  3.04452244 3.83514196 3.42426265 3.59731226 3.19047635
 4.17438727 3.40119738 2.89037176 3.91202301 3.04452244 4.17438

In [176]:
# Sanity check: dump the feature matrix built from all non-'ALT' columns
# (NumPy abbreviates large arrays with "..." when printing).
print(X)

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Declare models and hyperparameters

In [177]:
# Candidate estimators. The two lists are consumed pairwise with zip(), so
# params[i] is the hyperparameter grid searched for models[i].
models = [
    SGDRegressor(),
    SVR(),
    LinearRegression(),
    Lasso(),
]

params = [
    # SGDRegressor
    {
        'alpha': [0.0, 0.01, 0.001, 0.0001, 0.00001],
        'learning_rate': ['constant', 'optimal', 'invscaling'],
        'random_state': [12345, 123, 666, 123123, 777, 69],
    },
    # SVR
    {
        'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'degree': [3, 8],
        'coef0': [0.01, 10, 0.5],
        'gamma': ('auto', 'scale'),
    },
    # LinearRegression: empty grid, so only the default configuration is evaluated
    {},
    # Lasso
    # NOTE(review): scikit-learn documents Lasso's random_state as used only when
    # selection='random'; with the default selection these seeds presumably
    # repeat the same fit - confirm whether the seed sweep is intended.
    {
        'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
        'random_state': [12345, 123, 666, 123123, 777, 69],
    },
]

In [178]:
# Hyperparameter sweep: for every (estimator, grid) pair, run a grid search on
# the training part of each outer fold and write all inner-CV results for that
# estimator to one CSV file named after the estimator class.
#
# Names such as X_train / X_test / grid_search / cv_results are deliberately
# kept as notebook globals; after the loop they hold the values from the last
# outer fold of the last model.
best_results = []  # initialised here; nothing in this cell fills it
kfold = KFold(n_splits=5, random_state=2652124, shuffle=True)

# Inner CV splitter. It is configured identically for every search, so it is
# built once instead of once per outer fold.
repeatedKfold = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2652124)

for model, grid_params in zip(models, params):
    results = []  # reset per model; nothing in this cell fills it
    fold_frames = []  # one cv_results_ frame per outer fold

    for train_index, test_index in kfold.split(X):
        # NOTE(review): the outer test split is materialised but never scored
        # in this cell - confirm whether an outer evaluation step is missing.
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        grid_search = GridSearchCV(
            estimator=model,
            param_grid=grid_params,
            scoring='r2',
            cv=repeatedKfold,
        )
        grid_fit = grid_search.fit(X_train, y_train)  # fit() returns the search object itself
        cv_result = pd.DataFrame(grid_search.cv_results_)
        fold_frames.append(cv_result)

    # Concatenate once per model: DataFrame.append was removed in pandas 2.0
    # and re-copies the accumulated frame on every call.
    cv_results = pd.concat(fold_frames, ignore_index=True)

    # Use the class name for the file name; str(model) depends on the
    # estimator's repr, which can include parameters.
    cv_results.to_csv(
        "../data/all_models/" + type(model).__name__ + ".csv",
        encoding='utf-8',
        header=True,
    )

In [179]:
# Preview the first rows of the collected grid-search results.
# cv_results is rebound for every model in the search loop, so at this point it
# holds only the results of the last model in `models`.
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01845,0.007042,0.000599,0.000489,0.001,12345,"{'alpha': 0.001, 'random_state': 12345}",0.052402,-0.025968,0.899749,0.541398,-2.634841,0.073351,-0.429653,0.693136,0.092842,0.879649,0.014207,0.974642,25
1,0.018945,0.007786,0.000304,0.000464,0.001,123,"{'alpha': 0.001, 'random_state': 123}",0.052402,-0.025968,0.899749,0.541398,-2.634841,0.073351,-0.429653,0.693136,0.092842,0.879649,0.014207,0.974642,25
2,0.020097,0.008939,0.000399,0.000489,0.001,666,"{'alpha': 0.001, 'random_state': 666}",0.052402,-0.025968,0.899749,0.541398,-2.634841,0.073351,-0.429653,0.693136,0.092842,0.879649,0.014207,0.974642,25
3,0.018568,0.007762,0.000299,0.000457,0.001,123123,"{'alpha': 0.001, 'random_state': 123123}",0.052402,-0.025968,0.899749,0.541398,-2.634841,0.073351,-0.429653,0.693136,0.092842,0.879649,0.014207,0.974642,25
4,0.01855,0.007083,0.000299,0.000457,0.001,777,"{'alpha': 0.001, 'random_state': 777}",0.052402,-0.025968,0.899749,0.541398,-2.634841,0.073351,-0.429653,0.693136,0.092842,0.879649,0.014207,0.974642,25
