In [25]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("../data/hepatotoxicity_ALT_KlekFP_preprocessing_and_analyst.csv", header=0, index_col=False)

In [3]:
columns = list(df.columns.values)
columns.remove('ALT')

In [4]:
X = df[columns].values
y = df['ALT'].values

In [9]:
print(y)

[3.48431229 3.61630876 1.07158362 1.15057203 3.78645978 3.76607195
 3.58351894 3.12676054 3.29953373 3.5807373  3.81330703 3.33576958
 3.68887945 4.38202663 5.6094718  3.33220451 3.65325228 5.51141058
 5.56452041 4.04305127 3.27336401 3.21887582 3.4339872  3.8918203
 3.4657359  3.33220451 3.80666249 3.52636052 3.52636052 3.76120012
 3.71357207 4.21950771 3.70376807 4.11316632 3.40119738 0.81977983
 0.41210965 0.63657683 4.19870458 3.49042852 4.09434456 3.4657359
 4.11087386 3.78418963 5.39362755 3.12236492 3.25809654 3.8918203
 3.44680789 3.68887945 2.99573227 3.21887582 4.15888308 3.8918203
 4.31615389 3.49650756 3.53805656 3.60277676 3.5085559  4.01998015
 3.6323091  3.78872479 3.07269331 3.60004824 3.55820113 3.4339872
 3.58351894 3.91202301 3.36729583 4.04305127 3.55534806 3.8918203
 4.15888308 3.73528583 3.7658405  3.67122452 3.21887582 3.43075618
 3.40119738 2.99573227 3.81330703 3.39114705 3.5695327  3.14845336
 4.15888308 3.36729583 2.83321334 3.8918203  2.99573227 4.15888308
 

In [11]:
print(X)

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

print(X_train.shape)
print(X_test.shape)

(78, 398)
(20, 398)


In [17]:
models=[SGDRegressor(), SVR(), LinearRegression(), Lasso()]
params=[
    {'alpha' : [0.001, 0.0001, 0.00001],'learning_rate' : ['constant','optimal','invscaling']},
    {'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),'C' : [0.001,0.01,0.1,1,5,10],'degree' : [3,8],'coef0' : [0.01,10,0.5],'gamma' : ('auto','scale')},
    {},
    {'alpha':[0.001, 0.005, 0.01,0.05,0.1,0.5,1]}
]

In [21]:
results=[]
for model, grid_params in zip(models, params):
    kfold = KFold(n_splits=5, random_state=123, shuffle=True)
    grid_search=GridSearchCV(estimator=model,param_grid=grid_params,scoring='neg_mean_squared_error',cv=kfold)
    grid_fit=grid_search.fit(X_train, y_train)
    results.append((grid_fit.best_estimator_, grid_fit.best_params_, grid_fit.best_score_*(-1)))

In [22]:
pd.DataFrame(results, columns=['Model', 'Params','MSE'])

Unnamed: 0,Model,Params,MSE
0,SGDRegressor(learning_rate='constant'),"{'alpha': 0.0001, 'learning_rate': 'constant'}",0.3589854
1,"SVR(C=10, coef0=0.01, gamma='auto', kernel='li...","{'C': 10, 'coef0': 0.01, 'degree': 3, 'gamma':...",0.1683632
2,LinearRegression(),{},3.817983e+24
3,Lasso(alpha=0.01),{'alpha': 0.01},0.2361231


In [23]:
best_params=results[1][1]
print(best_params)

{'C': 10, 'coef0': 0.01, 'degree': 3, 'gamma': 'auto', 'kernel': 'linear'}


In [26]:
model=SVR(C=10, coef0=10, degree=3, gamma='auto', kernel='poly')
model.fit(X_train, y_train)

Y_pred=model.predict(X_test)

mse=mean_squared_error(y_test, Y_pred)
r2=r2_score(y_test, Y_pred)
mae=mean_absolute_error(y_test, Y_pred)

In [27]:
print("MSE=", mse)
print("MAE=", mae)
print("R2=", r2)

MSE= 0.26596697035486666
MAE= 0.42011224525750157
R2= -1.085502284752776
