In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV,KFold,cross_validate
from sklearn.svm import SVR

In [2]:
# Load the steel dataset from its CSV file and preview the first rows.
steel_csv_path = "Datasets/steel.csv"
data = pd.read_csv(steel_csv_path)
data.head()

Unnamed: 0,normalising_temperature,tempering_temperature,percent_silicon,percent_chromium,percent_copper,percent_nickel,percent_sulphur,percent_carbon,percent_manganese,tensile_strength
0,178.5,275,0.153,0.970575,0.942,0.887,0.0,1.92,0.0,25.107613
1,178.5,950,0.153,1.212726,0.942,0.887,0.0,1.92,0.0,140.035334
2,178.5,375,0.153,1.621165,0.942,0.887,0.0,1.92,0.0,42.21765
3,178.5,900,0.153,0.809989,0.942,0.887,0.0,1.92,0.0,95.015309
4,189.525,900,0.1624,1.036229,0.849,0.9382,0.0,2.035,0.0,113.266773


In [3]:
# Count missing values per column (isna is the pandas alias of isnull).
missing_per_column = data.isna().sum()
print(missing_per_column)

normalising_temperature    0
tempering_temperature      0
percent_silicon            0
percent_chromium           0
percent_copper             0
percent_nickel             0
percent_sulphur            0
percent_carbon             0
percent_manganese          0
tensile_strength           0
dtype: int64


In [4]:
# Separate the regression target (tensile_strength) from the feature columns.
y = data["tensile_strength"]
X = data.drop(columns="tensile_strength")

# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)
print(X_train.shape, X_test.shape, sep="\n")

(387, 9)
(166, 9)


In [5]:
# Baseline: 10-fold cross-validation of a default SVR on the training split,
# collecting MSE and R2 on both the training folds and the validation folds.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
model_SVR = SVR()

baseline_scoring = {"MSE": "neg_mean_squared_error", "R2": "r2"}
scores = cross_validate(
    model_SVR,
    X_train,
    y_train,
    scoring=baseline_scoring,
    cv=cv,
    return_train_score=True,
    n_jobs=-1,
)

# The "neg_mean_squared_error" scorer reports MSE with its sign flipped;
# negate both MSE columns so they read as ordinary positive errors.
results_baseline_df = pd.DataFrame(scores).assign(
    test_MSE=lambda frame: -frame["test_MSE"],
    train_MSE=lambda frame: -frame["train_MSE"],
)
print(results_baseline_df)

   fit_time  score_time     test_MSE    train_MSE   test_R2  train_R2
0  0.009431    0.004181  8299.988444  6348.190176  0.155353  0.215963
1  0.009836    0.004414  4481.775634  6747.278800  0.229852  0.204478
2  0.009992    0.004240  7059.244324  6627.640573  0.168050  0.196681
3  0.008291    0.002377  5070.908021  6657.391145  0.235842  0.204489
4  0.009112    0.004342  7675.463869  6470.763901 -0.009921  0.218371
5  0.010113    0.004742  9405.391117  6282.625856  0.110796  0.212929
6  0.010157    0.004340  5235.847279  6611.484199  0.208201  0.218977
7  0.009839    0.004234  7289.714899  6527.615507  0.220840  0.200349
8  0.004868    0.002250  6414.886586  6632.530208  0.178957  0.202734
9  0.008001    0.004375  5480.944480  6666.197671  0.250636  0.203752


In [None]:
# Hyperparameter tuning
# Parameter grid: every kernel/epsilon combination is evaluated (2 x 4 = 8 candidates).
param_grid = {
    "kernel": ["linear", "sigmoid"],
    "epsilon": [0.01, 0.1, 0.5, 1.0],
}

# NOTE(review): the features reach SVR unscaled. The scikit-learn SVM guide
# recommends scaling because SVMs are not scale invariant; consider wrapping
# the estimator in a StandardScaler pipeline -- TODO confirm with the author.

# 10-fold cross-validation, reusing the KFold splitter and the two metrics of the
# baseline cell. With multi-metric scoring, `refit` names the metric that selects
# (and refits) the best candidate -- here R2.
gs_cv = GridSearchCV(
    estimator=model_SVR,
    param_grid=param_grid,
    scoring={"MSE": "neg_mean_squared_error", "R2": "r2"},
    cv=cv,
    refit="R2",
    return_train_score=True,
    n_jobs=-1,  # run candidates/folds in parallel, as the baseline cross_validate call does
)

gs_cv.fit(X_train, y_train)
results_df = pd.DataFrame(gs_cv.cv_results_)

# The "neg_mean_squared_error" scorer reports MSE negated. Flip every mean and
# per-split MSE column (train and test) so the frame is consistently positive;
# the std_* and rank_* columns are left untouched because they are not negated values.
mse_columns = [
    col
    for col in results_df.columns
    if col.endswith("_MSE") and col.startswith(("mean_", "split"))
]
results_df[mse_columns] = -results_df[mse_columns]

print("Best parameters:", gs_cv.best_params_)
# best_score_ is the mean cross-validated R2 of the best candidate (refit="R2").
print("Best score:", gs_cv.best_score_)
# Show train scores next to validation scores, mirroring the baseline table.
print(
    results_df[
        [
            "param_kernel",
            "param_epsilon",
            "mean_train_MSE",
            "mean_test_MSE",
            "mean_train_R2",
            "mean_test_R2",
        ]
    ]
)