<a href="https://colab.research.google.com/github/pentakll4002/supportVector/blob/main/Support_Vector_Regression_Implementation_Using_(RandomizedResearchCV).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [29]:
## Dataset (Tips Dataset)
df = sns.load_dataset('tips')

In [30]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [32]:
df['sex'].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
Male,157
Female,87


In [33]:
df['smoker'].value_counts()

Unnamed: 0_level_0,count
smoker,Unnamed: 1_level_1
No,151
Yes,93


In [34]:
df['day'].value_counts()

Unnamed: 0_level_0,count
day,Unnamed: 1_level_1
Sat,87
Sun,76
Thur,62
Fri,19


In [35]:
df['time'].value_counts()

Unnamed: 0_level_0,count
time,Unnamed: 1_level_1
Dinner,176
Lunch,68


## Feature Encoding (Label Encoding And OneHotEncoding)

In [36]:
## Independent and Dependent Features
X = df.iloc[:, 1:]
y = df['total_bill']

In [37]:
X

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.50,Male,No,Sun,Dinner,3
3,3.31,Male,No,Sun,Dinner,2
4,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,5.92,Male,No,Sat,Dinner,3
240,2.00,Female,Yes,Sat,Dinner,2
241,2.00,Male,Yes,Sat,Dinner,2
242,1.75,Male,No,Sat,Dinner,2


In [38]:
y

Unnamed: 0,total_bill
0,16.99
1,10.34
2,21.01
3,23.68
4,24.59
...,...
239,29.03
240,27.18
241,22.67
242,17.82


In [39]:
### Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

In [40]:
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
58,1.76,Male,Yes,Sat,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.5,Male,No,Sun,Dinner,3
68,2.01,Male,No,Sat,Dinner,2
184,3.0,Male,Yes,Sun,Dinner,2


In [41]:
y_train

Unnamed: 0,total_bill
58,11.24
1,10.34
2,21.01
68,20.23
184,40.55
...,...
64,17.59
15,21.58
228,13.28
125,29.80


In [42]:
### Feature Encoding (Label Encoding and OneHot Encoding)
# Classification (2) => Label Encoding
# Classification (n) with n > 2 =>> OneHot Encoding

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()

In [43]:
import warnings
warnings.filterwarnings('ignore')

le1.fit(pd.concat([X_train['sex'], X_test['sex']]))
le2.fit(pd.concat([X_train['smoker'], X_test['smoker']]))
le3.fit(pd.concat([X_train['time'], X_test['time']]))


X_train['sex'] = le1.fit_transform(X_train['sex'])
X_train['smoker'] = le2.fit_transform(X_train['smoker'])
X_train['time'] = le3.fit_transform(X_train['time'])

In [44]:
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
58,1.76,1,1,Sat,0,2
1,1.66,1,0,Sun,0,3
2,3.5,1,0,Sun,0,3
68,2.01,1,0,Sat,0,2
184,3.0,1,1,Sun,0,2


In [45]:
X_test['sex'] = le1.transform(X_test['sex'])
X_test['smoker'] = le2.transform(X_test['smoker'])
X_test['time'] = le3.transform(X_test['time'])

In [46]:
X_test.head()

Unnamed: 0,tip,sex,smoker,day,time,size
162,2.0,0,0,Sun,0,3
60,3.21,1,1,Sat,0,2
61,2.0,1,1,Sat,0,2
63,3.76,1,1,Sat,0,4
69,2.09,1,1,Sat,0,2


In [47]:
## OneHot Encoding -- Columns transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

### 3 is index to dataframe ('day')
ct = ColumnTransformer(transformers=[('onehot', OneHotEncoder(drop='first'), [3]),
                                     ], remainder='passthrough')

In [48]:
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)

X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [49]:
X_test

array([[0.  , 1.  , 0.  , 2.  , 0.  , 0.  , 0.  , 3.  ],
       [1.  , 0.  , 0.  , 3.21, 1.  , 1.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 2.  , 1.  , 1.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 3.76, 1.  , 1.  , 0.  , 4.  ],
       [1.  , 0.  , 0.  , 2.09, 1.  , 1.  , 0.  , 2.  ],
       [0.  , 0.  , 1.  , 5.  , 1.  , 1.  , 1.  , 2.  ],
       [0.  , 1.  , 0.  , 3.51, 1.  , 0.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 5.16, 1.  , 1.  , 0.  , 4.  ],
       [0.  , 1.  , 0.  , 5.  , 1.  , 0.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 3.6 , 1.  , 0.  , 0.  , 3.  ],
       [0.  , 1.  , 0.  , 5.65, 1.  , 1.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 2.5 , 0.  , 1.  , 0.  , 3.  ],
       [0.  , 0.  , 1.  , 1.44, 1.  , 0.  , 1.  , 2.  ],
       [1.  , 0.  , 0.  , 3.09, 0.  , 1.  , 0.  , 4.  ],
       [0.  , 1.  , 0.  , 2.  , 1.  , 0.  , 0.  , 4.  ],
       [0.  , 0.  , 1.  , 1.36, 0.  , 0.  , 1.  , 3.  ],
       [0.  , 0.  , 1.  , 2.  , 0.  , 0.  , 1.  , 2.  ],
       [0.  , 0.  , 1.  , 1.68,

In [50]:
## SVR -- Support Vector Regression
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train, y_train)

In [51]:
y_pred = svr.predict(X_test)

In [52]:
from sklearn.metrics import r2_score, mean_absolute_error

print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

0.46028114561159283
4.1486423210190235


# Hyperparameters Turning for SVR by using GridSearchCV

In [53]:
from sklearn.model_selection import RandomizedSearchCV

# defining parameter range
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

In [54]:
random = RandomizedSearchCV(estimator=svr, param_distributions=param_grid, cv=5, verbose=3, n_jobs=-1)

### Fitting the model for grid search
random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [55]:
random.best_params_

{'kernel': 'rbf', 'gamma': 0.001, 'C': 100}

In [56]:
random.best_score_

0.491114987695256

In [57]:
from sklearn.metrics import r2_score, mean_absolute_error

print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

0.46028114561159283
4.1486423210190235
