In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load seaborn's 'tips' example dataset and preview the first three rows.
df = sns.load_dataset('tips')
df.head(n=3)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [3]:
# Regression target is the bill amount; every remaining column is a feature.
y = df['total_bill']
x = df.drop(columns=['total_bill'])

## Feature encoding

In [4]:
# Count missing values per feature column.
x.isna().sum()

tip       0
sex       0
smoker    0
day       0
time      0
size      0
dtype: int64

In [5]:
# Inspect the column dtypes to see which features still need encoding.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   tip     244 non-null    float64 
 1   sex     244 non-null    category
 2   smoker  244 non-null    category
 3   day     244 non-null    category
 4   time    244 non-null    category
 5   size    244 non-null    int64   
dtypes: category(4), float64(1), int64(1)
memory usage: 5.5 KB


In [6]:
from sklearn.preprocessing import LabelEncoder

# Shared encoder instance, used below on the two-level categorical columns.
# NOTE(review): scikit-learn documents LabelEncoder as a transformer for the
# target y; OrdinalEncoder/OneHotEncoder are the documented choices for
# feature columns - consider switching.
labelenc = LabelEncoder()


In [7]:
# Show the distinct levels of each categorical feature before encoding.
for column in ('sex', 'smoker', 'time', 'day'):
    print(x[column].unique())

['Female', 'Male']
Categories (2, object): ['Male', 'Female']
['No', 'Yes']
Categories (2, object): ['Yes', 'No']
['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']
['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']


In [8]:
# Replace each two-level categorical column with integer codes.
# The same encoder object is refit for every column, so after this loop
# `labelenc` only holds the mapping learned from the last column ('time').
for column in ('sex', 'smoker', 'time'):
    x[column] = labelenc.fit_transform(x[column])

In [9]:
# Preview the frame after label encoding; 'day' has not been encoded yet.
x.head()

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,0,0,Sun,0,2
1,1.66,1,0,Sun,0,3
2,3.5,1,0,Sun,0,3
3,3.31,1,0,Sun,0,2
4,3.61,0,0,Sun,0,4


In [10]:
# Encode the weekday as an ordinal code: Thur=0, Fri=1, Sat=2, Sun=3.
# The labels are read from the untouched `df` (same index as `x`) instead of
# from `x['day']` itself, so this cell is idempotent: re-running it after
# `x['day']` has already been converted to codes no longer maps integers
# through a string-keyed dict, which would turn the whole column into NaN.
x['day']=df['day'].map({'Thur':0,'Fri':1,'Sat':2,'Sun':3})

In [11]:
# Preview the first three rows now that 'day' holds ordinal codes.
x.head(3)

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,0,0,3,0,2
1,1.66,1,0,3,0,3
2,3.5,1,0,3,0,3


In [12]:
# Re-check dtypes after mapping 'day' to see whether any column is still non-numeric.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   tip     244 non-null    float64 
 1   sex     244 non-null    int32   
 2   smoker  244 non-null    int32   
 3   day     244 non-null    category
 4   time    244 non-null    int32   
 5   size    244 non-null    int64   
dtypes: category(1), float64(1), int32(3), int64(1)
memory usage: 7.2 KB


In [13]:
# The mapped 'day' column is still reported as `category` by info();
# cast it to a plain integer dtype so every feature column is numeric.
x['day']=x['day'].astype(int)

In [14]:
# Confirm that every feature column now has a numeric dtype.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tip     244 non-null    float64
 1   sex     244 non-null    int32  
 2   smoker  244 non-null    int32  
 3   day     244 non-null    int32  
 4   time    244 non-null    int32  
 5   size    244 non-null    int64  
dtypes: float64(1), int32(4), int64(1)
memory usage: 7.8 KB


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Baseline: support vector regressor with default hyperparameters.
# NOTE(review): the features are passed in unscaled. The scikit-learn SVM
# guide recommends standardising inputs because SVMs are not scale invariant;
# consider a Pipeline with StandardScaler (this would change the scores below).
svr = SVR().fit(x_train, y_train)
y_pred = svr.predict(x_test)



In [16]:
# Performance metrics for the baseline SVR on the held-out test set
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

Mean Squared Error: 39.5922777724737
R^2 Score: 0.5330468488705358


## Hyperparameter tuning


In [23]:
# import grid search for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
param_grid = {
    'kernel': ['rbf'],
    'C': [0.1, 1, 10],
    'gamma':[1, 0.1, 0.01]}
grid = GridSearchCV(SVR(), param_grid, cv=3, refit=True, verbose=3)

In [24]:
# Run the cross-validated search on the training split only
# (9 candidates x 3 folds = 27 fits, as logged below).
grid.fit(x_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.038 total time=   0.0s
[CV 2/3] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.011 total time=   0.0s
[CV 3/3] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.101 total time=   0.0s
[CV 1/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.039 total time=   0.0s
[CV 2/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.063 total time=   0.0s
[CV 3/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.004 total time=   0.0s
[CV 1/3] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.035 total time=   0.0s
[CV 2/3] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.005 total time=   0.0s
[CV 3/3] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.077 total time=   0.0s
[CV 1/3] END ..........C=1, gamma=1, kernel=rbf;, score=0.174 total time=   0.0s
[CV 2/3] END ..........C=1, gamma=1, kernel=rbf;, score=0.171 total time=   0.0s
[CV 3/3] END ..........C=1, gamma=1, kernel=rbf;,

In [25]:
# Hyperparameter combination selected by the grid search.
grid.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

In [26]:
# Predict on the held-out test set with the searched model
# (the grid was built with refit=True, so predict() is available).
grid_pred=grid.predict(x_test)


In [28]:
# Performance metrics for the grid-searched SVR on the held-out test set
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_true=y_test, y_pred=grid_pred)
r2 = r2_score(y_true=y_test, y_pred=grid_pred)
print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

Mean Squared Error: 36.111499816417
R^2 Score: 0.5740993047131414
