In [102]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [159]:
df=sns.load_dataset('tips')

In [161]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [163]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [165]:
df_cat = df[['sex','smoker','day','time']]

In [167]:
for i in df_cat:
    count = df[i].value_counts()
    print(i ,':' ,count)

sex : sex
Male      157
Female     87
Name: count, dtype: int64
smoker : smoker
No     151
Yes     93
Name: count, dtype: int64
day : day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64
time : time
Dinner    176
Lunch      68
Name: count, dtype: int64


# Independent and dependent features

In [170]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [172]:
X=df[['tip', 'sex', 'smoker', 'day', 'time', 'size']]
y=df['total_bill']

## Split the data

In [175]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

## Label Encoding

In [178]:
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()

In [180]:
X_train['sex']=le1.fit_transform(X_train['sex'])
X_train['smoker']=le2.fit_transform(X_train['smoker'])
X_train['time']=le3.fit_transform(X_train['time'])

In [182]:
X_test['sex']=le1.transform(X_test['sex'])
X_test['smoker']=le2.transform(X_test['smoker'])
X_test['time']=le3.transform(X_test['time'])

In [184]:
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
228,2.72,1,0,Sat,0,2
208,2.03,1,1,Sat,0,2
96,4.0,1,1,Fri,0,2
167,4.5,1,0,Sun,0,4
84,2.03,1,0,Thur,1,2


In [186]:
X_test.head()

Unnamed: 0,tip,sex,smoker,day,time,size
24,3.18,1,0,Sat,0,2
6,2.0,1,0,Sun,0,2
153,2.0,1,0,Sun,0,4
211,5.16,1,1,Sat,0,4
198,2.0,0,1,Thur,1,2


## Standardization : end of notebook

# OneHotEncoding

In [146]:
ct=ColumnTransformer(transformers=[('onehot', OneHotEncoder(drop='first'),[3])] ,remainder='passthrough')
# 3 car la variable day que nous voulons encoder est à l'index de colonne 3
# remainders permet de laisser passer les autres colonnes sans les modifer

In [148]:
ct

In [150]:
np.set_printoptions(threshold=sys.maxsize)
ct.fit_transform(X_train)

array([[ 1.  ,  0.  ,  0.  ,  2.72,  1.  ,  0.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  ,  2.03,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 0.  ,  0.  ,  0.  ,  4.  ,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  ,  4.5 ,  1.  ,  0.  ,  0.  ,  4.  ],
       [ 0.  ,  0.  ,  1.  ,  2.03,  1.  ,  0.  ,  1.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  ,  3.51,  1.  ,  0.  ,  0.  ,  2.  ],
       [ 0.  ,  0.  ,  1.  ,  2.  ,  1.  ,  0.  ,  1.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  ,  5.15,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 1.  ,  0.  ,  0.  ,  1.75,  1.  ,  0.  ,  0.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  ,  2.74,  1.  ,  0.  ,  0.  ,  3.  ],
       [ 1.  ,  0.  ,  0.  ,  3.  ,  1.  ,  1.  ,  0.  ,  3.  ],
       [ 0.  ,  1.  ,  0.  ,  5.14,  0.  ,  0.  ,  0.  ,  5.  ],
       [ 0.  ,  1.  ,  0.  ,  3.5 ,  0.  ,  0.  ,  0.  ,  2.  ],
       [ 0.  ,  1.  ,  0.  ,  5.65,  1.  ,  1.  ,  0.  ,  2.  ],
       [ 0.  ,  0.  ,  0.  ,  2.5 ,  0.  ,  1.  ,  1.  ,  2.  ],
       [ 1.  ,  0.  ,  0.

In [152]:
X_train=ct.fit_transform(X_train)
X_test=ct.fit_transform(X_test)

## SVR

In [155]:
svr=SVR()
svr.fit(X_train,y_train)
y_pred=svr.predict(X_test)

In [157]:
print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.5502250005312896
4.413774883497757


In [47]:
y_test.mean()

18.067755102040817

## Hyperparameters Tuning using GridSearchCV

In [50]:
param_grid= {'C' : [0.1,1,10,100,1000],                # petit:tolérant, grand:moins tolérant
             'gamma' : [1,0.1,0.01,0.001,0.0001],      # ppetit: plus general, grand:plus precis, plus surajusté
             'kernel' : ['rbf']}

In [52]:
grid = GridSearchCV(SVR(), param_grid=param_grid, refit=True, verbose=3)
# refit : après avoir trouvé la meilleur combinaison, refait le modèle avec les hyperparamètres choisis et réentraine sur X_train
grid.fit(X_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.119 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.014 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.017 total time=   0.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.019 total time=   0.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.042 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.013 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.099 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.080 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.080 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.049 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.109 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [53]:
y_pred=grid.predict(X_test)

In [59]:
grid.best_params_

{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}

In [63]:
grid.best_score_

0.5218580414047438

In [65]:
print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.6200659646053837
4.129773285984329


In [67]:
y_test.mean()

18.067755102040817

# With standardization

In [188]:
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
228,2.72,1,0,Sat,0,2
208,2.03,1,1,Sat,0,2
96,4.0,1,1,Fri,0,2
167,4.5,1,0,Sun,0,4
84,2.03,1,0,Thur,1,2


In [190]:
ct = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [0]),          
        ('cat', OneHotEncoder(drop='first'), [3])  # ex: colonne catégorielle
    ],remainder='passthrough')


X_train=ct.fit_transform(X_train)
X_test=ct.transform(X_test)

In [194]:
X_train

array([[-2.58032900e-01,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  2.00000000e+00],
       [-7.42114421e-01,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  2.00000000e+00],
       [ 6.39973400e-01,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  2.00000000e+00],
       [ 9.90757111e-01,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  4.00000000e+00],
       [-7.42114421e-01,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  2.00000000e+00],
       [ 2.96205363e-01,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  2.00000000e+00],
       [-7.63161444e-01,  0.000000

In [196]:
ct.named_transformers_['cat'].get_feature_names_out()


array(['day_Sat', 'day_Sun', 'day_Thur'], dtype=object)

In [198]:
param_grid= {'C' : [0.1,1,10,100,1000],                # petit:tolérant, grand:moins tolérant
             'gamma' : [1,0.1,0.01,0.001,0.0001],      # ppetit: plus general, grand:plus precis, plus surajusté
             'kernel' : ['rbf']}

In [200]:
grid = GridSearchCV(SVR(), param_grid=param_grid, refit=True, verbose=3)
# refit : après avoir trouvé la meilleur combinaison, refait le modèle avec les hyperparamètres choisis et réentraine sur X_train
grid.fit(X_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.102 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.012 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.013 total time=   0.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.014 total time=   0.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.036 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.001 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.084 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.067 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.083 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.040 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.120 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=0.01, kernel=rbf;

In [202]:
y_pred=grid.predict(X_test)

In [204]:
grid.best_params_

{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}

In [206]:
grid.best_score_

0.517616539524385

In [208]:
print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.598887166463305
4.222239476711429
