In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = sns.load_dataset('tips')

In [4]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [6]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [7]:
# time is target varibale .. when he/she is visiting

In [8]:
df.time.unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [9]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [10]:
df.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [11]:
# sex           category converting it to integert using encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder

In [12]:
df['time'] = encoder.fit_transform(df['time'])

In [13]:
df['time']

0      0
1      0
2      0
3      0
4      0
      ..
239    0
240    0
241    0
242    0
243    0
Name: time, Length: 244, dtype: int32

In [14]:
from sklearn.model_selection import train_test_split 
x = df.drop('time',axis =1 )
y = df['time']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

In [15]:
x_train.shape , x_test.shape

((170, 6), (74, 6))

In [16]:
from sklearn.impute import SimpleImputer # for missing value treatement
from sklearn.preprocessing import StandardScaler # for scaling everything ( for numerical col )
from sklearn.preprocessing import OneHotEncoder # for encoding 
from sklearn.pipeline import Pipeline # a sequence of data transformer
from sklearn.compose import ColumnTransformer # groups all pipeline step for each of the columns

In [17]:
df.sample(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
169,10.63,2.0,Female,Yes,Sat,0,2


In [56]:
cat_col = ['sex','smoker','day']
num_col = ['total_bill' , 'tip','size']

In [58]:
df[num_col]

Unnamed: 0,total_bill,tip,size
0,16.99,1.01,2
1,10.34,1.66,3
2,21.01,3.50,3
3,23.68,3.31,2
4,24.59,3.61,4
...,...,...,...
239,29.03,5.92,3
240,27.18,2.00,2
241,22.67,2.00,2
242,17.82,1.75,2


In [60]:
df[cat_col]

Unnamed: 0,sex,smoker,day
0,Female,No,Sun
1,Male,No,Sun
2,Male,No,Sun
3,Male,No,Sun
4,Female,No,Sun
...,...,...,...
239,Male,No,Sat
240,Female,Yes,Sat
241,Male,Yes,Sat
242,Male,No,Sat


In [62]:
# feature enginering automation using pipeline and transformers

In [72]:
cat_pipeline = Pipeline(steps = [
                                ('Imputation' , SimpleImputer(strategy= 'most_frequent')) , 
                                ('OneHotEncoding',OneHotEncoder()) ,
                                ] )
num_pipeline = Pipeline(steps = [
                                ('Imputation' , SimpleImputer(strategy= 'median')) ,
                                ('Scaling' , StandardScaler()) ] )

In [74]:
col_transformer = ColumnTransformer([
                                    ("num_pipeline" , num_pipeline , num_col),
                                    ("cat_pipeline" , cat_pipeline , cat_col)
                                    ])

In [76]:
col_transformer

In [78]:
x_train = col_transformer.fit_transform(x_train)
x_test = col_transformer.fit_transform(x_test)

In [80]:
# now data is ready now lets make multiple models

In [82]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [84]:
models = {"Support vector classifier" : SVC(),
          "Decision Tree Classifier" : DecisionTreeClassifier(),
          "Logistic Regression" : LogisticRegression()
         }

In [116]:
from sklearn.metrics import accuracy_score

def models_evaluation(x_train,x_test,y_train,y_test , models):
    evaluation = {}
    for i in range(len(models)):
        m = list(models.values())[i]
        m.fit(x_train,y_train)
        y_pred = m.predict(x_test)
        m_score = round(accuracy_score(y_test,y_pred)*100,2)
        evaluation[list(models.values())[i]] = m_score

    return evaluation
        

In [118]:
models_evaluation(x_train,x_test,y_train,y_test,models)

{SVC(): 95.95, DecisionTreeClassifier(): 97.3, LogisticRegression(): 95.95}

In [125]:
from sklearn.ensemble import RandomForestClassifier
RF  = RandomForestClassifier(n_estimators=10 ,criterion='accuracy_score') 
RF

In [129]:
from sklearn.model_selection import GridSearchCV
param =  { 'max_depth' : [1,4,8,10,None],
          'n_estimators' : [10,20,30,100,None],
          'criterion' : ['gini','entropy']}

In [139]:
clf = GridSearchCV(estimator = RF , param_grid  = param ,cv = 5 , verbose= 2, scoring = 'accuracy' )
clf

In [141]:
clf.fit(x_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END .......criterion=gini, max_depth=1, n_estimators=10; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=10; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=10; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=10; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=10; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=20; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=20; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=20; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=20; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=20; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_estimators=30; total time=   0.0s
[CV] END .......criterion=gini, max_depth=1, n_

In [143]:
clf.best_estimator_

In [145]:
y_pred = clf.best_estimator_.predict(x_test)

In [149]:
score = round(accuracy_score(y_test,y_pred)*100,2)

In [153]:
print("final model score is :", score)

final model score is : 95.95


In [155]:
# final model 
final_model = clf.best_estimator_

In [157]:
final_model

In [163]:
from sklearn.ensemble import RandomForestClassifier
RF2  = RandomForestClassifier(n_estimators=10 ,oob_score=True) 
RF2

In [165]:
RF2.fit(x_train,y_train)

In [173]:
RF2.oob_score_*100

95.88235294117648