In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load seaborn's "tips" example dataset into a DataFrame.
# The sex/smoker/day/time columns arrive with categorical dtype, which the
# dtype-based column selection further down relies on.
data = sns.load_dataset("tips")

In [3]:
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
data.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [5]:
data["day"].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [6]:
data["time"].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [7]:
data.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [8]:
from sklearn.preprocessing import LabelEncoder
# Label-encode the target column: every distinct value of "time" is replaced
# by an integer code, written back into `data` under the same column name.
encoder = LabelEncoder()
cato = ["time"]

for column in cato:
    data[column] = encoder.fit_transform(data[column])

In [9]:
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [10]:
# Target is the encoded "time" column; every other column is a predictor.
y = data["time"]
x = data.drop(columns="time")

In [11]:
x["day"].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [12]:
# Select each feature group by positive dtype match. Picking the numeric
# columns as "everything that is not category" would also sweep in object,
# bool or datetime columns and hand them to the median imputer / scaler.
numerical_features = x.select_dtypes(include="number").columns
catigorical_features = x.select_dtypes(include="category").columns
print(numerical_features)
print(catigorical_features)

Index(['total_bill', 'tip', 'size'], dtype='object')
Index(['sex', 'smoker', 'day'], dtype='object')


In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

In [14]:
# Sanity check: print the shape of each split, one per line
# (train features, test features, train labels, test labels).
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(195, 6)
(49, 6)
(195,)
(49,)


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [20]:
# Numeric columns: fill missing values with the column median, then
# standardise to zero mean / unit variance.
num_pipline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent level,
# one-hot encode, then scale.
cato_pipline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a level that never appeared in the training
        # split is encoded as all zeros at transform time instead of raising.
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
        # with_mean=False: the one-hot output is sparse by default and a
        # sparse matrix cannot be centred.
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

# Route each column group through its own pipeline; columns named in neither
# list are not passed to either pipeline.
preprocessor = ColumnTransformer(
    [
        ("num_pipline", num_pipline, numerical_features),
        ("cato_pipline", cato_pipline, catigorical_features),
    ]
)

In [22]:
# Fit the preprocessing on the training rows only, then apply the fitted
# transformer to the held-out rows so no test information leaks into it.
# NOTE(review): both names are rebound to the transformed output, so running
# this cell a second time would pass already-transformed data back into the
# preprocessor; re-run the train/test split first.
X_train, X_test = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

In [30]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [41]:
# Candidate classifiers, keyed by the name used in the evaluation report.
# The tree-based models are seeded (same seed as the train/test split) so the
# comparison gives the same numbers on every run.
models = {
    "RandomForestClassifier": RandomForestClassifier(oob_score=True, random_state=42),
    "LogisticRegression": LogisticRegression(),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
}

In [42]:
def evalueat_model(X_train,y_train,X_test,y_test,models):
    """Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``model.fit`` and
        ``model.score``.
    X_test, y_test
        Held-out features and labels used for the reported accuracy.
    models : dict
        Maps a display name to an estimator exposing ``fit``, ``score`` and
        ``predict``. Every estimator is fitted in place.

    Returns
    -------
    dict
        Test accuracy per model name, expressed as a percentage, in the
        iteration order of ``models``. Empty when ``models`` is empty.

    Notes
    -----
    Prints each model's training-set ``score`` (one line per model, in
    iteration order) so it can be compared with the returned test accuracy.
    """
    report = {}
    # Iterate name/estimator pairs directly rather than indexing into freshly
    # built key and value lists on every pass.
    for name, model in models.items():
        model.fit(X_train, y_train)
        print(model.score(X_train, y_train))

        y_pred = model.predict(X_test)
        report[name] = accuracy_score(y_test, y_pred) * 100

    return report

In [43]:
evalueat_model(X_train,y_train,X_test,y_test,models)

1.0
0.9692307692307692
1.0


{'RandomForestClassifier': 95.91836734693877,
 'LogisticRegression': 97.95918367346938,
 'DecisionTreeClassifier': 93.87755102040816}

In [58]:
# Hyper-parameter search space for the random forest: each key is a
# constructor argument, each list the candidate values to sample from.
params = dict(
    n_estimators=[50, 100, 200],
    criterion=["gini", "entropy"],
    max_depth=[3, 5, 10],
)

In [59]:
from sklearn.model_selection import RandomizedSearchCV

In [60]:
# Base estimator for the hyper-parameter search. Seeded so that score
# differences between candidates come from the hyper-parameters rather than
# from run-to-run randomness in the forest itself.
forest = RandomForestClassifier(random_state=42)

In [61]:
# Randomised search over `params` with 5-fold cross-validation, ranked by
# accuracy. The space holds 3 * 2 * 3 = 18 combinations and only a random
# subset is tried (10 by default), so random_state is fixed to make the set
# of sampled candidates, and hence best_params_, repeatable.
cv = RandomizedSearchCV(
    forest,
    param_distributions=params,
    scoring="accuracy",
    cv=5,
    verbose=3,
    random_state=42,
)

In [62]:
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.974 total time=   0.5s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=   0.5s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=1.000 total time=   0.5s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.949 total time=   0.5s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=   0.5s
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=50;, score=0.974 total time=   0.1s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=50;, score=0.923 total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=50;, score=1.000 total time=   0.1s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=50;, score=0.923 total time=   0.1s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=50;, score=0.923 tot

In [63]:
cv.best_params_

{'n_estimators': 200, 'max_depth': 10, 'criterion': 'entropy'}

In [71]:
# Build the final model from whatever the search selected instead of
# hand-copying values from one particular run, so it cannot drift out of sync
# when the search is re-run. Seeded for a repeatable fit.
forest1 = RandomForestClassifier(**cv.best_params_, random_state=42)

In [72]:
forest1.fit(X_train,y_train)

In [73]:
y_prad = forest1.predict(X_test)

In [75]:
forest1.score(X_train,y_train)

1.0

In [78]:
accuracy_score(y_test,y_prad)

0.9591836734693877