In [1]:
import seaborn as sns
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['time']=encoder.fit_transform(df['time'])

In [3]:
## independent and dependent feature
X=df.drop(labels=['time'],axis=1)
y=df.time

In [4]:
X['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer ## Handle Missing Values
from sklearn.preprocessing import StandardScaler ## Feature Scaling
from sklearn.preprocessing import OneHotEncoder ## categorical to numerical
from sklearn.compose import ColumnTransformer

In [7]:
X

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.50,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4
...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,3
240,27.18,2.00,Female,Yes,Sat,2
241,22.67,2.00,Male,Yes,Sat,2
242,17.82,1.75,Male,No,Sat,2


In [8]:
categorical_cols = ['sex', 'smoker','day']
numerical_cols = ['total_bill', 'tip','size']

In [9]:
# Feature Engineering Automation -> PIPELINE
num_pipeline = Pipeline(
    steps = [
#         imputer handles NAN values
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ])

cat_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('one_hot_encoder' , OneHotEncoder())
    ])

In [10]:
preprocessor = ColumnTransformer([
    ('num_pipeline' , num_pipeline , numerical_cols),
    ('cat_pipeline' , cat_pipeline , categorical_cols)
])

In [11]:
preprocessor

In [12]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [14]:
# MODEL TRAINING AUTOMATION
models = {
    'Random Forest':RandomForestClassifier(),
    'Logistric Regression' : LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier()
}

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
def evaluate_model(X_train,y_train,X_test,y_test,models):

    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        # Train model
        model.fit(X_train,y_train)

        
        # Predict Testing data
        y_test_pred =model.predict(X_test)

        # Get accuracy for test data prediction

        test_model_score = accuracy_score(y_test,y_test_pred)

        report[list(models.keys())[i]] =  test_model_score
        
        
    return report

In [17]:
evaluate_model(X_train,y_train,X_test,y_test,models)

{'Random Forest': 0.9591836734693877,
 'Logistric Regression': 1.0,
 'Decision Tree': 0.9387755102040817}

In [18]:
classfier=RandomForestClassifier()

In [19]:
## Hypeparameter Tuning
params={'max_depth':[3,5,10,None],
              'n_estimators':[100,200,300],
               'criterion':['gini','entropy']
              }

In [20]:
from sklearn.model_selection import RandomizedSearchCV

In [21]:
cv=RandomizedSearchCV(classfier,param_distributions=params,scoring='accuracy',cv=5,verbose=3)

In [22]:
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=300;, score=0.974 total time=   0.2s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=300;, score=0.923 total time=   0.2s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=300;, score=1.000 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=300;, score=0.949 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=300;, score=0.923 total time=   0.2s
[CV 1/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.974 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.923 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=5, n_estimators=100;, score=1.000 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.949 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.949 total time=   0.1s

In [23]:
cv.best_params_

{'n_estimators': 100, 'max_depth': 5, 'criterion': 'gini'}

In [24]:
accuracy_score(y_test , cv.predict(X_test))

0.9795918367346939

## Internal Assignment 

In [25]:
import seaborn as sns
import pandas as pd

# 1. Load the dataset 
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [26]:
# 2. Use label Encoding for time column
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['time']=encoder.fit_transform(df['time'])

In [27]:
# 3. Separate features (X) and target variable (y)
X = df.drop(columns=['total_bill'])  # Independent features
y = df['total_bill']  # Target variable

In [28]:
X

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,0,2
1,1.66,Male,No,Sun,0,3
2,3.50,Male,No,Sun,0,3
3,3.31,Male,No,Sun,0,2
4,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...
239,5.92,Male,No,Sat,0,3
240,2.00,Female,Yes,Sat,0,2
241,2.00,Male,Yes,Sat,0,2
242,1.75,Male,No,Sat,0,2


In [29]:
y

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64

In [30]:
# 4. Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

In [31]:
# 5. Pipelining now 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer ## Handle Missing Values
from sklearn.preprocessing import StandardScaler ## Feature Scaling
from sklearn.preprocessing import OneHotEncoder ## categorical to numerical
from sklearn.compose import ColumnTransformer ## To make pipeline work

In [32]:
# 6. Define numerical and categorical columns
categorical_cols = ['sex', 'smoker','day']
numerical_cols = ['tip','size']

In [33]:
# 7. Create transformers

# Numerical Pipeline
num_pipeline = Pipeline(
    steps = [
        ('imputer' , SimpleImputer(strategy='median')),
        ('scaler' , StandardScaler())
    ]
)

# Categorical Pipeline
cat_pipeline = Pipeline(
    steps = [
        ('imputer' , SimpleImputer(strategy='most_frequent')),
        ('onehotencoder' , OneHotEncoder())
    ]
)

In [34]:
# 8. Combine transformers
preprocessor = ColumnTransformer([
    ('num_pipeline' , num_pipeline , numerical_cols),
    ('cat_pipeline' , cat_pipeline , categorical_cols)
])

In [35]:
preprocessor

In [36]:
# 9. Preprocess training and testing data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [37]:
X_train.shape

(163, 10)

In [38]:
X_test.shape

(81, 10)

In [39]:
y_train.shape

(163,)

In [40]:
# 10. Train RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

In [41]:
model = RandomForestRegressor()

In [42]:
model.fit(X_train , y_train)

In [43]:
from sklearn.metrics import r2_score

In [44]:
y_pred = model.predict(X_test)
score = r2_score(y_test , y_pred)

In [45]:
print("R2 Score:", score)

R2 Score: 0.44974017364145935
