In [1]:
# dataframe handling
import numpy as np
import pandas as pd

# warning handling
import warnings

# importing os library
import os

# visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# ML Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier,ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# disabling warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# getting CSV file location
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        src = os.path.join(dirname, filename)

In [4]:
# reading the source file
df = pd.read_csv(src)

In [5]:
# viewing data
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [6]:
df.shape

(2111, 17)

In [7]:
# renaming columns
df.columns = ['Gender', 'Age', 'Height', 'Weight', 'FHO', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'Class']

In [8]:
# encoding labels
columns = ['Gender', 'FHO', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'Class']

label_encoder = LabelEncoder()

for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column])

In [9]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,FHO,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Class
0,0,21.0,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3,1
1,0,21.0,1.52,56.0,1,0,3.0,3.0,2,1,3.0,1,3.0,0.0,2,3,1
2,1,23.0,1.8,77.0,1,0,2.0,3.0,2,0,2.0,0,2.0,1.0,1,3,1
3,1,27.0,1.8,87.0,0,0,3.0,3.0,2,0,2.0,0,2.0,0.0,1,4,5
4,1,22.0,1.78,89.8,0,0,2.0,1.0,2,0,2.0,0,0.0,0.0,2,3,6


In [10]:
X = df.drop('Class', axis = 1)
y = df["Class"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Normalize features using StandardScaler
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

## **Logistic Regression**

In [12]:
# training model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# making classifications
lr_y_pred = lr_model.predict(X_test)

# checking score
LR = accuracy_score(y_test, lr_y_pred)

# printing Classification Report
print(classification_report(y_test, lr_y_pred))

              precision    recall  f1-score   support

           0       0.74      0.93      0.83        56
           1       0.53      0.42      0.47        62
           2       0.58      0.60      0.59        78
           3       0.82      0.84      0.83        58
           4       0.90      1.00      0.95        63
           5       0.54      0.38      0.44        56
           6       0.35      0.38      0.37        50

    accuracy                           0.65       423
   macro avg       0.64      0.65      0.64       423
weighted avg       0.64      0.65      0.64       423



- **Logistic Regression with Hyperparametric Tuning**

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization penalty
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],  # Solver algorithm
    'max_iter': [100, 200, 300]  # Maximum number of iterations
}

# Create an instance of the Logistic Regression model
lr_model = LogisticRegression()

# Create Grid Search CV instance
grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid, cv = 5, scoring='accuracy', verbose=1)

# Fit the Grid Search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Use the best model for prediction
best_lr_model = grid_search.best_estimator_
lr_y_pred = best_lr_model.predict(X_test)

# Check the accuracy
LR = accuracy_score(y_test, lr_y_pred)

# Print Classification Report
print(classification_report(y_test, lr_y_pred))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score: 0.7725212016926237
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        56
           1       0.83      0.47      0.60        62
           2       0.69      0.71      0.70        78
           3       0.84      0.98      0.90        58
           4       1.00      1.00      1.00        63
           5       0.61      0.66      0.63        56
           6       0.50      0.58      0.54        50

    accuracy                           0.77       423
   macro avg       0.78      0.77      0.76       423
weighted avg       0.78      0.77      0.77       423



## **Support Vector Machine**

In [14]:
# creating an instance of SVM class
svm_model = SVC()
svm_model.fit(X_train, y_train)

# maing predictions
svm_y_pred = svm_model.predict(X_test)

# checking score
SVM = accuracy_score(y_test, svm_y_pred)

# printing Classification Report
print(classification_report(y_test, svm_y_pred))

              precision    recall  f1-score   support

           0       0.71      0.88      0.78        56
           1       0.48      0.34      0.40        62
           2       0.65      0.33      0.44        78
           3       0.77      0.41      0.54        58
           4       0.56      1.00      0.72        63
           5       0.47      0.48      0.47        56
           6       0.43      0.58      0.49        50

    accuracy                           0.57       423
   macro avg       0.58      0.57      0.55       423
weighted avg       0.59      0.57      0.54       423



In [15]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'gamma': ['scale', 'auto', 0.1, 0.01],  # Kernel coefficient
    'kernel': ['linear', 'rbf'],  # Kernel type
    'degree': [2, 3],  # Degree of the polynomial kernel
    'coef0': [0.0, 0.1]  # Independent term in the polynomial kernel equation
}

# Create an instance of the SVC model
svm_model = SVC()

# Create Grid Search CV instance
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit the Grid Search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Use the best model for prediction
best_svm_model = grid_search.best_estimator_
svm_y_pred = best_svm_model.predict(X_test)

# Check the accuracy
SVM = accuracy_score(y_test, svm_y_pred)

# Print Classification Report
print(classification_report(y_test, svm_y_pred))

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Parameters: {'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
Best Score: 0.9431338120906713
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        56
           1       0.97      0.90      0.93        62
           2       0.99      0.96      0.97        78
           3       0.95      1.00      0.97        58
           4       1.00      1.00      1.00        63
           5       0.91      0.91      0.91        56
           6       0.94      0.96      0.95        50

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.96      0.96      0.96       423



## **K-Nearest Neighbours**

In [16]:
# building a function to test best neighbour 
def KNN(k):
    # building a KNN model (default k=5)
    knn_model = KNeighborsClassifier(n_neighbors=k)

    # fitting the model to the training data
    knn_model.fit(X_train, y_train)

    # making predictions on the test set
    y_pred_knn = knn_model.predict(X_test)

    # evaluating the model's accuracy
    accuracy_knn = accuracy_score(y_test, y_pred_knn)
    return accuracy_knn

# finding the best n value
k_values = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

best_accuracy = 0
best_k = None

for k in k_values:
    accuracy = KNN(k)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k

print("Best Accuracy: {0:0.5f} for K = {1}".format(best_accuracy, best_k))

KNN = best_accuracy

Best Accuracy: 0.88180 for K = 5


- **KNN with Hyperparameter Tuning**

In [17]:
def knn_hyperparameter_tuning(X_train, X_test, y_train, y_test):
    # Define the parameter grid
    param_grid = {
        'n_neighbors': [3, 4, 5, 6, 7],  # Number of neighbors
        'weights': ['uniform', 'distance'],  # Weight function used in prediction
        'algorithm': ['auto', 'ball_tree'],  # Algorithm used to compute the nearest neighbors
        'p': [1, 2],  # Power parameter for the Minkowski metric
        'leaf_size': [10, 20, 30],  # Leaf size passed to BallTree or KDTree
    }

    # Create an instance of the KNN model
    knn_model = KNeighborsClassifier()

    # Create Grid Search CV instance
    grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

    # Fit the Grid Search to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Print the best parameters and best score
    print("Best Parameters:", best_params)
    print("Best Score:", best_score)

    # Use the best model for prediction
    best_knn_model = grid_search.best_estimator_
    knn_y_pred = best_knn_model.predict(X_test)

    # Check the accuracy
    knn_accuracy = accuracy_score(y_test, knn_y_pred)

    # Print Classification Report
    print(classification_report(y_test, knn_y_pred))

    return best_knn_model, knn_y_pred, knn_accuracy

# Example usage:
# Assuming X_train, X_test, y_train, y_test are already defined
best_knn_model, knn_y_pred, knn_accuracy = knn_hyperparameter_tuning(X_train_normalized, X_test_normalized, y_train, y_test)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 4, 'p': 1, 'weights': 'distance'}
Best Score: 0.8779590188400963
              precision    recall  f1-score   support

           0       0.83      0.95      0.88        56
           1       0.81      0.63      0.71        62
           2       0.89      0.91      0.90        78
           3       0.97      0.98      0.97        58
           4       0.98      1.00      0.99        63
           5       0.87      0.80      0.83        56
           6       0.73      0.82      0.77        50

    accuracy                           0.87       423
   macro avg       0.87      0.87      0.87       423
weighted avg       0.87      0.87      0.87       423



## **Decision Tree Classifier**

In [18]:
# Creating an instance of Decision Tree Classifier
dt_model = DecisionTreeClassifier(max_depth = 10)

# Fitting the model to the training data
dt_model.fit(X_train, y_train)

# Making predictions
dt_y_pred = dt_model.predict(X_test)

# Checking accuracy
DTC = accuracy_score(y_test, dt_y_pred)

# Printing Classification Report
print(classification_report(y_test, dt_y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95        56
           1       0.88      0.90      0.89        62
           2       0.96      0.91      0.93        78
           3       0.93      0.95      0.94        58
           4       1.00      1.00      1.00        63
           5       0.93      0.89      0.91        56
           6       0.94      0.96      0.95        50

    accuracy                           0.94       423
   macro avg       0.94      0.94      0.94       423
weighted avg       0.94      0.94      0.94       423



In [19]:
# creating model class
dt_model = DecisionTreeClassifier(max_depth = 12)

# using boosting filter
dt_ada_model = AdaBoostClassifier(estimator=dt_model, n_estimators=50, learning_rate=0.75, random_state=42, algorithm = "SAMME")

# fitting model with training data
dt_ada_model.fit(X_train, y_train)

# make predictions
dt_y_pred = dt_ada_model.predict(X_test)

# getting accuracy score
DTC_ADA = accuracy_score(y_test, dt_y_pred)

# printing accuracy report
print(classification_report(y_test, dt_y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95        56
           1       0.88      0.90      0.89        62
           2       0.97      0.92      0.95        78
           3       0.95      0.95      0.95        58
           4       1.00      1.00      1.00        63
           5       0.93      0.91      0.92        56
           6       0.94      0.96      0.95        50

    accuracy                           0.94       423
   macro avg       0.94      0.94      0.94       423
weighted avg       0.94      0.94      0.94       423

