In [12]:
# Nour Salah - 13001528
# Mazen Ahmed - 13005132
# Abdulrahman Farajallah - 13003482
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split




In [13]:
# Load dataset
df = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")

# Encode categorical columns
label_encoders = {}
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

print("Categorical features encoded.")
df.head()


Categorical features encoded.


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3,1
1,0,21,1.52,56.0,1,0,3.0,3.0,2,1,3.0,1,3.0,0.0,2,3,1
2,1,23,1.8,77.0,1,0,2.0,3.0,2,0,2.0,0,2.0,1.0,1,3,1
3,1,27,1.8,87.0,0,0,3.0,3.0,2,0,2.0,0,2.0,0.0,1,4,5
4,1,22,1.78,89.8,0,0,2.0,1.0,2,0,2.0,0,0.0,0.0,2,3,6


In [14]:
# Separate features (X) and target (y)
X = df.drop('NObeyesdad', axis=1)  # X = everything except the target column
y = df['NObeyesdad']               # y = the target column

# Confirm the shape of X and y
print("Shape of X (features):", X.shape)
print("Shape of y (target):" , y.shape)



Shape of X (features): (2111, 16)
Shape of y (target): (2111,)


In [15]:

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Check sizes
print("Training feature set shape:", X_train.shape)
print("Testing feature set shape:", X_test.shape)
print("Training target set shape:", y_train.shape)
print("Testing target set shape:", y_test.shape)



Training feature set shape: (1688, 16)
Testing feature set shape: (423, 16)
Training target set shape: (1688,)
Testing target set shape: (423,)


In [16]:
# Import the necessary library
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the KNN classifier with k=5
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train, y_train)

# Predict on the test set
y_pred_knn = knn.predict(X_test)

# Evaluate the model
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))

print("KNN Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))


KNN Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        54
           1       0.88      0.62      0.73        58
           2       0.96      0.91      0.93        70
           3       0.92      0.93      0.93        60
           4       0.98      0.97      0.98        65
           5       0.80      0.88      0.84        58
           6       0.78      0.90      0.83        58

    accuracy                           0.88       423
   macro avg       0.88      0.88      0.88       423
weighted avg       0.89      0.88      0.88       423

KNN Confusion Matrix:
[[51  3  0  0  0  0  0]
 [ 7 36  0  0  0 11  4]
 [ 0  0 64  1  0  0  5]
 [ 0  0  1 56  1  0  2]
 [ 0  0  0  2 63  0  0]
 [ 1  2  0  0  0 51  4]
 [ 0  0  2  2  0  2 52]]


In [17]:
# Import Naive Bayes model
from sklearn.naive_bayes import GaussianNB

# Initialize the model
nb = GaussianNB()

# Train the model
nb.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = nb.predict(X_test)

# Evaluate the model
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))

print("Naive Bayes Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.83      0.76        54
           1       0.67      0.41      0.51        58
           2       0.35      0.56      0.43        70
           3       0.54      0.93      0.68        60
           4       1.00      0.98      0.99        65
           5       0.63      0.29      0.40        58
           6       0.67      0.17      0.27        58

    accuracy                           0.60       423
   macro avg       0.65      0.60      0.58       423
weighted avg       0.65      0.60      0.58       423

Naive Bayes Confusion Matrix:
[[45  1  7  0  0  1  0]
 [18 24  8  0  0  6  2]
 [ 0  0 39 27  0  2  2]
 [ 0  1  2 56  0  0  1]
 [ 0  0  1  0 64  0  0]
 [ 2  6 29  4  0 17  0]
 [ 0  4 26 17  0  1 10]]


In [18]:
# Import Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt.predict(X_test)

# Evaluate the model
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))

print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))


Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.83      0.90        54
           1       0.83      0.91      0.87        58
           2       0.96      0.93      0.94        70
           3       0.97      0.97      0.97        60
           4       1.00      0.98      0.99        65
           5       0.90      0.91      0.91        58
           6       0.90      0.97      0.93        58

    accuracy                           0.93       423
   macro avg       0.93      0.93      0.93       423
weighted avg       0.93      0.93      0.93       423

Decision Tree Confusion Matrix:
[[45  9  0  0  0  0  0]
 [ 1 53  0  0  0  4  0]
 [ 0  0 65  1  0  1  3]
 [ 0  0  2 58  0  0  0]
 [ 0  0  0  1 64  0  0]
 [ 0  2  0  0  0 53  3]
 [ 0  0  1  0  0  1 56]]


In [19]:
# Import SVM
from sklearn.svm import SVC

# Initialize the model
svm = SVC(kernel='linear')  # or use 'rbf' or other kernels as needed

# Train the model
svm.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm.predict(X_test)

# Evaluate the model
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

print("SVM Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.93      0.88        54
           1       0.83      0.69      0.75        58
           2       0.89      0.93      0.91        70
           3       0.92      0.98      0.95        60
           4       1.00      0.98      0.99        65
           5       0.70      0.84      0.77        58
           6       0.89      0.69      0.78        58

    accuracy                           0.87       423
   macro avg       0.87      0.86      0.86       423
weighted avg       0.87      0.87      0.87       423

SVM Confusion Matrix:
[[50  4  0  0  0  0  0]
 [ 9 40  0  0  0  9  0]
 [ 0  0 65  3  0  1  1]
 [ 0  0  1 59  0  0  0]
 [ 0  0  0  1 64  0  0]
 [ 0  4  1  0  0 49  4]
 [ 0  0  6  1  0 11 40]]


In [11]:
# Import deep learning tools
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# If y is not one-hot encoded, encode it for deep learning
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Convert to categorical
y_train_cat = to_categorical(y_train_enc)
y_test_cat = to_categorical(y_test_enc)

# Create a simple deep learning model
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(y_train_cat.shape[1], activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_cat, epochs=20, batch_size=32, verbose=1)

# Evaluate on test data
loss, accuracy = model.evaluate(X_test, y_test_cat)
print(f"Deep Learning Model Accuracy: {accuracy:.4f}")


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1917 - loss: 7.2262
Epoch 2/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2603 - loss: 1.6933
Epoch 3/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3348 - loss: 1.5365
Epoch 4/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3894 - loss: 1.4797
Epoch 5/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4104 - loss: 1.4148
Epoch 6/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5001 - loss: 1.3252
Epoch 7/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4758 - loss: 1.3145
Epoch 8/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4913 - loss: 1.2589
Epoch 9/20
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Store evaluation metrics in a dictionary
evaluation = {}

def evaluate(name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted')
    rec = recall_score(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)

    evaluation[name] = {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'Confusion Matrix': cm
    }

# Evaluate models
evaluate('KNN', y_test, y_pred_knn)
evaluate('Naive Bayes', y_test, y_pred_nb)
evaluate('Decision Tree', y_test, y_pred_dt)
evaluate('SVM', y_test, y_pred_svm)

# For deep learning model
y_pred_dl = model.predict(X_test)
y_pred_dl_classes = y_pred_dl.argmax(axis=1)
evaluate('Deep Learning (Keras)', y_test, y_pred_dl_classes)

# Display results
for model_name, metrics in evaluation.items():
    print(f"\n Evaluation for {model_name}")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

 Evaluation for KNN
Accuracy: 0.8818
Precision: 0.8860
Recall: 0.8818
Confusion Matrix:
[[51  3  0  0  0  0  0]
 [ 7 36  0  0  0 11  4]
 [ 0  0 64  1  0  0  5]
 [ 0  0  1 56  1  0  2]
 [ 0  0  0  2 63  0  0]
 [ 1  2  0  0  0 51  4]
 [ 0  0  2  2  0  2 52]]

 Evaluation for Naive Bayes
Accuracy: 0.6028
Precision: 0.6452
Recall: 0.6028
Confusion Matrix:
[[45  1  7  0  0  1  0]
 [18 24  8  0  0  6  2]
 [ 0  0 39 27  0  2  2]
 [ 0  1  2 56  0  0  1]
 [ 0  0  1  0 64  0  0]
 [ 2  6 29  4  0 17  0]
 [ 0  4 26 17  0  1 10]]

 Evaluation for Decision Tree
Accuracy: 0.9314
Precision: 0.9344
Recall: 0.9314
Confusion Matrix:
[[45  9  0  0  0  0  0]
 [ 1 53  0  0  0  4  0]
 [ 0  0 65  1  0  1  3]
 [ 0  0  2 58  0  0  0]
 [ 0  0  0  1 64  0  0]
 [ 0  2  0  0  0 53  3]
 [ 0  0  1  0  0  1 56]]

 Evaluation for SVM
Accuracy: 0.8676
Precision: 0.8721
Recall: 0.8676
Confusion Matrix:
[[50  4  0  0  0  0  0]
 [ 9 40  0  0  0  9  

## Model Comparison & Final Decision

After evaluating all five models on the test set, we observed their performance across the following metrics:

| Model                  | Accuracy | Precision | Recall |
|------------------------|----------|-----------|--------|
| K-Nearest Neighbors    | 0.8818   | 0.8860    | 0.8818 |
| Naive Bayes            | 0.6028   | 0.6452    | 0.6028 |
| Decision Tree          | 0.9314   | 0.9344    | 0.9314 |
| Support Vector Machine | 0.8676   | 0.8721    | 0.8676 |
| Deep Learning (Keras)  | 0.6596   | 0.6531    | 0.6596 |

---

##**Best Performing Model: Decision Tree**
---

###Reasoning Behind the Choice:

- **Decision Tree** achieved the highest scores across all three key metrics (accuracy, precision, and recall), making it the most balanced and effective model overall for this classification task. It managed to capture complex patterns in the data without significant overfitting, and performed consistently well across all classes in the confusion matrix.

- **K-Nearest Neighbors (KNN)** also performed well with solid precision and recall. However, it showed slightly lower accuracy and a few misclassifications in cases where neighboring points were not representative (as seen in class 1 and 2).

- **Support Vector Machine (SVM)** showed decent results and good precision, but it slightly underperformed compared to the Decision Tree, especially in more imbalanced or confusing classes (as indicated by its confusion matrix).

- **Naive Bayes** underperformed due to its strong assumption of feature independence, which likely does not hold in this dataset. It struggled especially with multi-modal distributions.

- **Deep Learning (Keras)** had a modest performance in this task. While deep learning models are powerful, they typically require larger datasets and more hyperparameter tuning to outperform classical models. Its relatively low precision and recall make it less ideal for this dataset, at least without further optimization.

---

###Conclusion:

The **Decision Tree model** is the most suitable algorithm for this classification problem based on our evaluation. It offers a strong balance between predictive power and interpretability, making it ideal for our current dataset and problem scope.
