In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




In [5]:
# Load the obesity dataset from a CSV located relative to the notebook's
# working directory; `file_name` and `df` are reused by the cells below.
file_name = "ObesityDataSet_raw_and_data_sinthetic.csv"
df = pd.read_csv(file_name)
# Bare last expression: renders the frame with the notebook's rich display
# (pandas truncates long frames to the first and last rows).
df


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output reports count = 2087, which matches the row
# count printed after duplicate removal — this cell appears to have been run
# after the de-duplication cell even though it sits above it; confirm with
# Restart & Run All.
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2087.0,2087.0,2087.0,2087.0,2087.0,2087.0,2087.0,2087.0
mean,24.35309,1.702674,86.85873,2.421466,2.701179,2.004749,1.012812,0.663035
std,6.368801,0.093186,26.190847,0.534737,0.764614,0.608284,0.853475,0.608153
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.915937,1.630178,66.0,2.0,2.697467,1.590922,0.124505,0.0
50%,22.847618,1.701584,83.1011,2.396265,3.0,2.0,1.0,0.630866
75%,26.0,1.769491,108.015907,3.0,3.0,2.466193,1.678102,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [8]:
# Print the frame's index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2087 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2087 non-null   object 
 1   Age                             2087 non-null   float64
 2   Height                          2087 non-null   float64
 3   Weight                          2087 non-null   float64
 4   family_history_with_overweight  2087 non-null   object 
 5   FAVC                            2087 non-null   object 
 6   FCVC                            2087 non-null   float64
 7   NCP                             2087 non-null   float64
 8   CAEC                            2087 non-null   object 
 9   SMOKE                           2087 non-null   object 
 10  CH2O                            2087 non-null   float64
 11  SCC                             2087 non-null   object 
 12  FAF                             2087 no

In [6]:
# Drop fully duplicated rows and rebind `df` to the result.
# The surviving rows keep their original index labels, so the index is no
# longer contiguous after this step.
df = df.drop_duplicates()
print(f"Number of rows after removing duplicates: {df.shape[0]}")



Number of rows after removing duplicates: 2087


In [7]:
# --- 2. Data Preparation and Preprocessing ---
# Produces X_train / X_test / y_train / y_test plus the fitted transformers
# (le_y, feature_encoders, scaler) used by the modeling cells below.

# Separate features (X) and target (y).
# `drop` returns a new frame, so the encodings applied to X do not modify `df`.
X = df.drop('NObeyesdad', axis=1)
y = df['NObeyesdad']

# Identify column types
binary_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
nominal_cols = ['CAEC', 'CALC', 'MTRANS']
numerical_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# 2.1. Target Label Encoding
# We need to encode the target variable for modeling
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)
print("\nTarget Classes and their Encodings:")
for i, class_name in enumerate(le_y.classes_):
    print(f"  {class_name}: {i}")

# 2.2. Feature Label Encoding (for the columns listed in binary_cols)
# A fresh encoder is fitted per column and kept in `feature_encoders`, so every
# column's string -> integer mapping stays available (re-fitting one shared
# encoder would keep only the mapping of the last column). `le_features` is
# still bound afterwards, to the encoder of the last column, as before.
feature_encoders = {}
for col in binary_cols:
    le_features = LabelEncoder()
    X[col] = le_features.fit_transform(X[col])
    feature_encoders[col] = le_features

# 2.3. Feature One-Hot Encoding (for nominal categorical features)
# drop_first=True drops one dummy per column to avoid a redundant indicator.
X = pd.get_dummies(X, columns=nominal_cols, drop_first=True)

# 2.4. Split Data (stratified on the encoded target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)
# Work on explicit copies: the scaled columns are assigned back into these
# frames below, and assigning into a frame derived from another one can
# trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# 2.5. Standard Scaling (on numerical features only)
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("\nData Preprocessing Complete: Features are encoded and scaled.")


Target Classes and their Encodings:
  Insufficient_Weight: 0
  Normal_Weight: 1
  Obesity_Type_I: 2
  Obesity_Type_II: 3
  Obesity_Type_III: 4
  Overweight_Level_I: 5
  Overweight_Level_II: 6

Training set size: 1460
Testing set size: 627

Data Preprocessing Complete: Features are encoded and scaled.


In [2]:
# --- 3. Model Training: Logistic Regression ---
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# --- 4. Model Evaluation ---
# Predict on the held-out split; every metric below compares against y_test.
y_pred = model.predict(X_test)

# 4.1. Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print("\n--- Model Evaluation (Logistic Regression) ---")
print(f"Accuracy Score: {accuracy:.4f}")

# 4.2. Classification Report
# Requested as a dict so it can be shown as a table, one row per class/average.
report = classification_report(
    y_test,
    y_pred,
    target_names=le_y.classes_,
    output_dict=True,
    zero_division=0,
)
print("\nClassification Report:")
print(pd.DataFrame(report).T)

# 4.3. Confusion Matrix Visualization
# The figure is written to disk and then closed, so nothing is rendered inline.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le_y.classes_,
    yticklabels=le_y.classes_,
)
plt.gca().set(
    title='Confusion Matrix for Logistic Regression',
    xlabel='Predicted Label',
    ylabel='True Label',
)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('logistic_regression_confusion_matrix.png')
plt.close()

print("\nConfusion Matrix visualization saved as 'logistic_regression_confusion_matrix.png'.")


--- Model Evaluation (Logistic Regression) ---
Accuracy Score: 0.8995

Classification Report:
                     precision    recall  f1-score     support
Insufficient_Weight   0.929412  0.987500  0.957576   80.000000
Normal_Weight         0.864198  0.823529  0.843373   85.000000
Obesity_Type_I        0.899083  0.924528  0.911628  106.000000
Obesity_Type_II       0.955556  0.966292  0.960894   89.000000
Obesity_Type_III      0.989691  0.989691  0.989691   97.000000
Overweight_Level_I    0.817073  0.807229  0.812121   83.000000
Overweight_Level_II   0.819277  0.781609  0.800000   87.000000
accuracy              0.899522  0.899522  0.899522    0.899522
macro avg             0.896327  0.897197  0.896469  627.000000
weighted avg          0.898327  0.899522  0.898646  627.000000

Confusion Matrix visualization saved as 'logistic_regression_confusion_matrix.png'.


In [3]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Self-contained export cell: reloads the data, repeats the preprocessing and
# training steps, and pickles everything needed to score new rows later.

# --- 1. Load Data and Clean ---
file_name = "ObesityDataSet_raw_and_data_sinthetic.csv"
df = pd.read_csv(file_name).drop_duplicates()

# --- 2. Define Columns and Prepare Data ---
binary_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
nominal_cols = ['CAEC', 'CALC', 'MTRANS']
numerical_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Separate features (X) and target (y)
X = df.drop('NObeyesdad', axis=1)
y = df['NObeyesdad']

# 2.1. Target Label Encoding
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)

# 2.2. Feature Label Encoding (for the columns listed in binary_cols)
# One fitted encoder per column is kept in `feature_encoders`; the model is
# trained on these integer codes, so scoring code needs the same mappings.
# `le_features` remains bound to the encoder of the last column, as before.
feature_encoders = {}
for col in binary_cols:
    le_features = LabelEncoder()
    X[col] = le_features.fit_transform(X[col])
    feature_encoders[col] = le_features

# 2.3. Feature One-Hot Encoding (for nominal categorical features)
X = pd.get_dummies(X, columns=nominal_cols, drop_first=True)

# 2.4. Split Data (Needed for fitting the scaler on training data only)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)
# Explicit copy: the scaled columns are assigned back into X_train below, and
# assigning into a frame derived from another one can trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()

# 2.5. Standard Scaling (Fit only on training data)
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])

# --- 3. Train Model ---
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# --- 4. SAVE THE ARTIFACTS ---

# Final list of feature columns (CRUCIAL for consistent input order)
feature_columns = X_train.columns.tolist()

# File name -> object. The first four files are the ones this cell has always
# written; 'feature_encoders.pkl' additionally stores the per-column encoders
# for the binary features so the training-time encoding can be reproduced.
artifacts = {
    'scaler.pkl': scaler,                    # fitted StandardScaler
    'le_y.pkl': le_y,                        # target label encoder
    'model.pkl': model,                      # trained classifier
    'feature_columns.pkl': feature_columns,  # column order seen by the model
    'feature_encoders.pkl': feature_encoders,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)

print("Model, scaler, target encoder, and feature columns saved successfully to .pkl files.")
print("Binary feature encoders saved to 'feature_encoders.pkl'.")

Model, scaler, target encoder, and feature columns saved successfully to .pkl files.
