**Data** **cleaning**

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

# Resolve the directory that data paths are built from. `__file__` only exists
# when this code runs as a script; in a notebook kernel it is undefined, so
# fall back to the current working directory instead of raising NameError.
try:
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()
file_path = os.path.join(CURRENT_DIR, "raw", "ObesityDataSet.csv")

# Load the dataset
df = pd.read_csv(file_path)

# Show the first rows to verify that the data loaded correctly
print("Aperçu des données :")
print(df.head())

# Count missing values per column
print("\nValeurs manquantes par colonne :")
print(df.isnull().sum())

# Count cells that use a placeholder string instead of a real missing value
print("\nValeurs potentiellement manquantes sous d'autres formes ('?', 'None', '') :")
print(df.isin(['?', 'None', '']).sum())

# Summarise columns and non-null counts. DataFrame.info() prints its report
# itself and returns None, so it is called directly rather than wrapped in
# print() (which would append a stray "None" line).
print("\nRésumé des colonnes et valeurs non nulles :")
df.info()

import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Visualise outliers: one boxplot per numeric column of the raw frame
df_numerical = df.select_dtypes(include=['number'])
numerical_cols = df_numerical.columns

plt.figure(figsize=(14, 6))
ax = sns.boxplot(data=df_numerical)  # seaborn returns the Axes it drew on
ax.set_title("Boxplots des variables numériques", fontsize=14)
plt.xticks(rotation=45)
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Outlier removal: keep only rows whose Weight and Age lie inside these
# inclusive bounds.
# NOTE(review): the bounds are hard-coded; presumably read off the boxplots
# above — confirm they are intended (the Age cap of 26 is restrictive).
lower_weight, upper_weight = 40, 105
lower_age, upper_age = 10, 26

# .copy() makes df_filtered an independent frame. Without it, the column
# assignment further down writes into a slice of `df` and pandas emits
# SettingWithCopyWarning.
df_filtered = df[
    (df['Weight'] >= lower_weight) & (df['Weight'] <= upper_weight) &
    (df['Age'] >= lower_age) & (df['Age'] <= upper_age)
].copy()

# Save the cleaned data
processed_path = os.path.join(CURRENT_DIR, "data", "processed", "dataset.csv")

# Create the output folder if it does not exist yet
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
df_filtered.to_csv(processed_path, index=False)
print(f'✅ Données nettoyées sauvegardées dans : {processed_path}')

# Encode the target column. This happens after the CSV was written, so the
# saved dataset keeps the original string labels; only the in-memory frame
# and the pickled encoder carry the integer codes.
le = LabelEncoder()
df_filtered['NObeyesdad'] = le.fit_transform(df_filtered['NObeyesdad'])

# Persist the fitted encoder next to the processed dataset
encoder_path = os.path.join(CURRENT_DIR, "data", "processed", "label_encoder.pkl")
with open(encoder_path, "wb") as file:
    pickle.dump(le, file)

print(f'✅ Label Encoder sauvegardé dans : {encoder_path}')



**Class Distribution & Correlation**

In [None]:
#Class distribution
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os


# `__file__` is undefined inside a notebook kernel; fall back to the current
# working directory so the cell runs both as a script and as a notebook cell.
try:
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()
file_path = os.path.join(CURRENT_DIR, "dataset.csv")
df = pd.read_csv(file_path)

# Share of each obesity class, expressed as a percentage of all rows
class_counts = df["NObeyesdad"].value_counts(normalize=True) * 100
print("Class Distribution:\n", class_counts)

# Bar chart of the class shares, each bar annotated with its percentage
plt.figure(figsize=(10, 5))
ax = sns.barplot(x=class_counts.index, y=class_counts.values, palette="viridis")
for i, v in enumerate(class_counts.values):
    # Place the label one unit above the top of bar number i
    ax.text(i, v + 1, f'{v:.1f}%', ha='center', va='bottom', fontweight='bold')
plt.xticks(rotation=45)
ax.set_xlabel("Obesity Level")
ax.set_ylabel("Percentage of Samples")
ax.set_title("Class Distribution of Obesity Levels")
plt.tight_layout()
plt.show()

# One boxplot per column of the frame, drawn on a single wide-form axes
plt.figure(figsize=(10, 5))
ax = sns.boxplot(data=df)
ax.set_title("Boxplot of Numerical Features")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


def _boxplot_by_obesity_level(data, feature, title):
    """Draw a boxplot of one feature grouped by obesity level.

    Args:
        data: DataFrame containing the ``NObeyesdad`` column and ``feature``.
        feature: Name of the column plotted on the y axis.
        title: Title shown above the figure.
    """
    # Every plot gets its own figure so all of them share the same size.
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=data["NObeyesdad"], y=data[feature], palette="coolwarm")
    plt.xticks(rotation=45)
    plt.title(title)
    plt.show()


# One figure per feature. The Weight plot used to be drawn twice (once without
# its own plt.figure call); each feature is now plotted exactly once.
for feature, title in (
    ("Age", "Age Distribution Across Obesity Levels"),
    ("Weight", "Weight Distribution Across Obesity Levels"),
    ("FAF", "Physical Activity (FAF) Across Obesity Levels"),
):
    _boxplot_by_obesity_level(df, feature, title)


########### Print Class Distribution ##########
class_counts = df["NObeyesdad"].value_counts(normalize=True) * 100
print("Class Distribution:\n", class_counts)


#understanding correlation and computing correlation matrix

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data. `__file__` is undefined inside a notebook kernel, so fall
# back to the current working directory instead of raising NameError.
try:
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()
file_path = os.path.join(CURRENT_DIR, "dataset.csv")
df = pd.read_csv(file_path)

# Work on a copy so the loaded frame keeps its original string labels.
df_encoded = df.copy()

# Replace every non-numeric column (the NObeyesdad target included) by its
# integer category code so that a correlation can be computed.
for col in df_encoded.select_dtypes(exclude=['number']).columns:
    df_encoded[col] = df_encoded[col].astype('category').cat.codes

# Print class distribution as percentages (of the coded target)
print(df_encoded['NObeyesdad'].value_counts(normalize=True) * 100)

# Correlation matrix of the coded data. The previous version also added an
# "Obesity_Level_Num" column that duplicated the coded target, computed an
# unused second matrix, and called get_dummies on an already all-numeric
# frame (a no-op); those steps are dropped.
correlation_matrix = df_encoded.corr()

# Plot heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Matrix ")
plt.show()

**Model Training**

In [None]:
import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import gc
import psutil
import os
import pickle
import shap


# === data loading ===
# Directory of the current script. `__file__` is undefined inside a notebook
# kernel, so fall back to the current working directory in that case.
try:
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()

# Move up one level to reach the project root
BASE_DIR = os.path.abspath(os.path.join(CURRENT_DIR, ".."))

# Construct paths relative to the project root
DATA_PATH = os.path.join(BASE_DIR, "data", "processed", "dataset.csv")
df = pd.read_csv(DATA_PATH)

# === memory reducing ===
def optimize_dataframe(df):
    """Downcast 64-bit numeric columns to their 32-bit counterparts.

    int64 columns become int32 and float64 columns become float32; all other
    columns are left untouched. The frame is modified in place.

    Args:
        df: DataFrame whose columns are downcast.

    Returns:
        The same DataFrame object that was passed in.
    """
    downcasts = (("int64", "int32"), ("float64", "float32"))
    for wide, narrow in downcasts:
        for name in df.select_dtypes(include=[wide]).columns:
            df[name] = df[name].astype(narrow)
    return df

df = optimize_dataframe(df)

# Columns that are label-encoded below (the NObeyesdad target included)
categorical_columns = [
    "Gender", "family_history_with_overweight", "NObeyesdad",
    "FAVC", "SMOKE", "CAEC", "SCC", "CALC", "MTRANS",
]

# ==== splitting data ====
# Fit one LabelEncoder per categorical column and keep it, keyed by column
# name, so the integer codes can be inverse-transformed later.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])

# Features are every column except the target; the target is the obesity level
X = df.drop(columns=["NObeyesdad"])
y = df["NObeyesdad"]

# 80% training / 20% testing, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training Set Size: {X_train.shape}")
print(f"Testing Set Size: {X_test.shape}")


# ==== FLAGS FOR SAMPLING METHODS ====
USE_SMOTE = True           # Enable/disable oversampling
USE_UNDERSAMPLING = True   # Enable/disable undersampling
USE_CLASS_WEIGHTS = False  # Set this to True to use explicitly computed class weights


# Apply oversampling (SMOTE) to the training split only.
# sampling_strategy="auto" is the library default, not a fixed 60% ratio.
# NOTE(review): per the imbalanced-learn docs "auto" targets every class but
# the majority — confirm this is the intended strategy.
# NOTE(review): the features were label-encoded above, so SMOTE interpolates
# between category codes; verify this is acceptable for the categorical columns.
if USE_SMOTE:
    smote = SMOTE(sampling_strategy="auto", random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print("Applied SMOTE Oversampling. New Training Set Size:", X_train.shape)

# Apply undersampling (RandomUnderSampler), again with the default "auto"
# strategy rather than a fixed 80% ratio.
if USE_UNDERSAMPLING:
    undersample = RandomUnderSampler(sampling_strategy="auto", random_state=42)
    X_train, y_train = undersample.fit_resample(X_train, y_train)
    print("Applied Random UnderSampling. New Training Set Size:", X_train.shape)

# Compute class weights (if selected)
class_weight_dict = None
if USE_CLASS_WEIGHTS:
    # compute_class_weight was never imported, so enabling the flag raised
    # NameError; import it where it is needed.
    from sklearn.utils.class_weight import compute_class_weight

    classes = np.unique(y_train)
    class_weights = compute_class_weight(class_weight="balanced",
                                         classes=classes, y=y_train)
    class_weight_dict = dict(zip(classes, class_weights))
    print("Applied Class Weights:", class_weight_dict)


# ===== training =====

# Initialise the model with the selected class weights.
# NOTE(review): when USE_CLASS_WEIGHTS is False the model still receives
# class_weight="balanced", so the flag never fully disables class weighting —
# confirm whether None was intended.
model = RandomForestClassifier(n_estimators=100, random_state=42,
                               class_weight=class_weight_dict if USE_CLASS_WEIGHTS else "balanced")
# Train the model
model.fit(X_train, y_train)


# Save the trained model where the evaluation, SHAP and test cells load it
# from (BASE_DIR/models/obesity_model.pkl). It used to be written to the
# current working directory, which those cells never read.
MODEL_DIR = os.path.join(BASE_DIR, "models")
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_PATH = os.path.join(MODEL_DIR, "obesity_model.pkl")
with open(MODEL_PATH, "wb") as file:
    pickle.dump(model, file)

print("Model saved successfully!")


# Build the SHAP explainer from the trained model and compute SHAP values on
# the test split; X_test must have the same columns as the training data.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)


# Save the SHAP explainer to a file.
# NOTE(review): this is still written to the current working directory; no
# visible cell loads it, so its expected location could not be confirmed.
with open("shap_explainer.pkl", "wb") as file:
    pickle.dump(explainer, file)

print("SHAP explainer saved successfully as shap_explainer.pkl!")


# === Memory Optimization ===

def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in MiB."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / (1024 * 1024)

memory_used = get_memory_usage()
print(f" Memory use after execution : {memory_used:.2f} Mo")

# Names that must survive the cleanup: the helpers kept by the original code
# plus the objects IPython injects into the notebook namespace. Deleting
# get_ipython / In / Out breaks magics and history in later cells.
variables_a_conserver = {
    "get_memory_usage", "gc", "psutil", "pickle", "shap",
    "In", "Out", "get_ipython", "exit", "quit",
}

# Underscore-prefixed names (__name__, __file__, __builtins__, IPython's _ih,
# _oh, ...) are interpreter bookkeeping and are never deleted. The list is
# built before anything is removed, so the loop does not iterate over a
# changing dict.
variables_a_supprimer = [
    var for var in list(globals())
    if not var.startswith("_") and var not in variables_a_conserver
]

for var in variables_a_supprimer:
    del globals()[var]

gc.collect()
print(" Memory freed !")


**Model** **Evaluation**

In [None]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import gc
import psutil

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.model_selection import train_test_split

# === DATA & MODEL LOADING ===
# `__file__` is undefined inside a notebook kernel; fall back to the current
# working directory so the cell runs both as a script and as a notebook cell.
try:
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(CURRENT_DIR, ".."))

# Construct paths
DATA_PATH = os.path.join(BASE_DIR, "data", "processed", "dataset.csv")
MODEL_PATH = os.path.join(BASE_DIR, "models", "obesity_model.pkl")

# Load dataset
df = pd.read_csv(DATA_PATH)

# Fail early with an explicit message if the model file is missing
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model file not found: {MODEL_PATH}")

# Load trained model
print(f"Loading model from: {MODEL_PATH}")
model = joblib.load(MODEL_PATH)

# === DATA PREPARATION ===
# Columns that are label-encoded below (the NObeyesdad target included)
categorical_columns = [
    "Gender", "family_history_with_overweight", "FAVC", "SMOKE",
    "CAEC", "SCC", "CALC", "MTRANS", "NObeyesdad",
]

# Fit one LabelEncoder per categorical column and keep it, keyed by column
# name, so codes can be mapped back to labels if needed.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])

# Features are every column except the target
X = df.drop(columns=["NObeyesdad"])
y = df["NObeyesdad"]

# Stratified 80/20 split with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# === PREDICTION ===
y_pred = model.predict(X_test)

# === MODEL EVALUATION ===
accuracy = accuracy_score(y_test, y_pred)
print(f" Accuracy: {accuracy:.2f}")

print("\n Classification Report:\n", classification_report(y_test, y_pred))

# === CONFUSION MATRIX ===
# Use the model's own class list for both the matrix and the tick labels.
# set(y_test) has no guaranteed order and can have a different length than the
# matrix when a class is predicted but absent from y_test.
class_ids = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=class_ids, yticklabels=class_ids)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

# === ROC-AUC SCORE ===
# Binarise against model.classes_ so that column k of the indicator matrix
# refers to the same class as column k of predict_proba.
y_test_binarized = label_binarize(y_test, classes=class_ids)
roc_auc = roc_auc_score(y_test_binarized, model.predict_proba(X_test), multi_class="ovr")
print(f" ROC-AUC Score: {roc_auc:.4f}")

# === MEMORY OPTIMIZATION ===
def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in MiB."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / (1024 * 1024)

memory_used = get_memory_usage()
print(f" Memory usage after execution: {memory_used:.2f} MB")

# Clean up memory.
# Names that must survive: the helpers kept by the original code plus the
# objects IPython injects into the notebook namespace (deleting get_ipython /
# In / Out breaks magics and history in later cells).
variables_to_keep = {
    "get_memory_usage", "gc", "psutil",
    "In", "Out", "get_ipython", "exit", "quit",
}

# Underscore-prefixed names (__name__, __file__, __builtins__, IPython's _ih,
# _oh, ...) are never deleted. The deletion list is computed up front, so the
# loop no longer tests membership in a set that it may already have deleted.
variables_to_delete = [
    name for name in list(globals())
    if not name.startswith("_") and name not in variables_to_keep
]
for var in variables_to_delete:
    del globals()[var]

gc.collect()
print(" Memory freed!")


**Shap** **Explainer**

In [None]:
import joblib
import shap
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# === DATA AND MODEL LOADING ===
# `__file__` is undefined inside a notebook kernel; fall back to the current
# working directory so the cell runs both as a script and as a notebook cell.
try:
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(CURRENT_DIR, ".."))
DATA_PATH = os.path.join(BASE_DIR, "data", "processed", "dataset.csv")
MODEL_PATH = os.path.join(BASE_DIR, "models", "obesity_model.pkl")

# Fail early with an explicit message if the model file is missing
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model file not found: {MODEL_PATH}")

print(f"🔹 Loading model from: {MODEL_PATH}")
model = joblib.load(MODEL_PATH)

# === DATA PREPARATION ===
df = pd.read_csv(DATA_PATH)

# Feature columns that are label-encoded below. The NObeyesdad target is not
# in this list, so `y` keeps the labels exactly as stored in the CSV.
categorical_columns = [
    "Gender", "family_history_with_overweight", "FAVC", "SMOKE",
    "CAEC", "SCC", "CALC", "MTRANS",
]

# Fit one LabelEncoder per column and keep it, keyed by column name
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    label_encoders[col] = le

X = df.drop(columns=["NObeyesdad"])
y = df["NObeyesdad"]

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === SHAP EXPLAINER ===
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# === Plotting SHAP ===
shap.summary_plot(shap_values, X_test)

print(" SHAP analysis completed successfully!")


***tests***


In [None]:
import sys
import os
import pandas as pd
import pickle
import numpy as np
import joblib

# Directory of the current script. `__file__` is undefined inside a notebook
# kernel, so fall back to the current working directory in that case.
try:
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    CURRENT_DIR = os.getcwd()

# Move up one level to reach the project root
BASE_DIR = os.path.abspath(os.path.join(CURRENT_DIR, ".."))

# Ensure we can import modules from the main project directory
sys.path.insert(0, BASE_DIR)

# Import the prediction function from the obesity controller
from controllers.obesity_controller import predict_obesity

# Define paths for data and model
DATA_PATH = os.path.join(BASE_DIR, "data", "processed", "dataset.csv")
MODEL_PATH = os.path.join(BASE_DIR, "models", "obesity_model.pkl")

# Feature columns, in the order used to build the model input below
EXPECTED_FEATURES = [
    "Gender", "Age", "Height", "Weight", "family_history_with_overweight",
    "FAVC", "FCVC", "NCP", "CAEC", "SMOKE", "CH2O", "SCC", "FAF", "TUE", "CALC", "MTRANS"
]

# Load the trained model
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"⚠ Model file not found at {MODEL_PATH}")

# SECURITY: pickle.load executes arbitrary code from the file; only load model
# files produced by this project's own training step.
with open(MODEL_PATH, "rb") as file:
    model = pickle.load(file)

# Test functions

def test_prediction():
    """Check that predict_obesity returns one of the known class names.

    NOTE(review): the input keys used here differ from EXPECTED_FEATURES;
    presumably predict_obesity maps them itself — verify against the controller.
    """
    valid_classes = [
        "Poids Insuffisant", "Poids Normal", "Surpoids Niveau I",
        "Surpoids Niveau II", "Obésité Type I", "Obésité Type II", "Obésité Type III",
    ]
    features = dict(
        Age=30, Height=1.80, Weight=80,
        Activity=2, Food_Intake=2, Vegetables=3, Water_Intake=2, Smoking=0,
    )

    prediction = predict_obesity(features)

    assert prediction in valid_classes, f"⚠ Invalid prediction: {prediction}"

def test_dataset_loading():
    """Check that the processed dataset can be read and contains data."""
    loaded = pd.read_csv(DATA_PATH)
    is_empty = loaded.empty
    assert not is_empty, "⚠ The dataset is empty"

def test_data_types():
    """Check that Age, Height and Weight are stored as int64 or float64."""
    frame = pd.read_csv(DATA_PATH)
    accepted_dtypes = ["int64", "float64"]

    for col in ("Age", "Height", "Weight"):
        column_dtype = frame[col].dtype
        assert column_dtype in accepted_dtypes, f"⚠ Wrong data type for {col}"

def test_model_loading():
    """Check that the module-level ``model`` object was loaded."""
    loaded_model = model
    assert loaded_model is not None, "⚠ Model failed to load!"

def test_model_prediction():
    """Check that the loaded model predicts a known class for one sample row."""
    # Height is given in metres, consistent with test_prediction (1.80); the
    # previous value of 170 was on a different scale.
    # NOTE(review): confirm the dataset's Height column is in metres.
    sample_input = pd.DataFrame([{
        "Gender": 1, "Age": 30, "Height": 1.70, "Weight": 75,
        "family_history_with_overweight": 1, "FAVC": 1, "FCVC": 2.5, "NCP": 3,
        "CAEC": 2, "SMOKE": 0, "CH2O": 2, "SCC": 1, "FAF": 1.5, "TUE": 1.5, "CALC": 1, "MTRANS": 2
    }])

    # Fail with a clear message if the sample lacks an expected feature. The
    # old shape check ran after column selection and could never fail.
    missing = [name for name in EXPECTED_FEATURES if name not in sample_input.columns]
    assert not missing, f"⚠ Missing expected features: {missing}"

    # Keep only the expected features, in the expected order
    sample_input = sample_input[EXPECTED_FEATURES]

    # Run prediction
    prediction = model.predict(sample_input)

    # Check that output is a valid class
    valid_classes = [0, 1, 2, 3, 4, 5, 6]  # Adjust based on your label encoding
    assert prediction[0] in valid_classes, f"⚠ Invalid prediction output: {prediction[0]}"

# Run all tests
def run_tests():
    """Call every test function in order, then report success."""
    checks = (
        test_prediction,
        test_dataset_loading,
        test_data_types,
        test_model_loading,
        test_model_prediction,
    )
    for check in checks:
        check()
    print("✅ All tests passed successfully!")

# Execute the tests when this code is the entry point
if __name__ == "__main__":
    run_tests()