1. Prepare features and Target
2. Split into train and test sets
3. Train models
4. Evaluate models
5. Compare metrics and select the best model
6. Save the model using joblib or pickle

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [7]:
# Load cleaned_nova_data.csv; the path is relative to the notebook's working directory
df = pd.read_csv('cleaned_nova_data.csv')
# Preview the first rows to confirm the columns loaded as expected
df.head()

Unnamed: 0,nutriscore,nova_group,energy,fat,saturated-fat,cholesterol,carbohydrates,sugars,fiber,proteins,salt,sodium
0,-5.0,1.0,238.0,0.2,0.1,0.0,3.9,3.9,0.0,10.0,0.09,0.036
1,18.0,4.0,936.0,8.2,2.2,0.0,29.0,22.0,0.0,5.1,4.6,1.84
2,9.0,3.0,264.0,0.0,0.0,0.0,15.5,15.5,0.0,0.2,0.0,0.0
3,-4.0,1.0,134.0,0.3,0.1,0.0,5.3,3.9,0.0,0.9,0.42,0.168
4,14.0,4.0,916.0,5.9,0.5,0.0,30.3,1.7,2.8,9.7,0.464,0.1856


In [9]:
# Build the feature matrix and target, then make a stratified train/test split.
from sklearn.model_selection import train_test_split

# X and y were not defined by any earlier cell, so a fresh kernel failed here with a
# NameError. Define them explicitly: the target is `nova_group`, every other column
# is a feature.
# NOTE(review): reconstructed from the columns of the resampled training frame shown
# further down — confirm this matches how the features were originally built.
X = df.drop(columns=['nova_group'])
y = df['nova_group']

# Hold out 20% for testing. stratify=y keeps the class proportions the same in both
# splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [15]:
# Rebalance the training split only (the test split is not resampled here).
# SMOTETomek combines SMOTE over-sampling with Tomek-link removal.
from imblearn.combine import SMOTETomek
# Fixed seed so the resampled training set is reproducible
sm = SMOTETomek(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [17]:
# Display the resampled training features for a visual check
X_train_res

Unnamed: 0,nutriscore,energy,fat,saturated-fat,cholesterol,carbohydrates,sugars,fiber,proteins,salt,sodium
0,19.000000,1729.000000,13.333333,6.666667,0.000000,63.333333,0.000000,0.000000,10.000000,1.833333,0.733333
1,13.000000,628.000000,11.670000,0.000000,0.000000,10.000000,10.000000,0.000000,0.000000,2.500000,1.000000
2,14.000000,1619.000000,0.000000,0.000000,0.000000,90.320000,41.940000,3.200000,3.230000,0.927500,0.371000
3,14.000000,1540.000000,0.000000,0.000000,0.000000,84.210000,63.160000,0.000000,7.890000,1.185000,0.474000
4,-4.000000,2958.000000,67.860000,3.570000,0.000000,10.710000,0.000000,10.700000,17.860000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
352883,10.000000,970.346461,10.162129,3.538664,0.021039,25.369207,2.878664,1.313071,9.472671,1.395414,0.558166
352884,10.000000,3347.000000,90.000000,10.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
352885,7.043822,1987.065733,34.890444,7.527389,0.000000,39.945222,12.609556,17.445222,15.219111,0.151604,0.060642
352886,9.871480,1506.000000,4.128520,1.825704,0.000000,75.389964,29.738556,6.000000,4.000000,0.956820,0.382728


In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Define models
# Candidate classifiers; each gets a fixed seed (42) and otherwise library defaults,
# except that the SVM is created with probability=True.
rf_model = RandomForestClassifier(random_state=42)
# NOTE(review): the SVM's fit call is commented out in the training cell, so this
# estimator is currently defined but never trained.
svm_model = SVC(probability=True, random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)


In [None]:
# Fit each candidate on the resampled training data
rf_model.fit(X_train_res, y_train_res)
# SVM training is disabled.
# NOTE(review): presumably too slow on the resampled set (~350k rows) — confirm.
#svm_model.fit(X_train_res, y_train_res)
gb_model.fit(X_train_res, y_train_res)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Estimators to score, keyed by the label used in the summary table
models = {
    "Random Forest": rf_model,
    #"SVM": svm_model,
    "Gradient Boosting": gb_model
}

# (column label, scoring function, extra keyword arguments) for each reported metric;
# precision, recall and F1 are aggregated across classes with average='weighted'
metric_specs = [
    ("Accuracy", accuracy_score, {}),
    ("Precision", precision_score, {"average": 'weighted'}),
    ("Recall", recall_score, {"average": 'weighted'}),
    ("F1 Score", f1_score, {"average": 'weighted'}),
]

# Collect one row of test-set scores per model
results = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    row = {"Model": name}
    row.update(
        (label, score_fn(y_test, y_pred, **extra))
        for label, score_fn, extra in metric_specs
    )
    results.append(row)

# Tabulate the rows and print the comparison
results_df = pd.DataFrame(results)
print(results_df)


In [None]:
import pickle

# Persist the model that won the comparison. This cell used to dump `model`, which is
# only the leftover loop variable from the evaluation cell (the last entry of
# `models`), regardless of how that model scored.
# Selection uses the weighted F1 score from `results_df`; on a tie the first row wins.
best_name = results_df.loc[results_df["F1 Score"].idxmax(), "Model"]
best_model = models[best_name]
print(f"Saving best model: {best_name}")

with open('model.pkl', 'wb') as f:
    pickle.dump(best_model, f)    # Save

# Reload to confirm the file round-trips; `model` now refers to the reloaded best model.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code
# embedded in an untrusted file.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)        # Load
