In [None]:
1. Prepare features and target
2. Split into train and test sets
3. Train models
4. Evaluate models
5. Compare metrics and select the best model
6. Save the model using joblib or pickle

In [None]:
# Undersample the majority class and oversample the minority classes

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [27]:
# Load the dataset; the path is resolved relative to the notebook's working directory.
data_path = 'cleaned_nova_data.csv'
df = pd.read_csv(data_path)

# Preview the first three rows as the cell output.
df.head(n=3)

Unnamed: 0,nutriscore,nova_group,energy,fat,saturated-fat,cholesterol,carbohydrates,sugars,fiber,proteins,salt,sodium
0,-5.0,1.0,238.0,0.2,0.1,0.0,3.9,3.9,0.0,10.0,0.09,0.036
1,18.0,4.0,936.0,8.2,2.2,0.0,29.0,22.0,0.0,5.1,4.6,1.84
2,9.0,3.0,264.0,0.0,0.0,0.0,15.5,15.5,0.0,0.2,0.0,0.0


In [29]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(208889, 12)

In [33]:
# Target: the `nova_group` column.
y = df['nova_group']

# Features: every column except the target and `nutriscore`.
# NOTE(review): nutriscore is presumably excluded because it is itself a
# derived score rather than a raw nutrient measurement — confirm.
X = df.drop(['nutriscore', 'nova_group'], axis=1)

In [35]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# STEP 1: Hold out 20% of the rows as a test set.
# `stratify=y` keeps the nova_group proportions the same in both partitions.
# The split is done before any resampling, so the test set keeps the
# original class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)




In [37]:
# STEP 2: Randomly drop training rows of the majority class (nova_group=4.0)
# until 50,000 remain. Only the class listed in the strategy dict is targeted.
majority_target = {4.0: 50000}
under_sampler = RandomUnderSampler(random_state=42, sampling_strategy=majority_target)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

In [39]:
# STEP 3: Use SMOTE to synthesize samples so that each minority class
# (1.0, 2.0, 3.0) reaches 40,000 rows. Class 4.0 is not listed in the
# strategy, so it is not oversampled here.
minority_targets = {label: 40000 for label in (1.0, 2.0, 3.0)}
smote = SMOTE(random_state=42, sampling_strategy=minority_targets)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_under, y_train_under)

In [41]:
# Inspect the resampled training features produced by the SMOTE step.
X_train_balanced 

Unnamed: 0,energy,fat,saturated-fat,cholesterol,carbohydrates,sugars,fiber,proteins,salt,sodium
0,2958.000000,67.860000,3.570000,0.00000,10.710000,0.000000,10.700000,17.860000,0.000000,0.000000
1,310.000000,0.830000,0.410000,0.00200,14.460000,4.550000,1.200000,2.070000,0.610000,0.244000
2,456.000000,3.640000,0.910000,0.01600,13.640000,0.910000,4.100000,5.450000,1.080000,0.432000
3,1395.000000,2.000000,0.400000,0.00000,72.000000,2.666667,8.000000,13.333333,0.016667,0.006667
4,0.000000,0.000000,4.508496,0.00000,0.000000,15.654596,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
169995,0.000000,0.000000,4.508496,0.00000,0.000000,15.654596,0.000000,0.000000,78.295107,31.318043
169996,71.000000,0.000000,0.000000,0.00000,3.330000,3.330000,0.000000,0.000000,2.582500,1.033000
169997,720.639757,5.610313,2.460282,0.01712,21.340464,2.169193,1.068012,7.416443,1.463756,0.585503
169998,2092.000000,25.000000,1.790000,0.00000,64.290000,3.570000,7.100000,7.140000,1.308271,0.523309


In [43]:
# Short aliases for the balanced training set, used by the model-fitting cells.
# Caution: these names differ only by letter case from X_train / y_train
# (the un-resampled split), so double-check which pair is passed to `fit`.
x_train, Y_train = X_train_balanced, y_train_balanced

In [45]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Candidate classifiers, all seeded with the same value for reproducible runs.
model_seed = 42
rf_model = RandomForestClassifier(random_state=model_seed)
gb_model = GradientBoostingClassifier(random_state=model_seed)
# probability=True enables predict_proba on the SVC.
svm_model = SVC(random_state=model_seed, probability=True)

In [47]:
# Fit the random forest on the balanced training data; the fitted estimator
# is shown as the cell output.
# The SVM fit (svm_model.fit) is left disabled in this notebook.
# NOTE(review): presumably because SVC is slow on a training set of this size — confirm.
rf_model.fit(X=x_train, y=Y_train)


In [49]:
# Fit gradient boosting on the same balanced training data.
gb_model.fit(X=x_train, y=Y_train)

In [51]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Fitted models to compare, keyed by the label shown in the summary table.
# The SVM is left out because it is never fitted in this notebook.
models = {
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
}

# Metrics computed with average='weighted', i.e. per-class scores weighted by
# each class's support in the test set.
weighted_metrics = {
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# One row of scores per model, all measured on the untouched (un-resampled) test set.
results = []
for name, model in models.items():
    y_pred = model.predict(X_test)

    row = {"Model": name, "Accuracy": accuracy_score(y_test, y_pred)}
    for metric_label, metric_fn in weighted_metrics.items():
        row[metric_label] = metric_fn(y_test, y_pred, average='weighted')
    results.append(row)

# Collect the rows into a summary table and print it.
results_df = pd.DataFrame(results)
print(results_df)


               Model  Accuracy  Precision    Recall  F1 Score
0      Random Forest  0.904399   0.910509  0.904399  0.906232
1  Gradient Boosting  0.829623   0.852717  0.829623  0.837115


from joblib import dump, load

# Select the model to persist explicitly instead of relying on `model`, which is
# just the leftover loop variable from the evaluation cell (i.e. whichever model
# happened to be last in `models`, not necessarily the best one).
# The winner is the row of `results_df` with the highest weighted F1 score.
best_name = results_df.loc[results_df["F1 Score"].idxmax(), "Model"]
best_model = models[best_name]
print(f"Saving best model: {best_name}")

dump(best_model, 'model.joblib')   # Save the selected model to disk

# Reload to confirm the artifact round-trips.
# Note: joblib files are pickle-based — only load files from a trusted source.
model = load('model.joblib')       # Load
