In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Columns retained for labelling and modelling (everything else is discarded)
columns_to_keep = [
    'Description',
    'Data.Sugar Total',
    'Data.Carbohydrate',
    'Data.Fiber',
    'Data.Kilocalories',
    'Data.Fat.Saturated Fat',
]

# Read the cleaned food data from disk
df = pd.read_csv("../data/cleaned/food_cleaned.csv")

# Report what the file contains before narrowing it down
available_columns = df.columns.tolist()
print("Columns in food_cleaned.csv:", available_columns)

# Restrict the frame to the retained columns; a missing column raises KeyError
df = df.loc[:, columns_to_keep]

# Creating the label again
def is_suitable(row, max_sugar=15, max_carbs=45, max_sat_fat=5):
    """Label a single food row as diabetic-suitable or not.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. the pandas Series passed by
        ``df.apply(is_suitable, axis=1)``). It must provide the keys
        'Data.Sugar Total', 'Data.Carbohydrate' and 'Data.Fat.Saturated Fat'.
    max_sugar, max_carbs, max_sat_fat : number, optional
        Inclusive upper limits for the three nutrients. The defaults are the
        thresholds this notebook has always used (15 / 45 / 5); the units are
        whatever the source data uses -- TODO confirm against the dataset docs.

    Returns
    -------
    int or None
        ``1`` when every nutrient is at or below its limit, ``0`` when at least
        one exceeds its limit, and ``None`` when any of the three values is
        missing (so the row cannot be labelled).
    """
    sugar = row['Data.Sugar Total']
    carbs = row['Data.Carbohydrate']
    sat_fat = row['Data.Fat.Saturated Fat']

    # A missing nutrient makes the rule undecidable: leave the row unlabelled.
    if pd.isnull(sugar) or pd.isnull(carbs) or pd.isnull(sat_fat):
        return None

    # Exceeding any single limit is enough to mark the food as unsuitable.
    if sugar > max_sugar or carbs > max_carbs or sat_fat > max_sat_fat:
        return 0
    return 1

# Apply the rule-based label row by row
df['Diabetic_Suitability'] = df.apply(is_suitable, axis=1)

# Checking how many suitable vs not suitable
print(df['Diabetic_Suitability'].value_counts())

# Save the labelled data
df.to_csv("../data/cleaned/food_labelled.csv", index=False)
print("New food_labelled.csv saved!")

# is_suitable returns None when sugar, carbohydrate or saturated fat is missing.
# Those rows have no target value, and scikit-learn estimators refuse to fit on
# a NaN target, so they are excluded from the modelling data (df itself and the
# saved CSV are left untouched).
labelled_df = df.dropna(subset=['Diabetic_Suitability'])

# Features and targets
# NOTE(review): X still contains the three columns that is_suitable thresholds
# on, so the target is a deterministic function of the features.
y = labelled_df['Diabetic_Suitability'].astype('int64')
X = labelled_df.drop(['Description', 'Diabetic_Suitability'], axis=1)

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training shape:", X_train.shape)
print("Test shape:", X_test.shape)



Columns in food_cleaned.csv: ['Description', 'Data.Sugar Total', 'Data.Carbohydrate', 'Data.Fiber', 'Data.Kilocalories', 'Data.Fat.Saturated Fat']
Diabetic_Suitability
1    4536
0    2877
Name: count, dtype: int64
New food_labelled.csv saved!
Training shape: (5930, 5)
Test shape: (1483, 5)


In [3]:
# Decision Tree
# NOTE(review): the target was produced by is_suitable, a fixed threshold rule
# on sugar, carbohydrate and saturated fat, and those columns are still in X.
# A tree can reproduce such thresholds, so a perfect score here most likely
# reflects recovering the labelling rule rather than genuine generalisation.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out split
y_pred_tree = tree_model.predict(X_test)

# Score the predictions, then report both metrics
tree_accuracy = accuracy_score(y_test, y_pred_tree)
tree_report = classification_report(y_test, y_pred_tree)
print("Decision Tree Accuracy:", tree_accuracy)
print("\nClassification Report:\n", tree_report)


Decision Tree Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       587
           1       1.00      1.00      1.00       896

    accuracy                           1.00      1483
   macro avg       1.00      1.00      1.00      1483
weighted avg       1.00      1.00      1.00      1483



In [4]:
# Logistic Regression
# max_iter is raised to 1000 and the seed is fixed at 42, as in the other models.
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model = log_model.fit(X_train, y_train)  # fit() returns the estimator itself

# Predict on the held-out split
y_pred_log = log_model.predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table
print(
    "Logistic Regression Accuracy:",
    accuracy_score(y_test, y_pred_log),
)
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred_log),
)


Logistic Regression Accuracy: 0.9291975724881996

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91       587
           1       0.94      0.95      0.94       896

    accuracy                           0.93      1483
   macro avg       0.93      0.92      0.93      1483
weighted avg       0.93      0.93      0.93      1483

