# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pickle
import os

# ---------------------------------------------------------------------------
# Initial training pipeline: load the labelled dataset, keep rows with a valid
# Temperament, split, scale, oversample the training split with SMOTE, fit a
# GradientBoostingClassifier and pickle the model and scaler.
# ---------------------------------------------------------------------------

# Load the original dataset
try:
    df = pd.read_csv('food_nutrient_temperament.csv')
except FileNotFoundError:
    print("Error: File 'food_nutrient_temperament.csv' not found.")
    # SystemExit works in plain scripts and notebooks alike; the bare exit()
    # helper is only injected by the `site` module.
    raise SystemExit(1)

# Preprocessing
required_columns = ['food_ description', 'Temperament']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    # Report the columns that are actually absent, not the full required list.
    print(f"Error: Missing columns {missing_columns} in DataFrame.")
    raise SystemExit(1)

# Every column other than the description and the label is used as a feature;
# missing feature values are replaced with 0.
X = df.drop(columns=required_columns).fillna(0)
y = df['Temperament']
feature_names = X.columns

# Filter valid Temperament values
valid_temperaments = [0, 1, 2]  # 0: Cold, 1: Hot, 2: Moderate
valid_mask = y.isin(valid_temperaments)
X = X[valid_mask]
y = y[valid_mask]
print("Number of valid samples:", len(y))
print("Class Distribution:\n", y.value_counts())

# Split data BEFORE scaling so that the test rows do not influence the
# scaler's mean/variance (fitting on the full set leaks test statistics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale data: fit on the training split only, then apply everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for any later code that expects the fully scaled matrix.
X_scaled = scaler.transform(X)

# Apply SMOTE (training split only). The sampler is created outside the try
# block so the name is always bound for later reuse.
smote = SMOTE(random_state=42, k_neighbors=3)
try:
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    print("Class Distribution after SMOTE:\n", pd.Series(y_train_res).value_counts())
except ValueError as e:
    # Typically raised when a class has too few samples for k_neighbors;
    # fall back to the un-resampled training data.
    print("SMOTE failed:", e)
    X_train_res, y_train_res = X_train, y_train

# Train Gradient Boosting model (using best parameters)
gb = GradientBoostingClassifier(
    n_estimators=50, max_depth=3, learning_rate=0.1, random_state=42
)
gb.fit(X_train_res, y_train_res)

# Save the model and scaler
with open('gb_model.pkl', 'wb') as f:
    pickle.dump(gb, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Load new dataset (31 samples, no Temperament)
new_data_file = 'New data food nutrient.csv'
try:
    new_data = pd.read_csv(new_data_file)
except FileNotFoundError:
    # Name the file that was actually requested.
    print(f"Error: File '{new_data_file}' not found.")
    raise SystemExit(1)

# Verify new data structure
if 'food_ description' not in new_data.columns:
    print("Error: 'food_ description' column missing in new data.")
    raise SystemExit(1)

# Compare feature *names*, not just the column count: a file with the right
# number of differently named columns would otherwise pass this check and
# fail with a KeyError on the selection below.
missing_features = [col for col in feature_names if col not in new_data.columns]
if missing_features:
    print(f"Error: New data must have the same features as original data: {feature_names.tolist()}")
    print(f"Missing features: {missing_features}")
    raise SystemExit(1)

# Ensure new_data has the same features, in the training column order
# (any extra columns are dropped by this selection).
new_data = new_data[feature_names.tolist() + ['food_ description']].fillna(0)

# Function to predict temperament
def predict_temperament(new_data, model, scaler, feature_names):
    """Predict temperament classes and class probabilities for new foods.

    Args:
        new_data: DataFrame containing at least the columns listed in
            ``feature_names``; any other columns (description, labels, ...)
            are ignored.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        scaler: Fitted transformer exposing ``transform``.
        feature_names: Feature column names in the order used for training.

    Returns:
        Tuple ``(predictions, probabilities)`` as returned by
        ``model.predict`` and ``model.predict_proba``.

    Raises:
        KeyError: If a name in ``feature_names`` is not a column of
            ``new_data``.
    """
    # Select by name so the scaler/model always see the training features in
    # the training order, regardless of how new_data's columns are arranged.
    X_new = new_data[list(feature_names)].fillna(0)
    X_new_scaled = scaler.transform(X_new)
    predictions = model.predict(X_new_scaled)
    probabilities = model.predict_proba(X_new_scaled)
    return predictions, probabilities

# Function for human-in-the-loop validation
def human_in_the_loop(new_data, model, scaler, feature_names, dataset_file='food_nutrient_temperament.csv'):
    """Interactively confirm model predictions and append them to the dataset.

    For every row of ``new_data`` the predicted temperament and the class
    probabilities are printed, and the user is prompted for the correct label
    (0, 1, 2) or -1 to skip the row. Rows that received a label are appended
    to ``dataset_file``, which is rewritten in place.

    Args:
        new_data: DataFrame with a 'food_ description' column plus the
            feature columns expected by ``predict_temperament``.
        model: Fitted classifier (``predict`` / ``predict_proba``).
        scaler: Fitted scaler passed through to ``predict_temperament``.
        feature_names: Training feature names, passed through to
            ``predict_temperament``.
        dataset_file: CSV file the validated rows are appended to.

    Returns:
        Tuple ``(validated_data, count)``: the labelled rows (with an integer
        'Temperament' column) and how many there are.
    """
    predictions, probabilities = predict_temperament(new_data, model, scaler, feature_names)
    temperament_map = {0: 'Cold', 1: 'Hot', 2: 'Moderate'}
    valid_choices = (-1, *temperament_map)

    # predict_proba columns follow model.classes_, which is not guaranteed to
    # be 0..n-1 (e.g. when a class was absent from the training data), so the
    # probabilities are labelled from classes_ rather than by position.
    class_labels = getattr(model, 'classes_', None)

    descriptions = new_data['food_ description'].tolist()
    validated_temperaments = []

    for food_desc, pred, prob in zip(descriptions, predictions, probabilities):
        labels = range(len(prob)) if class_labels is None else class_labels
        # .get() keeps the session alive if the model emits an unmapped label.
        pred_label = temperament_map.get(pred, str(pred))
        max_prob = np.max(prob) * 100
        prob_dist = {
            temperament_map.get(label, str(label)): f"{p*100:.2f}%"
            for label, p in zip(labels, prob)
        }

        print(f"\nFood: {food_desc}")
        print(f"Predicted Temperament: {pred_label} (Confidence: {max_prob:.2f}%)")
        print(f"Probability Distribution: {prob_dist}")
        print("Options: [0: Cold, 1: Hot, 2: Moderate, -1: Skip]")

        # Re-prompt until the user enters one of the accepted integers.
        while True:
            user_input = input("Enter correct Temperament (0, 1, 2, or -1 to skip): ")
            try:
                user_input = int(user_input)
            except ValueError:
                print("Invalid input. Please enter a number (0, 1, 2, or -1).")
                continue
            if user_input in valid_choices:
                break
            print("Invalid input. Please enter 0, 1, 2, or -1.")

        # Skipped rows are recorded as None and filtered out below.
        validated_temperaments.append(user_input if user_input != -1 else None)

    # Add validated temperaments
    validated_data = new_data.copy()
    validated_data['Temperament'] = validated_temperaments
    validated_data = validated_data[validated_data['Temperament'].notnull()].copy()
    # Mixing None with ints turns the column into floats; restore integer
    # labels so the CSV keeps writing 0/1/2 rather than 0.0/1.0/2.0.
    validated_data['Temperament'] = validated_data['Temperament'].astype(int)

    if not validated_data.empty:
        df_updated = pd.concat([pd.read_csv(dataset_file), validated_data], ignore_index=True)
        df_updated.to_csv(dataset_file, index=False)
        print(f"\nValidated data added to {dataset_file}. New dataset size: {len(df_updated)}")
    else:
        print("\nNo validated data added.")

    return validated_data, len(validated_data)

# Run human-in-the-loop validation
validated_data, num_validated = human_in_the_loop(new_data, gb, scaler, feature_names)

# Retrain model with updated dataset
if num_validated > 0:
    # Re-read the dataset file, which now includes the newly validated rows.
    df_updated = pd.read_csv('food_nutrient_temperament.csv')
    X_updated = df_updated.drop(columns=['food_ description', 'Temperament']).fillna(0)
    y_updated = df_updated['Temperament']
    valid_mask = y_updated.isin(valid_temperaments)
    X_updated = X_updated[valid_mask]
    # Labels may have been read back as floats; normalise to integer classes.
    y_updated = y_updated[valid_mask].astype(int)

    # Split first, then fit the scaler on the training split only so the
    # held-out rows do not leak into the preprocessing statistics.
    X_train_up_raw, X_test_up_raw, y_train_up, y_test_up = train_test_split(
        X_updated, y_updated, test_size=0.2, random_state=42, stratify=y_updated
    )
    X_train_up = scaler.fit_transform(X_train_up_raw)
    X_test_up = scaler.transform(X_test_up_raw)
    # Kept for any later code that expects the fully scaled matrix.
    X_updated_scaled = scaler.transform(X_updated)

    # Only the resampling step is guarded, so a failure elsewhere is not
    # misreported as a SMOTE error. On failure, train on the un-resampled
    # split, mirroring the initial training fallback.
    try:
        X_train_res_up, y_train_res_up = smote.fit_resample(X_train_up, y_train_up)
    except ValueError as e:
        print("SMOTE failed for updated dataset:", e)
        X_train_res_up, y_train_res_up = X_train_up, y_train_up

    gb.fit(X_train_res_up, y_train_res_up)
    y_pred_up = gb.predict(X_test_up)
    print("\nUpdated Gradient Boosting Test Accuracy:", accuracy_score(y_test_up, y_pred_up))
    # Passing labels explicitly keeps the three target names aligned even if
    # a class happens to be missing from the test split.
    print("Updated Classification Report:\n", classification_report(
        y_test_up, y_pred_up,
        labels=valid_temperaments,
        target_names=['Cold', 'Hot', 'Moderate'],
    ))

    # Save updated model together with the scaler it was trained with; the
    # scaler was refitted above, so the earlier 'scaler.pkl' no longer matches.
    with open('gb_model_updated.pkl', 'wb') as f:
        pickle.dump(gb, f)
    with open('scaler_updated.pkl', 'wb') as f:
        pickle.dump(scaler, f)
else:
    print("\nNo retraining performed due to no validated data.")