# Coding Block 1 - Random Forests (and XGBoost)

### Load the packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
import matplotlib.pyplot as plt
'''
...
'''

'\n...\n'

### Read the dataset 
You can also compare processed and non-processed data.

In [2]:
# Load both variants of the diabetes data so the models can be compared on each:
# the file as distributed, and the version saved as 'diabetes_cleaned.csv'.
diab = pd.read_csv('diabetes.csv')
diab_cleaned = pd.read_csv('diabetes_cleaned.csv')

### Split the data and train a Random Forest model
### Evaluate the prediction models using a classification report
### Print the feature importances of the random forest
### Extra: Also train an XGBoost model and compare the results (requires the `xgboost` package to be installed).

In [None]:

# Shared settings so that both models are trained and evaluated identically
# on every dataset.
TEST_SIZE = 0.2      # fraction of rows held out for testing
N_ESTIMATORS = 100   # number of trees / boosting rounds per model
SEED = 42            # fixed seed for the split and for both models


def _fit_and_report(model, model_label, X_train, X_test, y_train, y_test, dataset_name):
    """Fit a model, print its evaluation and plot its feature importances.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit``, ``predict`` and
        ``feature_importances_`` (both classifiers used below do).
    model_label : str
        Human-readable model name used in the printed headers and plot title.
    X_train, X_test : pandas.DataFrame
        Training and test features; column names label the importances.
    y_train, y_test : pandas.Series
        Training and test targets.
    dataset_name : str
        Name of the dataset, shown in the plot title.

    Returns
    -------
    tuple
        ``(y_pred, importances)`` where ``y_pred`` are the predictions on
        ``X_test`` and ``importances`` is a DataFrame with the columns
        ``Feature`` and ``Importance``, sorted by descending importance.
    """
    print(f"\n--- {model_label} Model ---")
    model.fit(X_train, y_train)

    # Evaluate on the held-out test set
    y_pred = model.predict(X_test)
    print(f"\nClassification Report for {model_label}:")
    print(classification_report(y_test, y_pred))

    # Pair each importance with its feature name, most important first
    importances = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print(f"\nFeature Importances for {model_label}:")
    print(importances)

    # Explicit figure/axes objects so the figure can be released afterwards;
    # this helper is called several times inside a loop.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(importances['Feature'], importances['Importance'])
    ax.set_xlabel('Importance')
    ax.set_title(f'{model_label} Feature Importances - {dataset_name}')
    ax.invert_yaxis()  # Display the highest importance at the top
    fig.tight_layout()
    plt.show()
    plt.close(fig)

    return y_pred, importances


datasets = {
    "Original Dataset": diab,
    "Cleaned Dataset": diab_cleaned
}

for dataset_name, dataset in datasets.items():
    print(f"\n{'='*80}")
    print(f"Analysis for {dataset_name}")
    print(f"{'='*80}")

    # Split features and target: the last column is used as the target,
    # every other column as a feature.
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=SEED
    )

    # (i)-(iii) Random Forest: train, classification report, feature importances
    rf_model = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
    y_pred_rf, feature_importances = _fit_and_report(
        rf_model, "Random Forest", X_train, X_test, y_train, y_test, dataset_name
    )

    # (iv) XGBoost: same procedure for a like-for-like comparison
    xgb_model = xgb.XGBClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
    y_pred_xgb, xgb_importances = _fit_and_report(
        xgb_model, "XGBoost", X_train, X_test, y_train, y_test, dataset_name
    )

    # Compare the models. Accuracy is the share of correct test predictions;
    # reusing the predictions avoids running predict() a second time.
    y_true = y_test.to_numpy()
    rf_accuracy = np.mean(y_pred_rf == y_true)
    xgb_accuracy = np.mean(y_pred_xgb == y_true)

    print("\n--- Model Comparison ---")
    print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
    print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")

    # Check if there are differences in the predictions
    disagreements = np.sum(y_pred_rf != y_pred_xgb)
    print(f"Number of disagreements between models: {disagreements} out of {len(y_test)} samples")

ModuleNotFoundError: No module named 'xgboost'