# **ICS5110 Notebook**

View the web page for this project [here](https://mkenely.com/ics5110).

- [Feature Reference](https://mkenely.com/ics5110/features)
- [Feature Distributions](https://mkenely.com/ics5110/distributions)
- [Correlation Matrix](https://mkenely.com/ics5110/correlation_matrix)
- [Feature vs G3 Scatter Plots](https://mkenely.com/ics5110/scatter_plots)


### **Imports**

In [None]:
import os
import sys

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import pickle

from gradio_implementations import pca_gradio
from gradio_implementations import ensemble_gradio
from gradio_implementations import kmc_gradio
from gradio_implementations import lr_gradio

import matplotlib.pyplot as plt

### **Data**

In [2]:
# Load the raw student records.
portugese_df = pd.read_csv('./data/Portuguese.csv')

# Fixed seed so the train/test split below is identical on every
# "Restart Kernel -> Run All".
RANDOM_SEED = 42

le = LabelEncoder()
encoding_mappings = {}  # column name -> {integer code: original label}

# Label-encode every text column in place so the whole frame is numeric.
# Both the legacy `object` dtype and pandas' dedicated string dtypes count as
# text; checking only `dtype == 'object'` would silently skip the latter.
for column in portugese_df.columns:
    column_values = portugese_df[column]
    if pd.api.types.is_object_dtype(column_values) or pd.api.types.is_string_dtype(column_values):
        portugese_df[column] = le.fit_transform(column_values)
        # Snapshot the classes now: `le` is refit on the next text column.
        encoding_mappings[column] = dict(enumerate(le.classes_))

# Features are every column except the target G3 and the G1/G2 columns.
X = portugese_df.drop(columns=['G1', 'G2', 'G3'])
y = portugese_df['G3']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

### **Models**

#### **Ensemble (David)**

**Imports**

In [12]:
import warnings

from sklearn.linear_model import LogisticRegression # to use Logistic Regression for step of stacking 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier # to use RF and GB as base models + the stacked model
from sklearn.metrics import accuracy_score # to get the accuracy

In [13]:
TARGET = "G3" # name of the column that is predicted and that features are correlated against

# Thresholds used by filter_features() on each feature's correlation with TARGET:
# ONE_BOUND  - a feature is removed when its correlation is above ONE_BOUND or below -ONE_BOUND
#              (i.e. too close to the extremes 1 / -1, too strongly correlated with the target)
# ZERO_BOUND - a feature is removed when its correlation lies strictly between -ZERO_BOUND and
#              ZERO_BOUND (i.e. too close to 0, too weakly correlated with the target)
ONE_BOUND = 0.8
ZERO_BOUND = 0.065

# Passed as `test_size` to train_test_split() in run_the_model().
TEST_SIZE = 0.2

**Models**

In [14]:
# Base learners: their predictions become the inputs of the stacked model.
random_forest_model = RandomForestClassifier(n_estimators=100)
gradient_boosting_model = GradientBoostingClassifier(n_estimators=100)

# Stacked ensemble: a logistic regression is fitted on top of the base
# learners, using 5-fold cross-validation (cv=5).
stacking_ensemble_model = StackingClassifier(
    estimators=[
        ("random_forest", random_forest_model),
        ("gradient_boosting", gradient_boosting_model),
    ],
    final_estimator=LogisticRegression(),
    cv=5,
)

In [15]:
# Pairwise correlations between all columns of the encoded frame.
full_correlations = portugese_df.corr()

# True for every cell strictly above the main diagonal (k=1 keeps the diagonal).
mask = np.triu(np.ones_like(full_correlations, dtype=bool), k=1)

# Replace the masked (upper-triangle) cells with NaN, so each pair of columns
# appears only once; the diagonal and lower triangle keep their values.
correlation_matrix = full_correlations.mask(mask)

In [16]:
def filter_features(correlation_matrix, dataframe):
    """Keep the columns of ``dataframe`` whose correlation with TARGET is in range.

    A column is removed when its correlation with TARGET is above ONE_BOUND or
    below -ONE_BOUND (too strong), or strictly between -ZERO_BOUND and
    ZERO_BOUND (too weak). TARGET itself is always kept. A column whose
    correlation is NaN matches neither rule and is therefore kept.

    Parameters
    ----------
    correlation_matrix : pandas.DataFrame
        Correlation matrix with a row labelled TARGET. It may have one triangle
        masked with NaN; missing entries in the TARGET row are then taken from
        the TARGET column.
    dataframe : pandas.DataFrame
        Frame holding the data columns named in the correlation matrix.

    Returns
    -------
    tuple
        ``(new_dataframe, new_features, removed_features)``: a new frame with
        only the kept columns, the list of kept column names (TARGET included)
        and the list of removed column names, all in matrix column order.
    """
    target_correlations = correlation_matrix.loc[TARGET]

    # With a triangle-masked matrix the TARGET row is only complete when TARGET
    # is the last column. The mirrored values live in the TARGET column, so use
    # them to fill the gaps; otherwise NaN entries would all be silently kept.
    if TARGET in correlation_matrix.columns:
        target_correlations = target_correlations.fillna(correlation_matrix[TARGET])

    new_features = []
    removed_features = []

    for feature, correlation in target_correlations.items():
        if feature == TARGET:
            new_features.append(feature)  # the target always stays in the frame
        elif abs(correlation) > ONE_BOUND:
            removed_features.append(feature)  # too strongly correlated
        elif abs(correlation) < ZERO_BOUND:
            removed_features.append(feature)  # too weakly correlated
        else:
            new_features.append(feature)  # acceptable range (or NaN)

    # Build a fresh frame so the caller's dataframe is left untouched.
    new_dataframe = pd.DataFrame({feature: dataframe[feature] for feature in new_features})

    return new_dataframe, new_features, removed_features


In [17]:
def run_the_model(dataframe, model_name, model_to_run, random_state=None):
    """Fit ``model_to_run`` on a train split of ``dataframe`` and report test accuracy.

    The frame is split into features (every column except TARGET) and target
    (the TARGET column), divided into train/test parts with TEST_SIZE, and the
    model is fitted in place. The accuracy is printed and a scatter plot of
    predicted vs. actual values is shown.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing the TARGET column plus the feature columns.
    model_name : str
        Label used in the printed accuracy line.
    model_to_run : estimator
        Object exposing ``fit`` and ``predict``; it is fitted by this call.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded split; pass an integer for a reproducible one.

    Returns
    -------
    float
        Accuracy on the test split, as a fraction between 0 and 1.
    """
    # Split dataframe into features and target
    features = dataframe.drop(columns=[TARGET])
    target = dataframe[TARGET]

    # Train-test split (seeded only when the caller asks for it)
    feature_train, feature_test, target_train, target_test = train_test_split(
        features, target, test_size=TEST_SIZE, random_state=random_state
    )

    # Train the model
    model_to_run.fit(feature_train, target_train)

    # Make predictions and calculate accuracy
    test_predictions = model_to_run.predict(feature_test)
    accuracy = accuracy_score(target_test, test_predictions)
    print(f"{model_name} Accuracy:\t{accuracy * 100:.3f}%")

    plt.figure(figsize=(8, 6))
    plt.scatter(target_test, test_predictions, alpha=0.6, color='blue')

    # Diagonal reference line: points on it are perfect predictions
    min_val = min(min(target_test), min(test_predictions))
    max_val = max(max(target_test), max(test_predictions))
    plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', linewidth=2)

    # Add labels and title
    plt.title('Prediction vs Actual Values')
    plt.xlabel('Actual G3')
    plt.ylabel('Predicted G3')
    plt.grid(True)
    plt.show()

    # Hand the score back so callers can compare runs instead of parsing stdout.
    return accuracy


In [3]:
def run_tests():
    """Filter features with the current bounds, then train and score the stacked model.

    Reads the module-level ONE_BOUND / ZERO_BOUND, correlation_matrix,
    portugese_df and stacking_ensemble_model. Prints the kept and removed
    features plus their counts, then delegates to run_the_model(). UserWarning
    and RuntimeWarning are silenced for the duration of the run. Returns None.
    """
    print(f"Correlation values in range: {ONE_BOUND} to {ZERO_BOUND} and -{ZERO_BOUND} to -{ONE_BOUND}\n")

    with warnings.catch_warnings():
        # Silence the two warning categories only inside this block.
        for silenced_category in (UserWarning, RuntimeWarning):
            warnings.filterwarnings("ignore", category=silenced_category)

        new_dataset, kept_features, removed_features = filter_features(
            correlation_matrix, portugese_df
        )

        print(f"List of Kept Features:\t\t\t{kept_features}")
        print(f"List of Removed Features:\t\t\t{removed_features}")
        print(
            f"Original / Removed / Kept:\t\t"
            f"{len(portugese_df.keys())} / {len(removed_features)} / {len(kept_features)}"
        )

        run_the_model(new_dataset, "\nStacking Ensemble Model", stacking_ensemble_model)

        # Disabled: persist the selection and the fitted model for the Gradio app.
        # with open('../gradio/ensemble_gradio/models/kept_features.pkl', 'wb') as f:
        #     pickle.dump(kept_features, f)
        #
        # with open('../gradio/ensemble_gradio/models/stacking_ensemble_model.pkl', 'wb') as f:
        #     pickle.dump(stacking_ensemble_model, f)

In [None]:
# Threshold presets tried for the ensemble. To reproduce one, assign its pair
# below and call run_tests():
#
#   All Features ............................ ONE_BOUND = 1   ; ZERO_BOUND = 0
#   No High Correlation Features ............ ONE_BOUND = 0.8 ; ZERO_BOUND = 0
#   No Low Correlation Features ............. ONE_BOUND = 1   ; ZERO_BOUND = 0.065
#   No High and Low Correlation Features .... ONE_BOUND = 0.8 ; ZERO_BOUND = 0.065
#
# Active preset: No High and Low Correlation Features
ONE_BOUND = 0.8
ZERO_BOUND = 0.065
run_tests()

### **Gradio**

#### **Ensemble**

In [None]:
# `kept_features` is only a local variable inside run_tests(), so on a fresh
# kernel it does not exist here. Recompute the selection with the bounds that
# are currently set (the ones used by the last run_tests() call).
_, kept_features, _ = filter_features(correlation_matrix, portugese_df)

# The app takes input features only, so leave the target out. A new list is
# built instead of calling .remove(), which keeps this cell safe to re-run.
gradio_features = [feature for feature in kept_features if feature != TARGET]
ensemble_gradio.make_gradio(gradio_features, stacking_ensemble_model)

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.
