In [1]:
# Aim: build a model that predicts the final grade (G3) from the remaining columns of the dataset,
# i.e. G3 is the target label.

## **Imports & Constants**

In [2]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
# Column the models are trained to predict.
TARGET = "G3"

# Correlation thresholds read by makingNewDataset. Both start as placeholders;
# the test cell assigns the real values before each run.
ONE_BOUND = 0
ZERO_BOUND = 0

# Share of the rows passed to train_test_split as test_size.
TEST_SIZE = 0.2

## **The Dataset and Correlation Matrix**

In [4]:
# Load the dataset from the CSV file.
portugeseDF = pd.read_csv('../data/Portuguese.csv')

# Replace every object-dtype column, in place, with the labels produced by LabelEncoder.
# The single encoder is re-fitted per column, so after the loop it only
# remembers the classes of the last column it processed.
le = LabelEncoder()
objectColumns = portugeseDF.select_dtypes(include=['object']).columns
for col in objectColumns:
    encodedValues = le.fit_transform(portugeseDF[col])
    portugeseDF[col] = encodedValues

# Pairwise correlations of all (now numeric) columns.
correlation_matrix = portugeseDF.corr()

# Flag everything strictly above the diagonal, then keep only the unflagged
# cells: the result holds the diagonal and lower triangle, NaN elsewhere.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
correlation_matrix = correlation_matrix.where(~mask)

## **Different Models**

In [5]:
# Display name -> estimator instance. The keys are the labels printed by
# runTheModel, which compares against "Support Vector Machine" verbatim.
modelsDictionary = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Support Vector Machine": SVC(kernel='rbf'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Direct handles to the very same estimator objects stored in the dictionary.
logisticRegressionModel = modelsDictionary["Logistic Regression"]
SVCModel = modelsDictionary["Support Vector Machine"]
randomForestModel = modelsDictionary["Random Forest"]
gradientBoostingModel = modelsDictionary["Gradient Boosting"]

## **Functions to Create New Dataset and Run the Models**

In [6]:
def makingNewDataset(correlationMatrix, dataframe, oneBound=None, zeroBound=None, target=None):
    """Return a copy of ``dataframe`` reduced to features moderately correlated with the target.

    A feature is kept when the absolute value of its correlation with the
    target lies in the closed range ``[zeroBound, oneBound]``. The target
    column itself is always included, as the last column of the result.

    Parameters
    ----------
    correlationMatrix : pd.DataFrame
        Correlation matrix labelled by column name. It may be complete or have
        one triangle replaced by NaN; for a masked matrix the target's
        correlations are collected from both its row and its column.
    dataframe : pd.DataFrame
        Frame the selected columns are copied from.
    oneBound : float, optional
        Upper limit on ``|correlation|``. Defaults to the module-level
        ``ONE_BOUND``, read at call time.
    zeroBound : float, optional
        Lower limit on ``|correlation|``. Defaults to the module-level
        ``ZERO_BOUND``, read at call time.
    target : str, optional
        Name of the label column. Defaults to the module-level ``TARGET``.

    Returns
    -------
    pd.DataFrame
        New frame holding the kept feature columns followed by the target.

    Raises
    ------
    KeyError
        If ``target`` is not a row of ``correlationMatrix`` or a selected
        column is missing from ``dataframe``.
    """
    # Fall back to the notebook-level settings so two-argument calls keep working.
    oneBound = ONE_BOUND if oneBound is None else oneBound
    zeroBound = ZERO_BOUND if zeroBound is None else zeroBound
    target = TARGET if target is None else target

    targetCorrelations = correlationMatrix.loc[target]
    if target in correlationMatrix.columns:
        # In a triangle-masked matrix the row only covers the columns on one
        # side of the target; the remaining values live in the target's
        # column. Without this, masked NaN entries would slip through the filter.
        targetCorrelations = targetCorrelations.fillna(correlationMatrix[target])

    strength = targetCorrelations.abs()
    # NaN compares False in both tests, so a feature whose correlation is
    # genuinely undefined is kept rather than dropped.
    rejected = (strength > oneBound) | (strength < zeroBound)

    # The target is handled separately so it appears exactly once, at the end.
    keptFeatures = [name for name in strength[~rejected].index if name != target]
    return dataframe[keptFeatures + [target]].copy()

In [7]:
def runTheModel(dataframe, modelName, modelToRun):
    """Fit ``modelToRun`` on a train split of ``dataframe``, print its test accuracy and return it.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Data containing the feature columns plus the ``TARGET`` column.
    modelName : str
        Label printed in front of the score.
    modelToRun : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place, so the
        instance passed in is modified by this call.

    Returns
    -------
    float
        The value computed by ``accuracy_score`` on the held-out rows (the
        printed figure is this value times 100).
    """
    features = dataframe.drop(columns=[TARGET])
    target = dataframe[TARGET]

    # Fixed random_state so repeated runs reproduce the same split.
    featureTrain, featureTest, targetTrain, targetTest = train_test_split(
        features, target, test_size=TEST_SIZE, random_state=42
    )

    modelToRun.fit(featureTrain, targetTrain)
    testPredictions = modelToRun.predict(featureTest)
    accuracy = accuracy_score(targetTest, testPredictions)

    # The longer SVM label gets one tab instead of two so the scores line up.
    padding = "\t" if modelName == "Support Vector Machine" else "\t\t"
    print(f"{modelName} Accuracy: {padding}{accuracy * 100:.3f}%")

    # Hand the score back so callers can collect and compare runs.
    return accuracy

## **Test Section**

In [8]:
# Different Tests to decide on best hyper parameters
#
# Tests 1-6 share the upper bound (0.8) and differ only in the lower bound.
# makingNewDataset reads the module-level ONE_BOUND / ZERO_BOUND, so the loop
# assigns those names directly instead of using a throwaway loop variable.
ONE_BOUND = 0.8
for ZERO_BOUND in (0.1, 0.05, 0.06, 0.07, 0.08, 0.09):
    print(f"- - When correlation values in range: {ONE_BOUND} to {ZERO_BOUND} and -{ZERO_BOUND} to -{ONE_BOUND} - - ")

    newDataset = makingNewDataset(correlation_matrix, portugeseDF)
    print(f"Number of Removed Columns: {len(portugeseDF.columns) - len(newDataset.columns)}")
    for modelName in modelsDictionary:
        runTheModel(newDataset, modelName, modelsDictionary[modelName])
    print()

- - When correlation values in range: 0.8 to 0.1 and -0.1 to -0.8 - - 
Number of Removed Columns: 16
Logistic Regression Accuracy: 		16.923%
Support Vector Machine Accuracy: 	21.538%
Random Forest Accuracy: 		12.308%
Gradient Boosting Accuracy: 		14.615%

- - When correlation values in range: 0.8 to 0.05 and -0.05 to -0.8 - - 
Number of Removed Columns: 5
Logistic Regression Accuracy: 		17.692%
Support Vector Machine Accuracy: 	18.462%
Random Forest Accuracy: 		16.923%
Gradient Boosting Accuracy: 		16.923%

- - When correlation values in range: 0.8 to 0.06 and -0.06 to -0.8 - - 
Number of Removed Columns: 9
Logistic Regression Accuracy: 		14.615%
Support Vector Machine Accuracy: 	19.231%
Random Forest Accuracy: 		19.231%
Gradient Boosting Accuracy: 		17.692%

- - When correlation values in range: 0.8 to 0.07 and -0.07 to -0.8 - - 
Number of Removed Columns: 11
Logistic Regression Accuracy: 		16.923%
Support Vector Machine Accuracy: 	17.692%
Random Forest Accuracy: 		15.385%
Gradient Bo