### **Imports**

In [139]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

import plotly.graph_objects as go
from plotly.subplots import make_subplots

### **Constants**

In [140]:
TARGET = "failures" # the target variable

# ONE_BOUND is the bounding area from the extremes (ie. 1 and -1), to remove the models that are too correlated with the target
# ZERO_BOUND is the bounding area from the middle (ie. 0), to remove the models that are too uncorrelated with the target
ONE_BOUND = 0.5 ; ZERO_BOUND = 0.065 
ONE_BOUND = 1 ; ZERO_BOUND = 0


TEST_SIZE = 0.2 # percentage of the dataset to be used as a test set

### **Data Frame and Correlation Matrix**

In [141]:
portugeseDF = pd.read_csv('../data/Portuguese.csv')

# This part is copied from "Dataset.ipynb" to get the confusion matrix
le = LabelEncoder()
for col in portugeseDF.select_dtypes(include=['object']).columns:
    portugeseDF[col] = le.fit_transform(portugeseDF[col])
    
correlation_matrix = portugeseDF.corr()
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)
correlation_matrix = correlation_matrix.mask(mask)

### **New Dataset Functions**

In [142]:
def makingNewDataset(correlationMatrix, dataframe):
    orignalAttributesRow = dict(correlationMatrix.loc[TARGET])
    orignalAttributesColumn = dict(correlationMatrix[TARGET])

    # print(orignalAttributesRow)
    # print(orignalAttributesColumn)

    newAttributes = list()
    removedAttributes = list()

    #True = Vertical, False = Horizontal
    Lineflag = False

    # iterate through features to choose which to keep
    for key in orignalAttributesRow:
        if key == TARGET:
            Lineflag = True
            newAttributes.append(key) # add target to new dataframe 
       
        #Horizontal Correlation
        if Lineflag == False:
            if orignalAttributesRow[key] > ONE_BOUND or orignalAttributesRow[key] < -(ONE_BOUND): removedAttributes.append(key) # high correlation
            elif orignalAttributesRow[key] < ZERO_BOUND and orignalAttributesRow[key] > -(ZERO_BOUND): removedAttributes.append(key) # low correlation
            else: newAttributes.append(key) # add featuers that are in the acceptable range to new dataframe

        #Vertical Correlation
        if Lineflag == True:
            if orignalAttributesColumn[key] > ONE_BOUND or orignalAttributesColumn[key] < -(ONE_BOUND): removedAttributes.append(key) # high correlation
            elif orignalAttributesColumn[key] < ZERO_BOUND and orignalAttributesColumn[key] > -(ZERO_BOUND): removedAttributes.append(key) # low correlation
            else: newAttributes.append(key) # add featuers that are in the acceptable range to new dataframe
    
    # make new dataframe
    newData = {}
    for attribute in newAttributes:
        newData[attribute] = dataframe[attribute]
    newPortugeseDF = pd.DataFrame(newData)

    return [newPortugeseDF, newAttributes, removedAttributes] # return new dataframe, list of kept attributes and list of removed attributes

In [None]:
temp = makingNewDataset(correlation_matrix, portugeseDF); 
newDataset = temp[0]; 
keptAttributes = temp[1]; 
removedAttributes = temp[2]

### **Model**

In [144]:
def runTheModel (dataframe, modelName, modelToRun):
    features = dataframe.drop(columns=[TARGET]); target = dataframe[TARGET]
    featureTrain, featureTest, targetTrain, targetTest = train_test_split(features, target, test_size=TEST_SIZE, random_state=42)
    
    # Standardize features
    scaler = StandardScaler()
    featureTrain = scaler.fit_transform(featureTrain)
    featureTest = scaler.transform(featureTest)

    modelToRun.fit(featureTrain, targetTrain)

    testPredictions = modelToRun.predict(featureTest)
    accuracy = accuracy_score(targetTest, testPredictions)
    print(f"{modelName} Accuracy: \t{accuracy * 100:.3f}%")

    
    # evaluate the model
    print("Confusion Matrix:\n", confusion_matrix(targetTest, testPredictions))
    print("\nClassification Report:\n", classification_report(targetTest, testPredictions))

### **Test Section**

In [145]:
LogisticRegressionModel = LogisticRegression()

import warnings
warnings.filterwarnings("ignore")

print(f"List Kept Features:\t\t\t{keptAttributes}"); print(f"List Removed Features:\t\t\t{removedAttributes}")
print(f"Original / Removed / Kept:\t\t{len(portugeseDF.keys())} / {len(removedAttributes)} / {len(keptAttributes)}")

runTheModel(newDataset, "\nLogistic Regression Model", LogisticRegressionModel)

List Kept Features:			['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']
List Removed Features:			[]
Original / Removed / Kept:		33 / 0 / 34

Logistic Regression Model Accuracy: 	84.615%
Confusion Matrix:
 [[109   2   1   0]
 [ 11   1   0   1]
 [  1   0   0   0]
 [  3   1   0   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92       112
           1       0.25      0.08      0.12        13
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         4

    accuracy                           0.85       130
   macro avg       0.28      0.26      0.26       130
weighted avg       0.78