# Imported Libraries

In [3]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Helper Functions

In [None]:
def process_and_load_data(data, target = str):
    """Split a DataFrame into train/test/validation sets and standardize the features.

    The rows are split 60% / 20% / 20% into training, test and validation
    sets (both splits use ``random_state=42``). A ``StandardScaler`` is fitted
    on the training features only and then applied to the test and validation
    features, so no statistics from held-out rows leak into the scaling.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the target column.
    target : column label
        Name of the column in ``data`` to predict. The default (the ``str``
        type itself) is kept only for signature compatibility and means
        "not provided"; calling without a real column name raises ValueError.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)``. The ``X_*``
        values are the scaler's outputs; the ``y_*`` values are slices of
        ``data[target]`` and keep the original index labels.

    Raises
    ------
    TypeError
        If ``data`` is not a pandas DataFrame.
    ValueError
        If ``target`` is not supplied.
    KeyError
        Raised by pandas if ``target`` is not a column of ``data``.
    """
    # Validate explicitly instead of hiding every failure behind a bare except:
    # a missing column or a scaling error should not be reported as a type problem.
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input needs to be a Pandas DataFrame.")
    if target is str:
        raise ValueError("target must be the name of the label column in data.")

    X = data.drop(target, axis = 1)
    y = data[target]

    # First hold out 20% of all rows for testing ...
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # ... then 25% of the remaining 80% (i.e. 20% of all rows) for validation.
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

    # Fit the scaler on training data only; reuse its statistics for the other sets.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_val = scaler.transform(X_val)

    return X_train, X_test, X_val, y_train, y_test, y_val

In [2]:
def _group_metrics(y_true, y_pred):
    """Return (accuracy, recall, precision) for one group; NaNs if the group is empty."""
    if len(y_true) == 0:
        nan = float('nan')
        return nan, nan, nan
    return (
        accuracy_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        precision_score(y_true, y_pred),
    )


def model_assessment(input, predictions, actuals):
    """Report accuracy, recall and precision overall and per gender group.

    Rows are matched by position: the i-th row of ``input`` must correspond
    to the i-th entry of ``predictions`` and of ``actuals``. Index labels are
    deliberately ignored, so frames whose index is not 0..n-1 (for example
    the output of ``train_test_split``) are handled correctly.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame with a ``'Gender'`` column; rows equal to 1 are reported as
        'Male' and rows equal to 2 as 'Female'. (The name shadows the builtin
        but is kept so existing keyword callers keep working.)
    predictions : array-like
        Predicted labels, one per row of ``input``.
    actuals : array-like
        True labels, one per row of ``input``.

    Returns
    -------
    pandas.DataFrame
        One row per group ('Overall', 'Female', 'Male') with the columns
        'Group', 'Accuracy', 'Recall' and 'Precision'. A group with no rows
        gets NaN for all three metrics.

    Raises
    ------
    ValueError
        If the three inputs do not have the same number of rows.
    """
    y_pred = np.asarray(predictions)
    y_true = np.asarray(actuals)
    n_rows = len(input)
    if len(y_pred) != n_rows or len(y_true) != n_rows:
        raise ValueError(
            "input, predictions and actuals must have the same number of rows "
            f"(got {n_rows}, {len(y_pred)} and {len(y_true)})."
        )

    # Boolean masks select rows by position, so the same mask is valid for
    # both arrays no matter what index labels the frame carries.
    gender = input['Gender'].to_numpy()
    group_masks = [
        ('Overall', np.ones(n_rows, dtype=bool)),
        ('Female', gender == 2),
        ('Male', gender == 1),
    ]

    rows = []
    for group_name, mask in group_masks:
        accuracy, recall, precision = _group_metrics(y_true[mask], y_pred[mask])
        rows.append({
            'Group': group_name,
            'Accuracy': accuracy,
            'Recall': recall,
            'Precision': precision,
        })

    return pd.DataFrame(rows, columns=['Group', 'Accuracy', 'Recall', 'Precision'])

# Base Logistic Regression Model

In [4]:

# Baseline classifier with scikit-learn's default hyperparameters.
# Uses the explicitly imported class: `import sklearn` alone does not
# guarantee that the `sklearn.linear_model` submodule has been loaded.
log_reg = LogisticRegression()