# Holds functions for pre-processing and machine learning models
**Author:** Natalie Brown 


**Contents:**
* **Pre-processing**
    * normalization techniques
    * OneHot Encoding
* **Feature Selection**
    * PCA (principal component analysis)
* **Machine Learning**
    * linear regression
    * logistic regression
    * decision trees
    * gradient boosting (scikit-learn `GradientBoostingClassifier`)
* **Evaluation Metrics**
    * linear metrics
    * classification metrics


---
# Import libraries

In [111]:
# wrangling
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# pre-processing
from sklearn.preprocessing import StandardScaler, FunctionTransformer, Normalizer # transform / normalize
from sklearn.preprocessing import OneHotEncoder # encode categorical features


# feature selection
from sklearn.decomposition import PCA

# modeling
from sklearn.model_selection import train_test_split # split data
from sklearn.linear_model import LogisticRegression, LinearRegression # linear
from sklearn.tree import DecisionTreeClassifier # tree
from sklearn.ensemble import GradientBoostingClassifier # ensemble
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, mean_squared_error, r2_score # evaluation metrics

---
# Load data

In [114]:
# loads data
def load_data(file_name, file_type, sheet_name=None, sep=','):
    """Load a CSV or Excel file with pandas.

    Parameters
    ----------
    file_name : str, path or file-like
        Location of the file; passed straight to the pandas reader.
    file_type : str
        Either 'csv' or 'excel'.
    sheet_name : optional
        Forwarded to ``pd.read_excel`` (ignored for CSV files). Per the pandas
        documentation, ``None`` makes ``read_excel`` return a dict holding
        every sheet instead of a single data frame.
    sep : str, default ','
        Field delimiter used for CSV files.

    Returns
    -------
    The object produced by the pandas reader, or ``None`` when ``file_type``
    is not supported (a hint is printed in that case).
    """
    if file_type == 'csv':
        # the separator used to be the literal text 'delimeter', which made
        # ordinary comma-separated files load as a single column
        df = pd.read_csv(file_name, sep=sep, encoding='utf-8')

    elif file_type == 'excel':
        df = pd.read_excel(file_name, sheet_name=sheet_name)

    else:
        print(f'Enter file type\n-csv\n-excel')
        df = None

    return df

---
# Pre-Processing

### normalization
---

In [49]:
'''
takes a data frame, a list with column(s) to normalize, and the normalization method as input
returns the original dataframe merged with the new normalized columns
new columns are named <column>_<suffix>, where the suffix depends on the method

method should be one of these:
- NORM (row-wise unit-norm scaling, suffix _NORM)
- LN (natural log transformation, suffix _LN)
- SCALE (z-score standardization, suffix _SCALED)
'''

def normalize_features(df, cols_to_norm, method):
    """Append transformed copies of the selected columns to a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    cols_to_norm : list of str (a single column name is also accepted)
        Columns to transform.
    method : str
        'SCALE' -> StandardScaler, new columns end in '_SCALED'
        'LN'    -> natural log via FunctionTransformer(np.log), suffix '_LN'
        'NORM'  -> Normalizer, suffix '_NORM'

    Returns
    -------
    pandas.DataFrame
        ``df`` with the transformed columns added on the right. For any other
        ``method`` a hint is printed and an unchanged copy of ``df`` is returned.
    """
    # a bare string would otherwise be iterated character by character
    if isinstance(cols_to_norm, str):
        cols_to_norm = [cols_to_norm]

    if method == 'SCALE':
        transformer, suffix = StandardScaler(), '_SCALED'
    elif method == 'LN':
        transformer, suffix = FunctionTransformer(np.log, validate=True), '_LN'
    elif method == 'NORM':
        transformer, suffix = Normalizer(), '_NORM'
    else:
        print('no / incorrect method chosen\nenter:- NORM\n-LN\n-SCALE')
        return df.copy()

    # apply and create new columns
    prepro_data = np.asarray(transformer.fit_transform(df[cols_to_norm]))

    # reuse df's index: concat aligns on index labels, so a fresh 0..n-1 index
    # would misalign rows whenever df is filtered, shuffled or otherwise
    # not indexed 0..n-1
    prepro_df = pd.DataFrame(
        prepro_data,
        columns=[col + suffix for col in cols_to_norm],
        index=df.index,
    )

    # combine the original data frame with the transformed columns
    return pd.concat([df, prepro_df], axis=1)


### categorical encoding
---

In [43]:
''' 
takes the data frame, features to be encoded
returns data frame with the original features dropped, replaced with the new dummies
'''

def encode_categorical(df, cols_to_encode):
    """One-hot encode the given columns and swap them in for the originals.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    cols_to_encode : list of str
        Categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``cols_to_encode``, followed by the dummy columns
        (named by ``encoder.get_feature_names_out``), on the same index.
    """
    # initialize encoder; the dense-output keyword is `sparse_output` in
    # current scikit-learn and `sparse` in older releases, so try the new
    # spelling first and fall back if the constructor rejects it
    try:
        encoder = OneHotEncoder(drop='if_binary', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop='if_binary', sparse=False)

    # apply encoder
    encoded_data = encoder.fit_transform(df[cols_to_encode])

    # create data frame from encoded data, aligned with the input rows
    encoded_columns = encoder.get_feature_names_out(cols_to_encode)
    encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=df.index)

    # drop original columns from the data frame and merge the encoded columns
    return pd.concat([df.drop(columns=cols_to_encode), encoded_df], axis=1)


---
# Feature Selection

### principal component analysis
---

In [59]:
'''
function to perform pca, which helps determine which features to use; especially helpful with highly correlated data
takes data frame and number of principal components as input
returns a dataframe with the PCs
'''

def pca(df, n_components):
    """Project a data frame onto its principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric input features.
    n_components
        Passed to ``sklearn.decomposition.PCA``.

    Returns
    -------
    pandas.DataFrame
        One column per fitted component ('PC1', 'PC2', ...), on the same index
        as ``df``.

    Side effects
    ------------
    For each component, prints the four features with the largest absolute
    loading (the printed value is the absolute loading).
    """
    # initialize and fit to data
    model = PCA(n_components=n_components)
    model.fit(df)

    # get pca data
    pca_data = model.transform(df)

    # name components from what was actually fitted, so a non-integer
    # n_components (accepted by PCA) does not break the naming
    component_names = [f'PC{i + 1}' for i in range(model.n_components_)]

    # put results into a dataframe, keeping the input row labels
    pca_df = pd.DataFrame(pca_data, columns=component_names, index=df.index)

    # loadings: one row per component, one column per original feature
    loadings = pd.DataFrame(
        model.components_,
        columns=df.columns,
        index=component_names,
    )

    for pc in loadings.index:
        print(f"\n{pc}:")
        # sort features by their absolute contribution to the current PC
        top_features = loadings.loc[pc].abs().sort_values(ascending=False).head(4)
        for feature, loading in top_features.items():
            print(f"Feature: {feature}, Loading: {loading:.4f}")

    # return the data frame
    return pca_df
    

---
# Machine Learning

### linear regression
---

In [65]:
# linear regression model
def linear_regression(df,target,train_size):
    """Split ``df`` into train / validation / test sets and fit a LinearRegression.

    ``train_size`` is the fraction of rows used for training; the rows left
    over are divided half-and-half between validation and test. The achieved
    percentages are printed as a sanity check.

    Returns the tuple
    ``(X_val, X_test, y_val, y_test, X_train, y_train, fitted_model)``.
    """
    # inputs and target
    features = df.drop(columns=[target])
    labels = df[target]

    # first cut: training rows versus everything else
    holdout_fraction = 1 - train_size
    X_train, X_rest, y_train, y_rest = train_test_split(
        features, labels, test_size=holdout_fraction
    )

    # second cut: halve the held-out rows into validation and test
    X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=.5)

    # sanity check: share of the full data frame that landed in each split
    train_pct = (X_train.shape[0] / df.shape[0]) * 100
    test_pct = (X_test.shape[0] / df.shape[0]) * 100
    val_pct = (X_val.shape[0] / df.shape[0]) * 100
    print(
        "\n"
        f"    Training: {train_pct:.2f}%\n"
        f"    Test: {test_pct:.2f}%\n"
        f"    Validation: {val_pct:.2f}%\n\n"
    )

    # train on the training split only
    fitted_model = LinearRegression().fit(X_train, y_train)

    return X_val, X_test, y_val, y_test, X_train, y_train, fitted_model
    

### logistic regression
---

In [67]:
# define function for logistic regression model
def logistic_regression(df,target,train_size,test_size):
    """Split ``df`` into train / validation / test sets and fit a LogisticRegression.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and the target column.
    target : str
        Name of the target column.
    train_size : float
        Fraction of all rows used for training; must lie strictly between 0 and 1.
    test_size : float
        Fraction of all rows used for testing; must be positive and smaller
        than ``1 - train_size`` so that some rows remain for validation.

    Returns
    -------
    tuple
        ``(X_val, X_test, y_val, y_test, X_train, y_train, fitted_model)``.

    Raises
    ------
    ValueError
        If the two fractions do not leave room for all three splits.
    """
    # validate up front: without this, e.g. 0.8 / 0.2 produces a second-split
    # ratio >= 1 and scikit-learn fails with a message about a value the
    # caller never passed
    if not 0 < train_size < 1:
        raise ValueError(f'train_size must be between 0 and 1 (exclusive), got {train_size}')
    remaining = 1 - train_size
    if not 0 < test_size < remaining:
        raise ValueError(
            f'test_size must be positive and smaller than 1 - train_size '
            f'({remaining:.4f}) so a validation set remains, got {test_size}'
        )

    # define target and inputs
    X = df.drop(columns=[target]) # inputs
    y = df[target] # target

    # split the data into the training set and a held-out remainder
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=remaining)

    # share of the held-out rows that goes to the test set
    test_share_of_remainder = test_size / remaining

    # split the held-out rows into validation and test sets
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_share_of_remainder)

    # sanity check for the achieved split sizes (kept in separate names so the
    # test_size argument is not overwritten)
    training_pct = (X_train.shape[0] / df.shape[0]) * 100
    test_pct = (X_test.shape[0] / df.shape[0]) * 100
    validation_pct = (X_val.shape[0] / df.shape[0]) * 100

    print(f'''
    Training: {training_pct:.2f}%
    Test: {test_pct:.2f}%
    Validation: {validation_pct:.2f}%\n\n''')

    # initialize model and train
    lg = LogisticRegression()

    return X_val, X_test, y_val, y_test, X_train, y_train, lg.fit(X_train, y_train) # train

### decision tree
---

* **DecisionTreeClassifier**(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, monotonic_cst=None)

In [72]:
def decision_tree(df,target,train_size, depth, minimum_splits, minimum_leaves):
    """Split ``df`` into train / validation / test sets and fit a DecisionTreeClassifier.

    ``train_size`` is the training fraction; the leftover rows are divided
    equally between validation and test. ``depth``, ``minimum_splits`` and
    ``minimum_leaves`` are handed to the classifier as ``max_depth``,
    ``min_samples_split`` and ``min_samples_leaf``. The achieved split
    percentages are printed.

    Returns the tuple
    ``(X_val, X_test, y_val, y_test, X_train, y_train, fitted_model)``.
    """
    total_rows = df.shape[0]

    # separate predictors from the target column
    predictors = df.drop(columns=[target])
    outcome = df[target]

    # carve off the training rows, then halve what is left
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        predictors, outcome, test_size=(1 - train_size)
    )
    X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=.5)

    # sanity check: report how the rows were distributed
    share_train = (X_train.shape[0] / total_rows) * 100
    share_test = (X_test.shape[0] / total_rows) * 100
    share_val = (X_val.shape[0] / total_rows) * 100
    summary = (
        f"\n    Training: {share_train:.2f}%"
        f"\n    Test: {share_test:.2f}%"
        f"\n    Validation: {share_val:.2f}%\n\n"
    )
    print(summary)

    # configure the tree and train it on the training rows
    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=minimum_splits,
        min_samples_leaf=minimum_leaves,
    )
    tree = tree.fit(X_train, y_train)

    return X_val, X_test, y_val, y_test, X_train, y_train, tree

### gradient boosting
---

* **GradientBoostingClassifier**(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

In [78]:
def gradient_boost(df, target, train_size, estimators, learning_rate, max_depth):
    """Split ``df`` into train / validation / test sets and fit a GradientBoostingClassifier.

    ``train_size`` is the training fraction; the rest is divided evenly into
    validation and test rows. ``estimators`` is passed as ``n_estimators``;
    ``learning_rate`` (should be a decimal) and ``max_depth`` are passed
    through unchanged, and the classifier uses ``random_state=0``. The
    achieved split percentages are printed.

    Returns the tuple
    ``(X_val, X_test, y_val, y_test, X_train, y_train, fitted_model)``.
    """
    # inputs / target
    inputs, labels = df.drop(columns=[target]), df[target]

    # training rows first, then validation and test from the remainder
    X_train, X_other, y_train, y_other = train_test_split(inputs, labels, test_size=(1-train_size))
    X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, test_size=.5)

    # sanity check: percentage of all rows in each split
    pct_train = (X_train.shape[0] / df.shape[0]) * 100
    pct_test = (X_test.shape[0] / df.shape[0]) * 100
    pct_val = (X_val.shape[0] / df.shape[0]) * 100
    report_lines = [
        "",
        f"    Training: {pct_train:.2f}%",
        f"    Test: {pct_test:.2f}%",
        f"    Validation: {pct_val:.2f}%",
    ]
    print("\n".join(report_lines) + "\n\n")

    # build the booster and train it
    booster_params = {
        'n_estimators': estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'random_state': 0,
    }
    booster = GradientBoostingClassifier(**booster_params)

    return X_val, X_test, y_val, y_test, X_train, y_train, booster.fit(X_train, y_train)

---
# Evaluation Metrics

### classification metrics
---

In [86]:
def _summarize_classification_split(model, X, y, class_rows, pos_label):
    """Print per-class precision / recall / F1 for one split and return (fpr, tpr, auc)."""
    y_pred = model.predict(X)
    # probability of the second class, computed once and reused for AUC and ROC
    y_prob = model.predict_proba(X)[:, 1]

    report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).transpose()
    # keep only the per-class rows that exist in this split's report
    present_rows = [row for row in class_rows if row in report.index]
    print(report.loc[present_rows, ['precision', 'recall', 'f1-score']])

    auc = roc_auc_score(y, y_prob)
    fpr, tpr, _ = roc_curve(y, y_prob, pos_label=pos_label)
    return fpr, tpr, auc


def evaluate_classification(X_val, X_test, y_val, y_test, X_train, y_train, model):
    """Report metrics and plot ROC curves for a fitted binary classifier.

    For the training, validation and test splits (in that order) this prints
    per-class precision, recall and F1-score plus the ROC AUC, then draws the
    three ROC curves on one figure.

    Parameters
    ----------
    X_val, X_test, X_train : feature sets for each split
    y_val, y_test, y_train : matching targets
    model : fitted classifier exposing ``predict`` and ``predict_proba``;
        column 1 of ``predict_proba`` is used as the positive-class score.

    Returns
    -------
    None
    """
    # take the class labels from the model rather than assuming they are 0 / 1;
    # fall back to the '0' / '1' rows when the model does not expose classes_
    classes = getattr(model, 'classes_', None)
    if classes is None:
        class_rows, pos_label = ['0', '1'], None
    else:
        class_rows = [str(label) for label in classes]
        # predict_proba column 1 corresponds to classes_[1]
        pos_label = classes[1] if len(classes) > 1 else None

    splits = [
        ("Training", X_train, y_train, 'blue'),
        ("Validation", X_val, y_val, 'orange'),
        ("Test", X_test, y_test, 'green'),
    ]

    curves = []
    for position, (name, X, y, color) in enumerate(splits):
        # every section after the first is preceded by a blank line
        lead = "" if position == 0 else "\n"
        print(f"{lead}{'-'*55}\n{name} Evaluation:\n")

        fpr, tpr, auc = _summarize_classification_split(model, X, y, class_rows, pos_label)

        # the last section is followed by two blank lines before the plot
        tail = "\n\n" if position == len(splits) - 1 else ""
        print(f"\nAUC (Area Under Curve): {auc:.4f}{tail}")

        curves.append((name, fpr, tpr, auc, color))

    # plot ROC curves
    plt.figure(figsize=(10, 6))
    for name, fpr, tpr, auc, color in curves:
        plt.plot(fpr, tpr, label=f"{name} Set (AUC = {auc:.4f})", color=color)

    # add diagonal line for random guess
    plt.plot([0, 1], [0, 1], 'k--')

    # customize the plot
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.grid(alpha=0.3)

    # show the plot
    plt.show()

### linear regression metrics
---

In [90]:
def evaluate_linear_regression(data, target, X_val, X_test, y_val, y_test, X_train, y_train, model):
    """Print MSE / R-squared per split and plot diagnostics for a fitted regressor.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set (features plus ``target``); used for the two plots, so
        the plots include every row in ``data``, not only the test split.
    target : str
        Name of the target column in ``data``.
    X_val, X_test, X_train : feature sets for each split
    y_val, y_test, y_train : matching targets
    model : fitted regressor exposing ``predict``

    Returns
    -------
    None
    """
    for dataset, X, y in [("Training", X_train, y_train),
                          ("Validation", X_val, y_val),
                          ("Test", X_test, y_test)]:

        print(f"\n{'-'*55}\n{dataset} Evaluation:\n")

        # predictions
        y_pred = model.predict(X)

        # metrics: MSE and R-squared
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)

        print(f"Mean Squared Error (MSE): {mse:.4f}")
        print(f"R-squared: {r2:.4f}\n\n")

    # predictions and actual values over the full data set
    prediction = model.predict(data.drop(columns=[target]))
    actual = data[target]
    residuals = actual - prediction

    # plotting residuals
    plt.figure(figsize=(8, 6))
    plt.scatter(prediction, residuals, color='green', alpha=0.6, edgecolor='black')
    plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')
    plt.grid(alpha=0.4)
    plt.tight_layout()
    plt.show()

    # the reference diagonal must span the points that are actually drawn
    # (all of `data`), not just the range of the test targets
    lower = min(actual.min(), prediction.min())
    upper = max(actual.max(), prediction.max())

    # plotting predicted vs actual
    plt.figure(figsize=(8, 6))
    plt.scatter(prediction, actual, color='skyblue', alpha=0.6, edgecolor='black')
    plt.plot([lower, upper], [lower, upper], color='red', linewidth=2)  # diagonal line
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.title('Actual vs Predicted Values')
    plt.grid(alpha=0.4)
    plt.tight_layout()
    plt.show()

### coefficients and feature importance
---

In [95]:
# function to get coefficients
def coefficients(df,target,model):
    """Tabulate and plot the coefficients of a fitted linear model.

    Parameters
    ----------
    df : pandas.DataFrame
        Data the model was trained on, including ``target``; only used to
        recover the feature names.
    target : str
        Name of the target column (excluded from the feature names).
    model : fitted estimator exposing ``coef_``
        If ``coef_`` is two-dimensional, only its first row is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature' and 'Coefficient', sorted by coefficient ascending.

    Side effects
    ------------
    Shows a horizontal bar chart of the coefficients ordered by magnitude.
    """
    coef_values = model.coef_
    # a 2-D coef_ holds one row per output, so take the first row; a 1-D
    # coef_ is already one value per feature and must not be indexed,
    # otherwise every feature would receive the same scalar
    if getattr(coef_values, 'ndim', 1) > 1:
        coef_values = coef_values[0]

    # retrieve the feature names from the training data
    feature_names = df.drop(columns=[target]).columns

    # create a data frame for better visualization
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coef_values})

    # sort coefficients by magnitude for the plot
    coefficients_df = coefficients_df.sort_values(by='Coefficient', key=abs, ascending=False)

    # name of the estimator class for the title (this used to reference an
    # undefined variable and raised NameError)
    model_type = type(model).__name__

    # create a bar plot
    plt.figure(figsize=(10, 6))
    plt.barh(coefficients_df['Feature'], coefficients_df['Coefficient'], color='skyblue')
    plt.xlabel("Coefficient Value")
    plt.ylabel("Feature")
    plt.title(f"Feature Coefficients in {model_type} Model")
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)  # vertical line at x=0 for reference
    plt.tight_layout()
    plt.show()

    return coefficients_df.sort_values(by=['Coefficient'],ascending=True)