In [1]:
# Necessary to display the plots in the notebook
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from IPython.display import display_html, display_markdown, HTML, Markdown as md
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.lines import Line2D
import math
import re
from scipy import stats
import pickle
from joblib import dump
import time
from typing import Union

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import KFold,GridSearchCV, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Helper functions defined in the helper.py file
import helper as hp

In [2]:
%%html
<style>
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,100..1000;1,9..40,100..1000&display=swap');
div.text_cell {
    font-family : DM Sans, sans-serif !important;
    font-size : 1.2em !important;
}
pre {font-family : DM Sans, sans-serif !important;}
</style>

# **Regression Model Comparison Template** 
This notebook compares the performance of different types of regression models on a dataset provided by the user. It assists in the model selection process by streamlining data cleaning, data preprocessing, model training, and model evaluation. Throughout the template there are sections the user must configure to match the characteristics of their dataset. These sections are preceded by triple-quoted comments that explain how to proceed. Users should store the files containing their data in the data directory. The load_data function automatically prepends /data to the given file name.

## **Load and Reformat the Dataset**

### **Configure Data Loading Variables**

In [None]:
"""
Input the name of the file containing your dataset, a list of the names of the columns to drop, and a list of
the strings that represent missing values in the dataset.
"""
# Placeholder value: replace it with the name of your dataset file before running the load cell
data_file_name: str = "(ex: data.csv)" 
# Column names passed to hp.load_data as the columns to drop (leave empty to keep every column)
columns_to_drop: list[str] = [] 

# Strings passed to hp.load_data as the representations of missing values.
# Ensure no valid dataset values are included in this list
dataset_na_value_representations: list[str] = ['', 'NA', 'N/A', 'null', 'NULL', 'NaN', 'none', 'None', '-', '?']

### **Load Data**

In [None]:
# Read the configured file through the helper module
data_df: pd.DataFrame = hp.load_data(data_file_name, columns_to_drop, dataset_na_value_representations)

# Bookkeeping values that later cells reuse
variable_list: list[str] = data_df.columns.tolist()
number_of_variables: int = len(variable_list)
initial_number_of_entries: int = data_df.shape[0]

# Partition the columns by dtype: numeric columns versus everything else
numerical_variables: list[str] = data_df.select_dtypes(include = np.number).columns.tolist()
categorical_variables: list[str] = data_df.select_dtypes(exclude = np.number).columns.tolist()

hp.display_text(f"Numerical Variables: {numerical_variables}", font_size = 16)
hp.display_text(f"Categorical Variables: {categorical_variables}", font_size = 16)

# Preview of the first rows of the loaded data
hp.display_df(data_df.head(), font_size = 14)

### **Reformat Columns**
Do not run this cell if the values of your dataset are already properly formatted. If they are not (e.g. columns that should be numerical are instead represented as strings), open the cell and configure this section to reformat any improperly formatted columns.

In [None]:
"""
Apply value cleaning/conversion functions to the relevant columns of your dataset using the apply
method of the column. Each cleaning/conversion function should take in a single string and return a float.
Pass the function to apply as an argument, without parentheses.
Example: data_df['column_name'] = data_df['column_name'].apply(clean_function).
"""
# (add your column reformatting statements here)


# Show how the columns were classified before reformatting
hp.display_text(f"Previous List of Numerical Variables: {numerical_variables}", font_size = 18)
hp.display_text(f"Previous List of Categorical Variables: {categorical_variables}", font_size = 18)
print()

# Re-derive both lists, since reformatting may have changed column dtypes
numerical_variables: list[str] = data_df.select_dtypes(include = np.number).columns.tolist()
categorical_variables: list[str] = data_df.select_dtypes(exclude = np.number).columns.tolist()

hp.display_text(f"Updated List of Numerical Variables: {numerical_variables}", font_size = 18)
hp.display_text(f"Updated List of Categorical Variables: {categorical_variables}", font_size = 18)

hp.display_df(data_df.head())

## **Handle Missing Values**

### **Initial Data Profiling**
The following cell uses ydata-profiling to generate a detailed report on the characteristics of the input dataset. Use this information to help you determine how to handle missing values.

In [None]:
# Cast every non-numerical column to the pandas "category" dtype before profiling
categorical_block: pd.DataFrame = data_df[categorical_variables].astype("category")
data_df[categorical_variables] = categorical_block

# Report on the dataset as loaded, i.e. before any missing-value or outlier handling
initial_report_title: str = "Dataset Profiling Report (Before Handling Outliers/Missing Values)"
initial_dataset_report = ProfileReport(
    data_df,
    title = initial_report_title,
    progress_bar = False,
    explorative = True,
)
# Last expression of the cell, so the report is rendered as the cell output
initial_dataset_report

### **Information on Missing Values in Dataset**

In [None]:
# Columns that contain at least one missing value: numerical columns first, then categorical columns
numerical_columns_with_missing_values: list[str] = data_df[numerical_variables].columns[data_df[numerical_variables].isnull().any()].tolist()
categorical_columns_with_missing_values: list[str] = data_df[categorical_variables].columns[data_df[categorical_variables].isnull().any()].tolist()
all_columns_with_missing_values: list[str] = numerical_columns_with_missing_values + categorical_columns_with_missing_values

# Blank line printed before the summary in either case
print()
if all_columns_with_missing_values:
    # Compute the row mask once: True for rows with a missing value in any affected column
    rows_with_missing_values: pd.Series = data_df[all_columns_with_missing_values].isnull().any(axis = "columns")
    # Only the affected columns are kept, so the preview focuses on where values are missing
    entries_with_missing_values_df: pd.DataFrame = data_df.loc[rows_with_missing_values, all_columns_with_missing_values]
    number_of_entries_with_missing_values: int = len(entries_with_missing_values_df)
    percent_of_entries_with_missing_values: float = (number_of_entries_with_missing_values / initial_number_of_entries) * 100

    hp.display_text(f"Total Number of Entries: {initial_number_of_entries}")
    hp.display_text(f"Total Number of Entries with at Least One Missing Value: {number_of_entries_with_missing_values} ({percent_of_entries_with_missing_values:.2f}% of Entries)")
    hp.display_text(f"Number of Entries if all Rows with Missing Values are Dropped: {initial_number_of_entries - number_of_entries_with_missing_values}")
    print()
    hp.display_text("Up to First 5 Entries with Missing Values:")
    hp.display_df(entries_with_missing_values_df.head(), font_size = 16)
else:
    hp.display_text("No Missing Values in Dataset")

### **Drop or Impute Missing Values**

In [None]:
"""
Specify how you would like to handle missing values in the dataset. All rows with missing data are dropped by default.
"""
# Rebinds data_df to the helper's result, computed from the columns found to contain missing values
data_df = hp.drop_rows_with_missing_values(data_df, all_columns_with_missing_values)

## **Handle Outliers/Erroneous Entries**

### **Implement Outlier Handling**

In [None]:
# Summary statistics of the numerical variables before outlier handling, shown to help users
# decide whether outlier entries should be dropped
hp.display_text("Previous Numerical Variable Statistics", font_size = 20)
hp.display_df(data_df.describe(), 16)

In [None]:
"""
Use the visualize_outliers function to identify and optionally remove outliers in the numerical columns of your dataset.
"""
# data_df is rebound to whatever the helper returns.
# NOTE(review): presumably the same data with any user-selected outliers removed - confirm in helper.py
data_df = hp.visualize_outliers(data_df, numerical_variables)


In [None]:
# Summary statistics of the numerical variables after the outlier handling cell has run
hp.display_text("Updated Numerical Variable Statistics", font_size = 20)
hp.display_df(data_df.describe(), 16)

## **Data Preprocessing**
Define your preprocessing steps within this section.

### **Dataset Preprocessing Information**
Use the information provided by the comparison profile report to analyze the result of your data cleaning and to help determine your preprocessing steps.

In [None]:
# Profile the dataset as it stands after the missing-value and outlier handling cells
dataset_report = ProfileReport(data_df, title = "Dataset Profiling Report (After Handling Outliers/Missing Values)", progress_bar = False, explorative = True)
# Comparison between this report and the one generated before cleaning.
# The difference between the two may not be very significant, depending on the number of missing values
# and removed outliers
post_cleaning_comparison_report = dataset_report.compare(initial_dataset_report)

# Only the post-cleaning report is rendered by this cell. To view the comparison report instead,
# make post_cleaning_comparison_report the last expression of the cell.
#post_cleaning_comparison_report
dataset_report

### **Configure Preprocessing Steps for the Provided General Preprocessor**
The provided general preprocessor addresses three common preprocessing transformations: scaling numerical variables, one-hot encoding nominal categorical variables, and ordinal encoding ordinal categorical variables. Customize these steps to fit the needs of your dataset. Preprocessing for your target variable and feature variables must be handled by separate transformer variables.

In [None]:
"""
Input the name of the target variable column, a list of the numerical features you would like to scale (defaults to all numerical
features in the dataset), and a list of the nominal categorical features you would like to one-hot encode.
"""
target_column_name: str = ""
# Built with a comprehension instead of a set difference so the features keep their dataset column order.
# The iteration order of a set of strings can change between kernel sessions, which would make the
# order of the scaled columns (and anything that depends on it) non-reproducible.
numerical_features_to_scale: list[str] = [
    variable for variable in numerical_variables if variable != target_column_name
]
nominal_categorical_features_to_encode: list[str] = []


"""
The ordianl_categories_ordered_dict variable represents the order of ordinal categorical variable categories in a dictionary.
For the keys of this dictionary, input the column names of ordinal categorical variables. Each key's value should be a list
of variable categories ordered from "smallest" to "largest". Example:
ordianl_categories_ordered_dict = {
    "size": ["small", "medium", "large"],
    "grade": ["F", "D", "C", "B", "A"]
}
"""
ordianl_categories_ordered_dict: dict[str, list[str]] = {}


"""
Specify the preprocessor used on the target variable column. Default is StandardScaler.
"""
numerical_target_preprocessor = StandardScaler()

### **General Preprocessor Initialization**

In [None]:
# Split the ordinal configuration into two parallel lists: the column names (keys)
# and the ordered category lists (values)
ordianl_feature_categories_to_encode: list[str] = [*ordianl_categories_ordered_dict.keys()]
ordianl_feature_categories_orders_lists: list[list[str]] = [*ordianl_categories_ordered_dict.values()]

# Drop the first column of each one-hot encoded variable to avoid multicollinearity
onehot_drop_column: str = "first"

# ColumnTransformer takes its steps as a list of 3-tuples:
#   (step name, transformer object, list of columns the step is applied to)
general_feature_preprocessor = ColumnTransformer(
    transformers = [
        (
            'numerical_scaler',
            StandardScaler(),
            numerical_features_to_scale,
        ),
        (
            'nominal_encoder',
            OneHotEncoder(drop = onehot_drop_column),
            nominal_categorical_features_to_encode,
        ),
        (
            "ordinal_encoder",
            OrdinalEncoder(categories = ordianl_feature_categories_orders_lists),
            ordianl_feature_categories_to_encode,
        ),
    ]
)

### **Preprocessing Results**

In [None]:
# Echo the preprocessing configuration so it can be checked before training
hp.display_text(f"Scaled Numerical Variables: {numerical_features_to_scale}")
hp.display_text(f"Encoded Nominal Categorical Variables: {nominal_categorical_features_to_encode}")
hp.display_text(f"Encoded Ordinal Categorical Variables (confirm that category orders were assigned to the correct ordinal categorical variable):")
if not ordianl_feature_categories_to_encode:
    hp.display_text("None")
else:
    # One markdown bullet per ordinal variable, pairing the column with its category order
    for ordinal_feature, category_order in zip(ordianl_feature_categories_to_encode, ordianl_feature_categories_orders_lists):
        display_markdown(md(f"* {ordinal_feature}: {category_order}"))

## **Load Models and Set Hyperparameters**

In [None]:
"""
Configure the models you would like to use for regression.
"""

# Each *_model_data dictionary maps a display name to a dictionary with two entries:
#   'model'      - a pipeline made of the general feature preprocessor followed by the regressor
#   'param_grid' - hyperparameter values for that pipeline
#                  NOTE(review): presumably searched by hp.train_and_evaluate_models - confirm in helper.py
# make_pipeline names each step after its lowercased class name, so hyperparameters are addressed
# as '<lowercased class name>__<parameter>' (e.g. 'lasso__alpha').

# Plain linear regression: empty grid, nothing to tune
linear_model_data = {
    'Linear': {
        'model': make_pipeline(general_feature_preprocessor, LinearRegression()),
        'param_grid': {}  
    }
}


# 100 evenly spaced regularization strengths between 0.01 and 100, shared by Lasso and Ridge
alpha_values = np.linspace(0.01, 100, num = 100)

lasso_model_data = {
    'Lasso': {
        'model': make_pipeline(general_feature_preprocessor, Lasso()),
        'param_grid': {'lasso__alpha': alpha_values}
    }
}


ridge_model_data = {
    'Ridge': {
        'model': make_pipeline(general_feature_preprocessor, Ridge()),
        'param_grid': {'ridge__alpha': alpha_values}
    }
}


# random_state is fixed so the forest is built the same way on every run
random_forest_model_data = {
    'Random Forest': {
        'model': make_pipeline(general_feature_preprocessor, RandomForestRegressor(random_state = 42)),
        'param_grid': {
            'randomforestregressor__n_estimators': [100, 200],
            'randomforestregressor__max_depth': [None, 20],
            'randomforestregressor__max_features': [1.0, "sqrt"]
        }
    }
}


# Support vector regression: only the regularization parameter C is varied
svr_model_data = {
    'SVR': {
        'model': make_pipeline(general_feature_preprocessor, SVR()),
        'param_grid': {
            'svr__C': [.1, 1, 10]  
        }
    }
}


"""
Use the unpacking operator (**) to place all of your models in the models dictionary.
"""
# The keys of this dictionary are the model names used by the individual model analysis cells
models = {
    **linear_model_data,
    **lasso_model_data,
    **ridge_model_data,
    **random_forest_model_data,
    **svr_model_data
}  

### **Split Dataset into Training and Testing Sets**

In [None]:
# Number of decimal places to display for model evaluation metrics
num_decimal_places: int = 7

# Separate the target column from the feature columns
y: pd.Series = data_df[target_column_name]
X: pd.DataFrame = data_df.drop(columns = target_column_name)
num_features: int = X.shape[1]

# Hold out 20% of the dataset for testing; change test_size to adjust the size of the testing set.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)
feature_list: list[str] = X_train.columns.tolist()

### **Apply Target Preprocessor**
Do not run this cell if you are not preprocessing the target variable. This cell may need to be reconfigured if you are using something other than StandardScaler on your target variable.

In [None]:
# StandardScaler() requires a 2D array-like object as input, so each target series is first
# wrapped in a single-column DataFrame
y_train: pd.DataFrame = pd.DataFrame(y_train, columns = [target_column_name])
y_test: pd.DataFrame = pd.DataFrame(y_test, columns = [target_column_name])

# Fit the preprocessor on the training targets only, then reuse the fitted parameters for the
# test targets. Refitting on the test set would put the two sets on different scales and would
# leave the preprocessor fitted on test data. Both calls return a 2D numpy array.
y_train: np.ndarray = numerical_target_preprocessor.fit_transform(y_train)
y_test: np.ndarray = numerical_target_preprocessor.transform(y_test)

# The models are trained on 1D targets, so flatten the 2D arrays with ravel()
y_train: np.ndarray = y_train.ravel()
y_test: np.ndarray = y_test.ravel()

### **Run Model Training**

In [None]:
# Results are keyed by model name; later cells read the 'best_model' entry of each model's result
model_results: dict[str, dict] = hp.train_and_evaluate_models(models, X_train, y_train, X_test, y_test)

### **Model Score Comparison**

In [None]:
# Helper plot built from the results of every trained model
hp.plot_comparative_model_performance(model_results)

### **Model Performance Summary and Model Selection**

In [None]:
# The performance summary function returns the name of the best performing model
best_model_name: str = hp.summarize_results(model_results)
# Model object stored for that name in the training results
best_model = model_results[best_model_name]['best_model']

### **Comparative Plots**

In [None]:
# Side-by-side diagnostic plots for all models, drawn against the test targets
hp.plot_residuals_histograms_comparison(model_results, y_test)
hp.plot_residuals_scatter_comparison(model_results, y_test)
hp.plot_actual_vs_predicted_comparison(model_results, y_test, target_column_name)

## **Individual Model Analysis**
Use this section to get a better view of the plots of models you are interested in.

### **Linear Model**

In [None]:
# Must match the key used for this model in the models dictionary
linear_model_name: str = "Linear"

# Called through the hp alias: the notebook only imports the helper module as hp,
# so the bare function name is undefined on a fresh kernel
hp.display_model_evaluation_and_plots(linear_model_name, model_results, y_test)

### **Lasso Model**

In [None]:
# Must match the key used for this model in the models dictionary
lasso_model_name: str = "Lasso"

# Called through the hp alias: the notebook only imports the helper module as hp,
# so the bare function name is undefined on a fresh kernel
hp.display_model_evaluation_and_plots(lasso_model_name, model_results, y_test)

### **Ridge Model**

In [None]:
# Must match the key used for this model in the models dictionary
ridge_model_name: str = "Ridge"

# Called through the hp alias: the notebook only imports the helper module as hp,
# so the bare function name is undefined on a fresh kernel
hp.display_model_evaluation_and_plots(ridge_model_name, model_results, y_test)

### **Random Forest Model**

In [None]:
# Must match the key used for this model in the models dictionary
random_forest_model_name: str = "Random Forest"

# Called through the hp alias: the notebook only imports the helper module as hp,
# so the bare function name is undefined on a fresh kernel
hp.display_model_evaluation_and_plots(random_forest_model_name, model_results, y_test)

### **Support Vector Model**

In [None]:
# Must match the key used for this model in the models dictionary
svr_model_name: str = "SVR"

# All three arguments belong inside the call, and the call goes through the hp alias because
# the notebook only imports the helper module as hp
hp.display_model_evaluation_and_plots(svr_model_name, model_results, y_test)

## **Save Best Model**

In [None]:
"""
Configure the name of the model you want to save (best_model_name by default), the name of the file
that the model should be saved to, and the save method.
"""
# Name of the model to save, as it appears in the training results
name_of_model_to_save: str = best_model_name
# Name of the file the model is saved to; must be filled in before saving
save_file_name: str = ""
save_method: str = "" # Options: "pickle" or "joblib"

### **Save Model**

In [None]:
# Look the model up by the configured name so that changing name_of_model_to_save changes which
# model object is saved; with the default name this is the same object as best_model
model_to_save = model_results[name_of_model_to_save]['best_model']
hp.save_model(name_of_model_to_save, model_to_save, save_file_name, save_method)