In [2]:
# Necessary to display the plots in the notebook
%matplotlib inline

import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
from IPython.display import Markdown as md
from IPython.display import display, display_markdown, Markdown as md


from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans

# Helper functions defined in the helper.py file
import helper as hp

In [None]:
# Use to reload the helper functions when the helper.py file is modified.
# importlib.reload re-executes the already-imported module, so edits to helper.py take effect
# without restarting the kernel. Not needed on a fresh kernel, where the import above is already current.
import importlib
importlib.reload(hp)

In [3]:
%%html
<style>
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,100..1000;1,9..40,100..1000&display=swap');
div.text_cell {font-family : DM Sans, sans-serif !important;}
pre {font-family : DM Sans, sans-serif !important;}
</style>

# **Clustering Model Comparison Template** 
This notebook compares the performance of different types of clustering models on a dataset provided by the user. It assists in the model selection process by streamlining data cleaning, data preprocessing, model training, and model evaluation. Throughout the template there are sections the user must configure to match the characteristics of their dataset. These sections are preceded by triple-quoted comments that direct the user on how to proceed. Users should store files containing their data in the data directory. The load_data function automatically prepends /data to the given file name.

## **Load and Reformat the Dataset**

### **Configure Data Loading Variables**

In [None]:
"""
Input the name of the file containing your dataset, a list of the names of the columns to drop, and a list of
the representations of missing values in the dataset.
"""
# Name of the dataset file to load (placeholder; must be set by the user).
data_file_name: str = ""

# Names of the columns to remove when loading. Leave the list empty to keep every column: an
# empty-string entry would name a column "" that does not exist in the dataset.
columns_to_drop: list[str] = []

# Ensure no valid dataset values are included in this list
dataset_na_value_representations: list[str] = ['', 'NA', 'N/A', 'null', 'NULL', 'NaN', 'none', 'None', '-', '?', 'nan']

### **Load Data**

In [None]:
# Load the dataset using the file name, dropped columns, and missing-value markers configured earlier
data_df: pd.DataFrame = hp.load_data(
    data_file_name,
    columns_to_drop,
    dataset_na_value_representations,
)

# Size and column names of the dataset as loaded, before any cleaning
initial_number_of_entries: int = data_df.shape[0]
variable_list: list[str] = data_df.columns.tolist()
number_of_variables: int = len(variable_list)

# Columns with a numeric dtype are numerical variables; every other column is treated as categorical
numerical_variables: list[str] = data_df.select_dtypes(include=np.number).columns.tolist()
categorical_variables: list[str] = data_df.select_dtypes(exclude=np.number).columns.tolist()

# Sets the valid categories for each categorical variable. Set show_categories to True to display the categories
unique_categories_dict: dict[str, list[str]] = hp.get_categories(
    data_df,
    categorical_variables,
    show_categories=False,
)

# Store each categorical column as a pandas Categorical limited to the categories collected for it
for variable, variable_categories in unique_categories_dict.items():
    data_df[variable] = pd.Categorical(data_df[variable], categories=variable_categories)

hp.display_text(f"Numerical Variables: {numerical_variables}", font_size=16)
hp.display_text(f"Categorical Variables: {categorical_variables}", font_size=16)
print()

hp.display_df(data_df.head(), font_size=14)

### **Reformat Columns**
Do not run this cell if the values of your dataset are already properly formatted. If not (e.g. columns that should be numerical are instead represented as strings), open the cell and configure this section to reformat any improperly formatted columns.

In [None]:
"""
Apply value cleaning/conversion functions to the relevant columns of your dataset using the apply
method on your DataFrame. Your cleaning/conversion functions should take in a single string and return a float.
Pass them into the apply method as an argument without parentheses.

Example: data_df['column_name'] = data_df['column_name'].apply(clean_function).
"""



# Show the variable lists as they stood before the conversions applied in this cell
hp.display_text(f"Previous List of Numerical Variables: {numerical_variables}", font_size=18)
hp.display_text(f"Previous List of Categorical Variables: {categorical_variables}", font_size=18)
print()

# Re-derive both lists from the current dtypes, since a conversion may have changed a column's type
numerical_variables: list[str] = data_df.select_dtypes(include=np.number).columns.tolist()
categorical_variables: list[str] = data_df.select_dtypes(exclude=np.number).columns.tolist()

# Re-collect the categories and re-apply the Categorical dtype to the columns that are still categorical
unique_categories_dict: dict[str, list[str]] = hp.get_categories(
    data_df,
    categorical_variables,
    show_category_count=False,
    show_categories=False,
)
for variable, variable_categories in unique_categories_dict.items():
    data_df[variable] = pd.Categorical(data_df[variable], categories=variable_categories)

hp.display_text(f"Updated List of Numerical Variables: {numerical_variables}", font_size=18)
hp.display_text(f"Updated List of Categorical Variables: {categorical_variables}", font_size=18)

hp.display_df(data_df.head())

## **Handle Missing Values**

### **Initial Data Profiling**
The following cell uses ydata-profiling to generate a detailed report on the characteristics of the input dataset. Use this information to help you determine how to handle missing values.

In [None]:
# Profile the dataset before any missing values or outliers are handled. The report object is kept
# because a later cell compares the post-cleaning report against it.
initial_dataset_report = ProfileReport(
    data_df,
    title="Dataset Profiling Report (Before Handling Outliers/Missing Values)",
    progress_bar=False,
    explorative=True,
)
initial_dataset_report

### **Information on Missing Values in Dataset**

In [None]:
# Names of the numerical and categorical columns that contain at least one missing value
numerical_columns_with_missing_values: list[str] = data_df[numerical_variables].columns[data_df[numerical_variables].isnull().any()].tolist()
categorical_columns_with_missing_values: list[str] = data_df[categorical_variables].columns[data_df[categorical_variables].isnull().any()].tolist()
all_columns_with_missing_values: list[str] = numerical_columns_with_missing_values + categorical_columns_with_missing_values

# Blank line before the summary, whichever branch runs
print()

if all_columns_with_missing_values:
    # Select the affected rows and columns with one .loc call rather than chaining two [] lookups
    rows_with_missing_values_mask = data_df[all_columns_with_missing_values].isnull().any(axis="columns")
    entries_with_missing_values_df: pd.DataFrame = data_df.loc[rows_with_missing_values_mask, all_columns_with_missing_values]
    number_of_entries_with_missing_values: int = len(entries_with_missing_values_df)
    percent_of_entries_with_missing_values: float = (number_of_entries_with_missing_values / initial_number_of_entries) * 100

    hp.display_text(f"Total Number of Entries: {initial_number_of_entries}")
    hp.display_text(f"Total Number of Entries with at Least One Missing Value: {number_of_entries_with_missing_values} ({percent_of_entries_with_missing_values:.2f}% of Entries)")
    hp.display_text(f"Number of Entries if all Rows with Missing Values are Dropped: {initial_number_of_entries - number_of_entries_with_missing_values}")
    print()
    hp.display_text("Up to First 5 Entries with Missing Values:")
    hp.display_df(entries_with_missing_values_df.head(), font_size = 16)
else:
    hp.display_text("No Missing Values in Dataset")

### **Drop or Impute Missing Values**

In [None]:
"""
Specify how you would like to handle missing values in the dataset. All rows with missing data are dropped by default.
"""
# Rebinds data_df to the helper's return value, so the change persists into every later cell.
# Re-run the data loading cell to get the original rows back.
data_df = hp.drop_rows_with_missing_values(data_df, all_columns_with_missing_values)

## **Handle Outliers/Erroneous Entries**

### **Implement Outlier Handling**

In [None]:
# Provides information on variable distributions to help users determine whether they should drop outlier entries.
# This is the "before" snapshot; a later cell shows the same summary after outlier handling.
hp.display_text("Previous Numerical Variable Statistics", font_size = 20)
hp.display_df(data_df.describe(), 16)

In [None]:
"""
Use the visualize_outliers function to identify and optionally remove outliers in the numerical columns of your dataset.
"""
# Rebinds data_df to whatever the helper returns, so any rows it removes stay removed in later cells.
data_df = hp.visualize_outliers(data_df, numerical_variables)

In [None]:
# "After" snapshot of the summary statistics, for comparison with the table shown before outlier handling.
hp.display_text("Updated Numerical Variable Statistics", font_size = 20)
hp.display_df(data_df.describe(), 16)

## **Data Preprocessing**
Define your preprocessing steps within this section.

### **Dataset Preprocessing Information**
Use the information provided by the comparison profile report to analyze the result of your data cleaning and to help determine your preprocessing steps.

In [None]:
# Profile the dataset again now that missing values and outliers have been handled
dataset_report = ProfileReport(
    data_df,
    title="Dataset Profiling Report (After Handling Outliers/Missing Values)",
    progress_bar=False,
    explorative=True,
)

# The difference between the profiling reports before and after cleaning may not be very significant,
# depending on the number of missing values and removed outliers
post_cleaning_comparison_report = dataset_report.compare(initial_dataset_report)

# The comparison report is built but not shown here; only the post-cleaning report is displayed.
# Pass post_cleaning_comparison_report to display() to view it instead.
display(dataset_report)

hp.display_text(f"Numerical Variables: {numerical_variables}", font_size=16)
hp.display_text(f"Categorical Variables: {categorical_variables}", font_size=16)

### **Configure Preprocessing Steps for the Provided General Preprocessor**
The provided general preprocessor addresses three common preprocessing transformations: scaling numerical variables, one-hot encoding nominal categorical variables, and ordinal encoding ordinal categorical variables. Customize these steps to fit the needs of your dataset.

In [None]:
"""
Input a list of the numerical variables you would like to scale (defaults to all numerical
variables in the dataset) and a list of the nominal categorical variables you would like to one-hot encode.
"""
# Binds the same list object as numerical_variables (no copy is made). Assign a new list here to scale
# only a subset of the numerical columns.
numerical_variables_to_scale: list[str] = numerical_variables
# Names of the nominal categorical columns to one-hot encode. Leave the list empty when there are none:
# every entry must be an actual column of the dataset, and an empty-string placeholder is not one, so
# ColumnTransformer would reject it when the preprocessor is fitted.
nominal_categorical_variables_to_encode: list[str] = []


"""
The ordianl_categories_ordered_dict variable represents the order of ordinal categorical variable categories in a dictionary.
For the keys of this dictionary, input the column names of ordinal categorical variables. Each key's value should be a list
of variable categories ordered from "smallest" to "largest". Example:
ordianl_categories_ordered_dict = {
    "size": ["small", "medium", "large"],
    "grade": ["F", "D", "C", "B", "A"]
}
"""
# An empty dict means no ordinal variables are configured.
# NOTE: the identifier is spelled "ordianl"; later cells reference it by this exact name, so keep the spelling.
ordianl_categories_ordered_dict: dict[str, list[str]] = {}

### **General Preprocessor Initialization**

In [None]:
# Split the ordinal configuration into two parallel lists: the column names, and each column's ordered
# categories in the same order
ordianl_variable_categories_to_encode: list[str] = list(ordianl_categories_ordered_dict)
ordianl_variable_categories_orders_lists: list[list[str]] = [
    ordianl_categories_ordered_dict[column_name]
    for column_name in ordianl_variable_categories_to_encode
]

# The argument for the transformers parameter of ColumnTransformer must be a list of tuples with three
# entries. Each tuple represents one preprocessing step: a name for the step, the transformer object,
# and the list of columns the step is applied to.
general_variable_preprocessor = ColumnTransformer(
    transformers=[
        (
            "numerical_scaler",
            StandardScaler(),
            numerical_variables_to_scale,
        ),
        (
            "nominal_encoder",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            nominal_categorical_variables_to_encode,
        ),
        (
            "ordinal_encoder",
            OrdinalEncoder(categories=ordianl_variable_categories_orders_lists),
            ordianl_variable_categories_to_encode,
        ),
    ]
)

### **Preprocessing Results**
Use the displayed information to confirm that your preprocessing steps have been applied as intended.

In [None]:
# Echo the configured column lists so the user can check them against the dataset
hp.display_text(f"Scaled Numerical Variables: {numerical_variables_to_scale}")
hp.display_text(f"Encoded Nominal Categorical Variables: {nominal_categorical_variables_to_encode}")
hp.display_text("Encoded Ordinal Categorical Variables (confirm that category orders were assigned to the correct ordinal categorical variable):")

if not ordianl_variable_categories_to_encode:
    hp.display_text("None")
else:
    # One markdown bullet per ordinal variable, pairing its name with its configured category order
    for ordinal_variable, category_order in zip(
        ordianl_variable_categories_to_encode,
        ordianl_variable_categories_orders_lists,
    ):
        display_markdown(md(f"* {ordinal_variable}: {category_order}"))

print()
hp.display_df(data_df.head())

## **K-Means**

In [None]:
"""
Configure the values of k you would like to test for the KMeans clustering algorithm (Default is 2 through 10).
"""
# The upper bound of range is exclusive, so this tests k = 2, 3, ..., 10.
k_values = range(2, 11)

### **Assess K-Values**

In [None]:
def kmeans_custom_metrics(model):
    """Collect the extra metric recorded for a KMeans model.

    Args:
        model: Object exposing an ``inertia_`` attribute (a fitted KMeans estimator).

    Returns:
        A dict with the single key ``'inertia'`` mapped to ``model.inertia_``.
    """
    inertia = model.inertia_
    return dict(inertia=inertia)

kmeans_model_name: str = "KMeans"

# One KMeans estimator per candidate number of clusters, all created with the same random_state
kmeans_instances: list = [
    KMeans(n_clusters=n_clusters, random_state=42)
    for n_clusters in k_values
]

# Wrap every estimator with the general preprocessor, then fit each pipeline on the dataset
kmeans_pipelines: list = hp.create_model_pipelines(kmeans_instances, general_variable_preprocessor)
kmeans_results: dict = hp.fit_clustering_models(
    data_df,
    kmeans_pipelines,
    kmeans_model_name,
    custom_metric_func=kmeans_custom_metrics,
)

In [None]:
# Plots the results gathered for each tested k to help choose a value to analyze further.
# NOTE(review): the exact charts are defined in helper.py — presumably inertia-based; confirm there.
hp.plot_kmeans_analysis(kmeans_results, k_values)

### **Select a K-Value to Further Analyze**

In [None]:
"""
Set the k value for the clustering model you would like to analyze in-depth (default is 7).
"""
# NOTE(review): later cells index kmeans_results with this value, so it presumably has to be one of
# the values in k_values — confirm against helper.py.
k = 7

In [None]:
# Look up the model stored for the selected k in the results gathered earlier.
kmeans_selected_model = kmeans_results[k]["model"]

# Visual summaries for the selected k; each plotting helper is defined in helper.py.
hp.plot_cluster_sizes(kmeans_results, k)
# This helper also returns a value, which is kept under a name for later inspection.
kmeans_feature_importance_df = hp.plot_feature_importance_heatmap(kmeans_results, k, data_df)
hp.plot_feature_distributions(kmeans_results, k, data_df)
hp.plot_cluster_kde(kmeans_results, k, data_df)

### **Generate Profile Reports**

In [None]:
# Builds profiling reports for the clusters of the selected k. Per the annotation, the result maps an
# integer key to a pair of ProfileReport objects; the next cell unpacks a pair as (report, comparison).
# NOTE(review): presumably the key is the cluster id — confirm in helper.py.
kmeans_cluster_reports: dict[int, tuple[ProfileReport, ProfileReport]] = hp.analyze_clusters_with_profiling(data_df, kmeans_results, k, dataset_report)

In [None]:
"""
Choose the profiling reports you would like to display
"""
# NOTE(review): the key 5 is hard-coded. It presumably exists only when the selected k produces a
# cluster with that id (k is 7 by default); change the key and names to match the cluster to inspect.
cluster_5_report, cluster_5_comparison = kmeans_cluster_reports[5]
# A bare expression on the last line of a cell is rendered as the cell's output.
cluster_5_comparison

## **Save Best Model**

In [None]:
"""
Configure the number of clusters of the model you want to save (current k by default), the name of the file
that the model should be saved to, and the save method.
"""
# Defaults to the k selected for in-depth analysis.
num_clusters_to_save = k
# Both values below are empty placeholders to be configured before running the save cell.
save_file_name: str = ""
save_method: str = "" # Options: "pickle" or "joblib"

### **Save Model**

In [None]:
# Passes the configured file name, number of clusters, results, and save method to the save helper.
# NOTE(review): pickle/joblib files can execute code when loaded; only load saved models from a trusted source.
hp.save_model(save_file_name, num_clusters_to_save, kmeans_results, save_method)