<br>Table of Content:
* [Import Libraries](#1)
* [Load Data](#2)
* [Exploratory Data Analysis (EDA)](#3)
* [Data Preprocessing](#4)
    * [Data Cleaning](#4a)
    * [Data Transformation](#4b)
    * [Handling Imbalanced Data](#4c)
    * [Data Reduction](#4d)
* [Selecting and Training the Model](#5) 
* [Model Evaluation](#6) 
* [Model Optimization](#7) 
* [Model Deployment](#8) 

<a id="1"></a> <br>
## Import Libraries

In [10]:
# Data Analysis      
import pandas as pd          # data analysis library for handling structured data   
from pandas import DataFrame            
import numpy as np           # mathematical library for working with numerical data
from pandas.plotting import parallel_coordinates 
import ydata_profiling
from statsmodels.tsa.seasonal import seasonal_decompose # library for performing statistical analysis
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from typing import List
import pickle
import calendar 
import scipy.stats as stats
import datetime 
import dtale 

# Visualization
import matplotlib.pyplot as plt     # data visualization library for creating graphs and charts
%matplotlib inline
import seaborn as sns        # data visualization library based on matplotlib for creating more attractive visualizations
import plotly.io as pio
import plotly.express as px   # interactive data visualization library
import plotly.graph_objects as go   # library for creating interactive graphs and charts
import matplotlib 
import kaleido 
import missingno as msno 

# Machine Learning/Time Series 
import evalml 
import tensorflow as tf
from prophet import Prophet
from xgboost import XGBRegressor 
#ML - Preprocessing data 
from sklearn.preprocessing import FunctionTransformer, PowerTransformer #variable transformation 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder, OneHotEncoder # Preprocessing feature scaling/categorical encoding
from sklearn.preprocessing import Normalizer, Binarizer 
from sklearn.model_selection import train_test_split    #split data into train and test 
#ML - Handling imbalanced data
from imblearn.over_sampling import RandomOverSampler 
from imblearn.under_sampling import RandomUnderSampler  
#ML - Create your model
from sklearn.linear_model import LinearRegression, LogisticRegression  #linear and logistics regression models
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor
#ML - Evaluate model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix, accuracy_score, classification_report 
from sklearn.metrics import adjusted_rand_score, v_measure_score, homogeneity_score 
#ML - Tune your model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Ignore warnings
import warnings
warnings.filterwarnings("ignore") 

pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

<a id="2"></a> <br>
## Load Data

In [124]:
df = pd.read_csv("Dataset/household_power_consumption.txt", sep=";",  parse_dates={'Datetime' : ['Date', 'Time']}, 
                          infer_datetime_format=True, low_memory=False, index_col='Datetime')

df.head(4)

>> Data DESCR

In [None]:
df.describe()
df.info() 

>> Visualize Dataframe

In [None]:
def visualize_df(df):       #best for time series analysis when your index is in datetime 
    """
    Creates an interactive plot of the dataframe.

    Args:
        df: A Pandas DataFrame containing energy consumption data for the PJM Interconnection region.

    Returns:
        None. Displays an interactive plot of the energy consumption data using Plotly.

    Example:
        >>> visualize_df(my_df)
    """
    import plotly.graph_objects as go

    fig = go.Figure(layout=go.Layout(
        height=500,
        width=800,
    ))

    for col in df.columns:
        fig.add_trace(go.Scatter(x=df.index, y=df[col], name=col))

    fig.update_layout(
        title={
            'text': 'PJM Energy Consumption',
            'font': {'size': 25, 'family': 'Arial', 'color': 'black'}
        },
        xaxis_title='Date',
        yaxis_title='Energy Consumption (MW)'
    )

    return fig.show(renderer='svg')

visualize_df(df) 

>> Data Profiling

In [None]:
def do_data_profiling(df, filename):
    '''
    Function to do basic data profiling
    Required Input - 
        - df = Pandas DataFrame
        - filename = Path for output file with a .html extension
    Expected Output -
        - HTML file with data profiling summary
    '''
    profile = ydata_profiling.ProfileReport(df) #replacing pandas_profiling with ydata_profiling
    profile.to_file(output_file = filename)
    print("Data profiling done")

do_data_profiling(df, data_profiling.html) 

<a id="3"></a> <br>
## Exploratory Data Analysis (EDA)

In [None]:
#visualize with individual plots 
def visualize_subplots_boxplots(df: DataFrame, columns: List[str], nrows: int, ncols: int) -> None:
    """
    Creates a grid of subplots containing boxplots of daily average energy consumption.

    Args:
        df: A Pandas DataFrame containing energy consumption data.
        columns: A list of column names to include in the boxplots.
        nrows: The number of rows in the subplot grid.
        ncols: The number of columns in the subplot grid.

    Returns:
        None. Displays a grid of subplots containing boxplots of daily average energy consumption.

    Example:
        >>> visualize_subplots_boxplots(my_df, ['Consumption', 'Generation'], 3, 4)
    """
    from typing import List
    from pandas import DataFrame
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, 12))
    fig.suptitle('Hourly Average Energy Consumption', weight='bold', fontsize=25)

    # We just need 11 figures, so we delete the last one
    if nrows*ncols > len(columns):
        fig.delaxes(axes[nrows-1][ncols-1])

    for i, col in enumerate(columns):
        sns.boxplot(data=df, x='Hour', y=col, ax=axes.flatten()[i], color='#cc444b')

    plt.tight_layout()
    fig.savefig("Images/xxx.png", dpi=300, bbox_inches='tight')
    plt.show()
    
visualize_subplots_boxplots(df=result_2, columns=['AEP_MW', 'COMED_MW', 'DAYTON_MW', 'DEOK_MW', 'DOM_MW', 'DUQ_MW',
        'EKPC_MW', 'FE_MW', 'NI_MW', 'PJME_MW', 'PJMW_MW'], nrows=6, ncols=2)

>> Auto EDA using dtale

In [None]:
dtale.show(df) 

<a id="4"></a> <br>
## Data Pre-processing

<a id="4a"></a> <br>
### Data Cleaning

>> Basic formating (renaming cols, duplicates detection, datetime etc.)

In [None]:
# Remove duplicates 
df.drop_duplicates(inplace=True) 

#formating columns 
df.



# Rename columns
df.rename(columns={'price': 'selling_price', 'bedrooms': 'num_bedrooms'}, inplace=True)

#replace non numeric columns
def replace_non_numeric(df: pd.DataFrame, columns):
    """
    Replaces non-numeric values in the specified columns of a Pandas dataframe with NaN.

    Parameters:
        df (pd.DataFrame): The dataframe to process.
        columns (list): A list of column names to replace non-numeric values in.

    Returns:
        pd.DataFrame: The updated dataframe with non-numeric values replaced by NaN.
    """
    for col in columns:
        df.dropna(subset = col, inplace= True)
        if df[col].dtype == 'object' or df[col].dtype == 'float':
            # df.dropna(subset = col, inplace= True)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df.dropna(subset = col, inplace= True)
        else:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df.dropna(subset = col, inplace= True)
    return df

>> format datetime

In [None]:
def convert_timestamp(ts):
    """
    Converts a Unix timestamp to a formatted date and time string.

    Args:
        ts (int): The Unix timestamp to convert.

    Returns:
        str: A formatted date and time string in the format 'YYYY-MM-DD HH:MM:SS'.
    """
    utc_datetime = datetime.datetime.utcfromtimestamp(ts)
    formatted_datetime = utc_datetime.strftime('%Y-%m-%d %H:%M:%S')
    formatted_datetime = pd.to_datetime(formatted_datetime, infer_datetime_format=True) 
    return formatted_datetime

convert_timestamp(ts) 

>> Remove unwanted Data

In [None]:
# Remove irrelevant columns
df.drop(['id', 'date'], axis=1, inplace=True) 

>> Missing values

In [None]:
def missing_value_analysis(df):
    '''
    Function to do basic missing value analysis
    Required Input - 
        - df = Pandas DataFrame
    Expected Output -
        - Chart of Missing value co-occurance
        - Chart of Missing value heatmap
    '''
    msno.matrix(df)
    msno.heatmap(df)

def view_NaN(df):
    """
    Prints the name of any column in a Pandas DataFrame that contains NaN values.

    Parameters:
        - df: Pandas DataFrame

    Returns:
        - None
    """
    for col in df.columns:
        if df[col].isnull().any() == True: 
            print(f"there is {df[col].isnull().sum()} NaN present in column:", col)
        else:
            print("No NaN present in column:", col)  

missing_value_analysis (df)
view_NaN(df) 

In [None]:
# Replace missing values 
# df.fillna(df.mean(), inplace=True) 

def treat_missing_numeric(df,columns,how = 'mean', value = None):
    '''
    Function to treat missing values in numeric columns
    Required Input - 
        - df = Pandas DataFrame
        - columns = List input of all the columns need to be imputed
        - how = valid values are 'mean', 'mode', 'median','ffill', numeric value
    Expected Output -
        - Pandas dataframe with imputed missing value in mentioned columns
    '''
    if how == 'mean':
        for i in columns:
            print("Filling missing values with mean for columns - {0}".format(i))
            df[i] = df[i].fillna(df[i].mean())
            
    elif how == 'mode':
        for i in columns:
            print("Filling missing values with mode for columns - {0}".format(i))
            df[i] = df[i].fillna(df[i].mode())
    
    elif how == 'median':
        for i in columns:
            print("Filling missing values with median for columns - {0}".format(i))
            df[i] = df[i].fillna(df[i].median())
    
    elif how == 'ffill':
        for i in columns:
            print("Filling missing values with forward fill for columns - {0}".format(i))
            df[i] = df[i].fillna(method ='ffill')
    
    elif how == 'digit':
        for i in columns:
            print("Filling missing values with {0} for columns - {1}".format(how, i))
            df[i] = df[i].fillna(str(value)) 
      
    else:
        print("Missing value fill cannot be completed")
    return df.head(5)


treat_missing_numeric(smart_home, ["cloudCover"], how="digit", value = 0.1) 

In [None]:
#using Sklearn to handle missing values - (SimpleImputer, KNN-Imputer, Iterative Imputer, )

#IterativeImputer: This function estimates missing values using a predictive model.
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor 

def impute_missing_values_iteratively(X): #or (X, Columns)
    imputer = IterativeImputer(estimator = RandomForestRegressor())
        
    # select only the columns with missing values to be imputed
    # X_cols = X[columns]
    X_imputed = imputer.fit_transform(X) #or X_cols
    return X_imputed

impute_missing_values_iteratively(df) 

>> Outliers

In [None]:
#visualize outliers
def visualize_outlier (df: pd.DataFrame):
    # Select only numeric columns
    numeric_cols = df.select_dtypes(include=["float64", "int64"])
    # Set figure size and create boxplot
    fig, ax = plt.subplots(figsize=(12, 6))
    numeric_cols.boxplot(ax=ax, rot=90)
    # Set x-axis label
    ax.set_xlabel("Numeric Columns")
    # Adjust subplot spacing to prevent x-axis labels from being cut off
    plt.subplots_adjust(bottom=0.4) 
    # Increase the size of the plot
    fig.set_size_inches(10, 6)
    # Show the plot
    plt.show()

visualize_outlier (df) 

In [None]:
#Visualizing Regression outliers using Cook's distance
from yellowbrick.regressor import CooksDistance


# Instantiate and fit the visualizer
visualizer = CooksDistance()
visualizer.fit(X, y)
visualizer.show() 

In [None]:
#to handle outliers, use any of Tukey's test, Kernel density estimation, Z-score method, Mahalanobis distance method,
#Isolation Forest model, EllipticEnvelope.


# Tukey's test: This statistical method identifies outliers as values more than a certain number of standard deviations 
#     away from the median and works well for univariate datasets with normal distributions.

# Kernel density estimation: This non-parametric method estimates the probability density function of a dataset 
#     and identifies outliers as values with low probability density, making it suitable for non-normal datasets.

# Z-score method: This simple method identifies outliers as values more than a certain number of standard deviations 
#     away from the mean and is widely used for datasets with normal distributions.

# Mahalanobis distance method: This multivariate method identifies outliers based on the distance of each point from the 
#     centroid of the dataset and is effective for datasets with multivariate normal distributions.

# Isolation Forest model: This machine learning algorithm identifies outliers by isolating them into a separate tree 
#     structure, making it suitable for high-dimensional feature spaces with both linear and non-linear relationships 
#     between features.

# EllipticEnvelope: This multivariate method identifies outliers by fitting an ellipse to the data and identifying 
#     points that are outside the ellipse, making it effective for datasets with multivariate normal distributions.

<a id="4b"></a> <br>
### Data Transformation (scaling, encoding categorical data)

>> Extracting features from Dates, Mixed Variables etc.

In [None]:
df_hour = df.resample('H').mean() 
df_day = df.resample('D').mean() 
df_month = df.resample('M').mean() 
df_year = df.resample('Y').mean()


df['hour'] = df.index.hour 
df['day'] = df.index.day 
df['weekday'] = df.index.day_name() 
df['month'] = df.index.month 
df['year'] = df.index.year 

>> Categorical Variable Encoding (data transformation)

In [None]:
# Label Encoding: replaces each category with a numerical label. This technique is suitable for data 
#     where the categories have an intrinsic order, such as "low," "medium," and "high." Works well with linear models

# Ordinal encoding: assigns a numerical value to each category based on their frequency. This technique is suitable 
#     for data where the categories do not have an intrinsic order, but where their frequency may be informative.
#     Suitable for non-linear models

# One-hot encoding: creates a binary variable for each category, indicating its presence or absence. 
#     This technique is suitable for data where the categories do not have an intrinsic order and the 
#     number of categories is small



>> Data Spliting 

In [None]:
# def holdout_cv(X,y,size = 0.3, seed = 1): 
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = size, random_state = seed) 
#     X_train = X_train.reset_index(drop='index') 
#     X_test = X_test.reset_index(drop='index') 
#     return X_train, X_test, y_train, y_test 

# X_train, X_test, y_train, y_test = holdout_cv(X, y, size=0.3, seed=1) 

# #OR

In [None]:
def split_data(dataset, test_size=0.2, val_size=0.2):
    """
    Split a dataset into training, validation, and test sets.

    Parameters
    ----------
    dataset : array-like of shape (n_samples, n_features)
        The input dataset.
    test_size : float, optional
        The proportion of the dataset to include in the test set.
    val_size : float, optional
        The proportion of the dataset to include in the validation set.

    Returns
    -------
    X_train : array-like of shape (n_train_samples, n_features)
        The training input samples.
    X_val : array-like of shape (n_val_samples, n_features)
        The validation input samples.
    X_test : array-like of shape (n_test_samples, n_features)
        The test input samples.
    y_train : array-like of shape (n_train_samples,)
        The target values (class labels) for the training input samples.
    y_val : array-like of shape (n_val_samples,)
        The target values (class labels) for the validation input samples.
    y_test : array-like of shape (n_test_samples,)
        The target values (class labels) for the test input samples.
    """
    # Split the dataset into train and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(dataset.data, dataset.target, test_size=test_size, random_state=42)

    # Split the train set into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=val_size/(1-test_size), random_state=42)

    #show the shapes
    print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)
    
    return X_train, X_val, X_test, y_train, y_val, y_test


# Split the dataset into training, validation, and test sets
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df) 


>> Feature Scaling (data transformation) - apply to train, and then to test

In [None]:
#use any of standard scaler (z-score), min-max scaler 

#Z-score standardized feature scaling (most common)
#min-max scaling (common) 

def standardize_data(X_train, X_test): 
    """
    Standardizes the training and testing data using the mean and standard deviation
    learned from the training set.
    
    Args:
    - X_train: numpy array or pandas dataframe, training data
    - X_test: numpy array or pandas dataframe, testing data
    
    Returns:
    - X_train_scaled: numpy array or pandas dataframe, standardized training data
    - X_test_scaled: numpy array or pandas dataframe, standardized testing data
    """
    from sklearn.preprocessing import StandardScaler 
    # Set up the scaler
    scaler = StandardScaler()
    
    # Fit the scaler to the training set
    scaler.fit(X_train) 
    
    # Transform the training and testing sets
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled

standardize_data(X_test, X_test) 

>> Variable Transformation - apply to train, and then to test

In [None]:
#Variable transformation involves transforming the values of variables to make them more suitable for analysis
#the idea is to make the variables normally/gaussian distributed. Hence, 

#first step is to assess normality using a histogram or QQ-plot (to explore the variable distribution)

def diagnostic_plots(df, variable):

    # function to plot a histogram and a Q-Q plot
    # side by side, for a certain variable

    plt.figure(figsize=(15, 6))

    # histogram
    plt.subplot(1, 2, 1)
    df[variable].hist(bins=30)
    plt.title(f"Histogram of {variable}")

    # q-q plot
    plt.subplot(1, 2, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    plt.title(f"Q-Q plot of {variable}")

    # check for skewness
    skewness = df[variable].skew()
    if skewness > 0:
        skew_type = "positively skewed"
    elif skewness < 0:
        skew_type = "negatively skewed"
    else:
        skew_type = "approximately symmetric"
        
    # print message indicating skewness type
    print(f"The variable {variable} is {skew_type} (skewness = {skewness:.2f})")
    
    plt.show()

# Check function output
diagnostic_plots(X, "MedInc")

In [None]:
#If the variables are NOT normally distributed, we then transform it. It is necessary to test several variable 
# transformation methods, and choose the best for that feature. One variable transformation method is log_transform

#log transform 
def log_transform(df, columns):
     """
    Transforms specified columns of a pandas DataFrame using the natural logarithm function.

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame to transform.
    columns : list
        A list of column names to transform.

    Returns:
    --------
    pandas DataFrame
        The transformed DataFrame.
    """
    transformer = FunctionTransformer(np.log1p, validate=True)
    X = df.values.copy()
    X[:, df.columns.isin(columns)] = transformer.transform(X[:, df.columns.isin(columns)])
    X_log = pd.DataFrame(X, index=df.index, columns=df.columns)
    return X_log

df_log = log_transform(df, columns)

diagnostic_plots(df_log, columns) 

>> Discretization (data transformation - apply to train, and then to test)

In [None]:
# Discretization in machine learning is the process of transforming continuous variables into discrete or 
# categorical variables. This process involves dividing the range of a continuous variable into a finite number of 
# intervals or bins, and then assigning each observation to a particular bin based on the value of the continuous 
# variable. 

#Discretization approaches: equal width, equal frequency, K means, Decision Trees



<a id="4c"></a> <br>
### Handling Imbalanced Data and Biases (Class Imbalance)

In [None]:
# apply to train, and then to test 

#Imbalanced data refers to a situation where the number of observations in one class or category is much larger 
# or smaller than the number of observations in other classes or categories. Imbalanced data can pose challenges 
# in machine learning because it can lead to biased models that perform poorly on the minority class.

#we typically look at the 'target data' when checking for imbalanced data. However, it is also important 
# to consider the features

def check_imbalance(dataset, columns=None, threshold=10):
    """
    This function takes a dataset and one or more columns as input and returns True if any of the specified columns
    are imbalanced, False otherwise. A column is considered imbalanced if the percentage of the minority class is less
    than the specified threshold.
    """
    # If no columns are specified, use all columns except for the last one as the features
    if columns is None:
        features = dataset.iloc[:, :-1]
        columns = features.columns
    
    # Check the imbalance of each specified column
    for col in columns:
        # Get the counts of each class in the column
        class_counts = dataset[col].value_counts()

        # Calculate the percentage of each class in the column
        class_percentages = class_counts / len(dataset) * 100

        # Plot the class percentages
        plt.bar(class_counts.index, class_percentages)
        plt.xlabel(col)
        plt.ylabel('Percentage')
        plt.title(f'{col} Distribution')
        plt.show()

        # Check if the column is imbalanced
        minority_class = class_counts.index[-1]
        minority_class_percentage = class_percentages.iloc[-1]
        if minority_class_percentage < threshold:
            print(f'{col} is imbalanced. Minority class: {minority_class}, Percentage: {minority_class_percentage:.2f}%')
            return True

    # If none of the specified columns are imbalanced, return False
    print('No imbalance found.')
    return False

check_imbalance(df, columns=['target'], threshold=10) 


#OR

from yellowbrick.target import ClassBalance 
# Instantiate the visualizer
visualizer = ClassBalance(labels=["draw", "loss", "win"])
visualizer.fit(y_train, y_test)        # Fit the data to the visualizer (you can also use visualizer.fit(y))
visualizer.show()                       # Finalize and render the figure


In [None]:
#if there is imbalance, you can handle it by over-sampling or under-sampling the dataset

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

def handle_imbalanced_data(X, y, strategy='over-sampling'):
    """
    Handle imbalanced data using imblearn library.
    
    Parameters:
    -----------
    X: array-like of shape (n_samples, n_features)
        The input data.
    y: array-like of shape (n_samples,)
        The target values.
    strategy: str, default='over-sampling'
        The strategy to use for handling imbalanced data. Possible values are
        'over-sampling' and 'under-sampling'.
        
    Returns:
    --------
    X_resampled: array-like of shape (n_samples_new, n_features)
        The resampled input data.
    y_resampled: array-like of shape (n_samples_new,)
        The resampled target values.
    """
    if strategy == 'over-sampling':
        # Initialize the RandomOverSampler object
        ros = RandomOverSampler(sampling_strategy='minority', random_state=0)
        # Resample the data
        X_resampled, y_resampled = ros.fit_resample(X, y)
    elif strategy == 'under-sampling':
        # Initialize the RandomUnderSampler object
        rus = RandomUnderSampler(sampling_strategy='majority', random_state=0)
        # Resample the data
        X_resampled, y_resampled = rus.fit_resample(X, y)
    else:
        raise ValueError("Invalid strategy. Possible values are 'over-sampling' and 'under-sampling'.")
    
    return X_resampled, y_resampled

<a id="4d"></a> <br>
### Data Reduction

<a id="5"></a> <br>
## Selecting and Training the Model 

In [None]:
# Before you choose a ML technique to train the model, consider if your ML technique is resistant to the following:
#     Missing data
#     Data imbalance
#     Feature Scaling
#     Categorical Data
#     Outliers
#     Dimensionality (refers to the number of features or variables in the dataset)

>> Use Pycaret and EvalMl to test multiple ML methods (AutoML)

In [49]:
#pycaret

from pycaret.classification import setup, compare_models, create_model, tune_model, plot_model, evaluate_model

X, y = load_iris(return_X_y=True, as_frame=True) 
X['target'] = y 
# Initialize classification setup 
# clf1 = setup(data=X, target='target', train_size = 0.8, 
#              preprocess = True, polynomial_features = True, 
#              polynomial_degree = 2, fix_imbalance = True,
#              fix_imbalance_method = 'SMOTE', feature_selection = True,
#              feature_selection_method = ' ', feature_selection_estimator = ,
#              n_features_to_select = 0.2) 


clf1 = setup(data=X, target='target', train_size = 0.8)

# Compare models 
compare_results = compare_models(n_select=5)    #the best 5 models will be highlighted

# Create a model
model = create_model('knn')     #change knn to any of the top 5 models from above

# # Tune the model
tuned_model = tune_model(model)

# # Evaluate the model
evaluate_model(tuned_model)

# # Fit the model
final_model = tune_model(tuned_model)


In [None]:
#EvalML (AutoML using EvalML doesn't just give you the best model, it also gives the best pipeline)
import evalml

X, y = evalml.demos.load_breast_cancer()
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(X, y, problem_type='binary') #change the problem type
                    #to see all problem types, use (evalml.problem_types.ProblemTypes.all_problem_types)

#Use EvalML's AutoML to perform the following steps:

#Step 1: search multiple ML methods and parameters
from evalml.automl import AutoMLSearch
automl = AutoMLSearch(X_train=X_train, y_train=y_train, problem_type='binary')  #also change the problem type
automl.search()

#Step 2: Rank each of the multiple ML algorithms to see their parameters and then choose the best
automl.rankings
# automl.describe_pipeline(automl.rankings.iloc[0]["id"]) #use this to describe each of the model/pipeline. change 0 to other values

#Step 3: Choose the best pipeline
best_pipeline=automl.best_pipeline
best_pipeline

#Step 4: You can evaluate other objective functions, or optimize the model for a specific objective
best_pipeline.score(X_test, y_test, objectives=["auc","f1","Precision","Recall"]) #evaluate other objective functions
automl_auc = AutoMLSearch(X_train=X_train, y_train=y_train,             #optimize step 1 for a specific objective
                          problem_type='binary',
                          objective='auc',
                          additional_objectives=['f1', 'precision'],
                          max_batches=1,
                          optimize_thresholds=True)

automl_auc.search()

#Step 5: Make predictions, save and load the model
best_pipeline.predict_proba(X_test).to_dataframe()
best_pipeline.save("model.pkl")
check_model=automl.load('model.pkl')


>> Then select the preferred model to use

In [None]:
# Train a random forest classifier on the dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


# # Generate a synthetic dataset with 1000 samples, 20 features, and 2 classes
# X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, weights=[1,1], random_state=1)

### Running Random Forest
def runRF(
    train_X,
    train_y,
    test_X,
    test_y=None,
    test_X2=None,
    rounds=100,
    depth=20,
    leaf=10,
    feat=0.2,
    min_data_split_val=2,
    seed_val=0,
    job=-1,
):
    model = RandomForestClassifier(
        n_estimators=rounds,
        max_depth=depth,
        min_samples_split=min_data_split_val,
        min_samples_leaf=leaf,
        max_features=feat,
        n_jobs=job,
        random_state=seed_val,
    )
    model.fit(train_X, train_y)
    train_preds = model.predict_proba(train_X)[:, 1]
    test_preds = model.predict_proba(test_X)[:, 1]

    test_preds2 = 0
    if test_X2 is not None:
        test_preds2 = model.predict_proba(test_X2)[:, 1]

    test_loss = 0
    if test_y is not None:
        train_loss = roc_auc_score(train_y, train_preds)
        test_loss = roc_auc_score(test_y, test_preds)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2, model

y_pred, test_loss, test_preds2, clf = runRF(
                                            X_train,
                                            y_train,
                                            X_val,
                                            y_val=None,
                                            test_X2=None,
                                            rounds=100,
                                            depth=20,
                                            leaf=10,
                                            feat=0.2,
                                            min_data_split_val=2,
                                            seed_val=0,
                                            job=-1,
                                        )


>> Compare predicted values with the actual test values

In [None]:
def plot_predictions(y_pred, y_test): 
    """
    Plots the predicted and actual values on separate scatter plots.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
    
    # Plot the actual values
    ax1.scatter(range(len(y_test)), y_test, label='Actual Values')
    ax1.set_xlabel('Index')
    ax1.set_ylabel('Actual Values')
    ax1.set_title('Scatter plot of Actual Values')
    ax1.legend()
    
    # Plot the predicted values
    ax2.scatter(range(len(y_pred)), y_pred, label='Predicted Values')
    ax2.set_xlabel('Index')
    ax2.set_ylabel('Predicted Values')
    ax2.set_title('Scatter plot of Predicted Values')
    ax2.legend()
    
    # Show the plots
    plt.show()

plot_predictions(y_pred, y_val)

>> Class Prediction Error

In [None]:
from yellowbrick.classifier import ClassPredictionError

# classes = ["apple", "kiwi", "pear", "banana", "orange"]

# Instantiate the classification model and visualizer
visualizer = ClassPredictionError(model, classes=classes)

# Fit the training data to the visualizer
visualizer.fit(X_train, y_train)

# Evaluate the model on the test data
visualizer.score(X_test, y_test)

# Draw visualization
visualizer.show()

<a id="6"></a> <br>
## Model Evaluation

In [None]:
#Model evaluation using scoring parameter
from sklearn.model_selection import cross_val_score


print(f' Score: {clf.score(X_val, y_val)}')   #R^2 for Regression; accuracy for classification. similar to cross_val_score


In [None]:
from sklearn.metrics import accuracy_score

def accuracy(model, data, labels):
    
    predictions = model.predict(data)
    acc = accuracy_score(labels, predictions)
    
    return acc

In [None]:
#Regression
    # Error = Actual value - Predicted value

    # MSE (Mean Square Error)
        # The square of the error over all samples is called Mean Squarred Error(MSE).
        # MSE = SQUARE(Actual value - Predicted value)/Number of Samples
    #RMSE (Root Mean Square Error)
    # MAE (Mean Absolute Error)
        # MAE = ABSOLUTE (Actual value - Predicted Value)


#Classification
    #Accuracy
    #Precision
    #Recall
    #F1 score 
    #confusion matrix
    #AUC ROC curve

# True Positives(TP): Number of samples that are correctly classified as positive, and their actual label is positive.

# False Positives (FP): Number of samples that are incorrectly classified as positive, when in fact their actual label 
#     is negative.

# True Negatives (TN): Number of samples that are correctly classified as negative, and their actual label is negative.

# False Negatives (FN): Number of samples that are incorrectly classified as negative, when in fact their actual label 
#     is positive.


>> Mean Square Error (MSE), Root Mean Square Error (RMSE), Mean Absolute Error (MAE), R-squared

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def predict(X_test, model, y_test):
    """
    Take the input data, model and labels and return predictions and evaluation metrics
    """
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mse)

    print("Mean Squared Error: ", mse)
    print("Root Mean Squared Error: ", rmse)
    print("Mean Absolute Error: ", mae)
    print("R-squared: ", r2)
    
    return preds, mse, rmse, mae, r2


predict(X_test, model, y_test) 

>> Confusion Matrix, Precision, Recall, Accuracy, F1-Score, AUC ROC Curve

In [None]:
#confusion matrix

accuracy = accuracy_score(y_test, y_pred)
class_names = digits.target_names

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function plots a confusion matrix.
    """
    cm = confusion_matrix(y_true, y_pred)
    
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

plot_confusion_matrix(y_test, y_pred, classes=class_names,
                      title='Confusion matrix, Accuracy = {:.2f}'.format(accuracy))

In [None]:
#AUC ROC curve (use this for binary classification)

def plot_roc(y_actual, y_pred):
    """
    Function to plot AUC-ROC curve
    """
    fpr, tpr, thresholds = roc_curve(y_actual, y_pred)
    plt.plot(
        fpr,
        tpr,
        color="b",
        label=r"Model (AUC = %0.2f)" % (roc_auc_score(y_actual, y_pred)),
        lw=2,
        alpha=0.8,
    )
    plt.plot(
        [0, 1],
        [0, 1],
        linestyle="--",
        lw=2,
        color="r",
        label="Luck (AUC = 0.5)",
        alpha=0.8,
    )
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic example")
    plt.legend(loc="lower right")
    plt.show()

    #To choose the threshold value that maximizes the Youden's J statistic
    # calculate Youden's J statistic for each threshold value
    J = tpr - fpr
    best_threshold = thresholds[np.argmax(J)]
    print('Best threshold:', best_threshold)
    
plot_roc(y_actual, y_pred) 




#OR
from yellowbrick.classifier import ROCAUC #yellow brick can be used for multiclass classification

visualizer = ROCAUC(model, classes=["win", "loss", "draw"])
visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and render the figure

# roc_auc(model, X_train, y_train, X_test=X_test, y_test=y_test, classes=['not_defaulted', 'defaulted']) #quick_method

In [None]:
#Classification Report (Precision, Recall, F1-Score)
from sklearn.metrics import classification_report
from yellowbrick.classifier import classification_report 


print(classification_report(y_test, y_pred))

# precision measures how many of the positive predictions made by the model are actually correct. 
# A high precision score indicates that the model is making very few false positive predictions.

# recall measures how many of the actual positive instances in the dataset are correctly predicted as positive 
# by the model. A high recall score indicates that the model is correctly identifying a large proportion of the 
# positive instances in the dataset.

#F1- Score is used to compare precision/recall numbers


#OR
# Instantiate the visualizer
visualizer = classification_report(
    model, X_train, y_train, X_test, y_test, classes=classes, support=True
)


In [None]:
#Precision - Recall

def plot_precisionrecall(y_actual, y_pred):
    """
    Function to plot AUC-ROC curve
    """
    average_precision = average_precision_score(y_actual, y_pred)
    precision, recall, _ = precision_recall_curve(y_actual, y_pred)
    # In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
    step_kwargs = (
        {"step": "post"} if "step" in signature(plt.fill_between).parameters else {}
    )

    plt.figure(figsize=(9, 6))
    plt.step(recall, precision, color="b", alpha=0.2, where="post")
    plt.fill_between(recall, precision, alpha=0.2, color="b", **step_kwargs)

    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title("Precision-Recall curve: AP={0:0.2f}".format(average_precision))

plot_precisionrecall(y_actual, y_pred)

#from the plot, we can pick a trade-off threshold where both precision and recall are high



#OR
from yellowbrick.classifier import PrecisionRecallCurve
# Create the visualizer, fit, score, and show it
viz = PrecisionRecallCurve(model, per_class=True,
                            cmap="Set1", iso_f1_curves=True, 
                            micro=False)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()

>> Access the Bias and Variance of the Model (to check for underfitting or overfitting) - Diagnostics

In [None]:
from sklearn.model_selection import learning_curve, cross_val_score, KFold, train_test_split
from yellowbrick.model_selection import CVScores #visualizing the cross validation scores

#check Bias and Variance using Cross Validation

cv = 5  #or # Create a cross-validation object: cv = KFold(n_splits=5, shuffle=True, random_state=42) 

def cv_bias_variance(model, X, y, cv):
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')   
    train_error = -scores.mean()
    val_error = -scores.std()
    return train_error, val_error, scores

# options to replace scoring:
#     regression: r2, neg_mean_absolute_error, explained_variance, neg_root_mean_squared_error, etc.
#     classification: accuracy, f1, roc_auc, precision, recall, etc.  


# Calculate the mean training and validation error scores
train_error, val_error, scores = cv_bias_variance(model, X, y, cv)
print("Mean training error:", train_error)
print("Mean validation error:", val_error)


visualizer = CVScores(model, cv=cv, scoring='r2')

visualizer.fit(X, y)        # Fit the data to the visualizer
visualizer.show()           # Finalize and render the figure




In [None]:
#check Bias and Variance using Learnng Curve

train_sizes = np.linspace(0.1, 1.0, 10) # Define the training set sizes to plot the learning curve

def cv_learning_curve(model, X, y, cv, train_sizes):
    train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=cv, n_jobs=-1, train_sizes=train_sizes, scoring='neg_mean_squared_error')
                                                #scoring parameter -  #https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 
    train_mean = np.mean(-train_scores, axis=1)
    train_std = np.std(-train_scores, axis=1)
    test_mean = np.mean(-test_scores, axis=1)
    test_std = np.std(-test_scores, axis=1)
    
    plt.figure(figsize=(10,6))
    plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training Error')
    plt.plot(train_sizes, test_mean, 'o-', color='green', label='Validation Error')
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='green')
    plt.xlabel('Number of Training Examples')
    plt.ylabel('Mean Squared Error')
    plt.title('Learning Curve')
    plt.legend(loc='best')
    plt.show()
    
    return train_sizes, train_mean, train_std, test_mean, test_std



# Generate learning curve plot
train_sizes, train_mean, train_std, test_mean, test_std = cv_learning_curve(model, X, y, cv, train_sizes)

# If the training and validation errors are both high and close to each other, it indicates that the model is underfitting 
# and has high bias. If the training error is low, but the validation error is high and there is a large gap between them, 
# it indicates that the model is overfitting and has high variance.

# Jtrain -> Jcv -> high and close to each other (high bias) - Jcv and Jtrain flatten out after a certain no of training samples, so getting more data would not likely help
# Jcv >> Jtrain (high variance) - 
# Jcv >> Jtrain -> high (high bias and high v_measure_score)

# To fix high bias:
#     get additional features or increase the model size. First perform a feature matrix before adding additional features
#     try adding polynomial features (feature engineering: create new features or transform existing features)
#     decrease the regularization parameter (lambda)

# To fix high variance:
#     get more training samples (more data)
#     simplify the data by reducing the number of features
#     Increase the regularization parameter (lambda)


>> Analyze Error Distribution

In [None]:
# if the errors are normally distributed around zero, it may indicate that the model is making unbiased predictions. 
# If there is a pattern or trend in the errors, it may suggest that the model has systematic biases or is making 
# consistent errors in certain regions of the input space



def analyze_error_distribution(y_true, y_pred):
    """
    Function to analyze the error distribution by plotting histograms and scatter plots.

    Parameters:
    -----------
    y_true : array-like
        Array of true labels or ground truth.
    y_pred : array-like
        Array of predicted values.

    Returns:
    --------
    None
    """
    # Calculate errors
    errors = y_true - y_pred

    # Plot histogram of errors
    plt.figure(figsize=(8, 6))
    plt.hist(errors, bins=20, alpha=0.75)
    plt.xlabel('Error')
    plt.ylabel('Frequency')
    plt.title('Error Distribution (Histogram)')
    plt.grid(True)
    plt.show()

    # Plot scatter plot of true labels vs. errors
    plt.figure(figsize=(8, 6))
    plt.scatter(y_true, errors, alpha=0.75)
    plt.xlabel('True Labels')
    plt.ylabel('Error')
    plt.title('Error Distribution (Scatter Plot)')
    plt.grid(True)
    plt.show()

    # Plot scatter plot of predicted values vs. errors
    plt.figure(figsize=(8, 6))
    plt.scatter(y_pred, errors, alpha=0.75)
    plt.xlabel('Predicted Values')
    plt.ylabel('Error')
    plt.title('Error Distribution (Scatter Plot)')
    plt.grid(True)
    plt.show()


analyze_error_distribution(y_val, y_pred)

>> Error Analysis - Diagnostics

In [None]:
#Error analysis is the process of analyzing the errors made by a machine learning model and identifying the patterns 
# or trends that may be causing the errors. The goal of error analysis is to gain insight into the behavior of the 
# model and identify areas for improvement.

# The steps involved in error analysis:
    # Collect error data
    # Categorize errors
    # Identify patterns
    # Analyze causes
    # Prioritize fixes
    
# Based on the insights gained from the error analysis, you can perform the following.
# False negatives:
# False negatives occur when the model predicts that a customer will not churn when they actually do churn. 
# To fix this issue, you may consider the following:
#     Increase the weight of the features that are more indicative of churn for low-usage customers, 
#         such as frequency of usage or specific product usage. (adjust the model parameters)
#     Add new features that may be predictive of churn, such as customer sentiment or customer service interactions.
#     Use a different model architecture that is better suited for handling imbalanced data, such as a decision tree 
#         or ensemble model.
# False positives:
# False positives occur when the model predicts that a customer will churn when they actually do not churn. 
# To fix this issue, you may consider the following:
#     Decrease the weight of features that are causing false positives, such as age or income, if they are not as 
#         indicative of churn for low-usage customers. (adjust the model parameters)
#     Remove features that are causing false positives altogether, if they are not providing significant value to the 
#         model.
#     Increase the size of the training dataset to capture a more representative sample of customers who do not churn, 
#         which may help the model learn more accurately which customers are likely to churn.


In [None]:
#Plot confusion matrix to visualize false positives and false negatives
    #By default, scikit-learn will assume that the "positive" class is the last label (or highest label value) 
    # in the list of labels. [0, 1] where 1 is Positive and is the class_of_interest.


class_names = [0, 1] #or iris().target_names #this is an example and should be edited. [0, 1] for binary classification
class_of_interest = 1 #this selects a specific class of interest other than 1 or the highest value. 
                        #always select the highest one because that is what Scikit learn uses. 

def false_positives(X_test, y_true, y_pred, classes):
    """ 
    This function identifies and plots the false positives in a classification problem. 
    """ 
    fp_indices = np.where((y_true != class_of_interest) & (y_pred == class_of_interest))[0] 
    fp_features = X_test[fp_indices] # assuming X_test is a numpy array of input data 
    # fp_features = X_test.iloc[fp_indices]
    fp_labels = y_pred[fp_indices] # assuming y_pred is a numpy array of predicted labels 
    # fp_labels = pd.Series(y_pred).iloc[fp_indices]

    print("False positives: ", len(fp_indices))
    return fp_features, fp_labels


#false negatives 
def false_negatives(X_test, y_true, y_pred, classes):
    """ 
    This function identifies and plots the false negatives in a classification problem. 
    """ 
    fn_indices = np.where((y_true == class_of_interest) & (y_pred != class_of_interest))[0] 
    fn_features = X_test[fn_indices] # assuming X_test is a numpy array of input data
    # fn_features = X_test.iloc[fn_indices] 
    fn_labels = y_pred[fn_indices] # assuming y_pred is a numpy array of predicted labels 
    # fn_labels = pd.Series(y_pred).iloc[fn_indices]

    print("False negatives: ", len(fn_indices))
    return fn_features, fn_labels


# Plot the confusion matrix to evaluate the performance of the model
plot_confusion_matrix(y_test, y_pred, classes=classes,
                      title='Confusion matrix, Accuracy = {:.2f}'.format(accuracy))

# Identify and plot the false positives
X_fp, y_fp = false_positives(X_test, y_test, y_pred, class_names)

# Identify and plot the false negatives
X_fn, y_fn = false_negatives(X_test, y_test, y_pred, class_names)

>> Feature Selection/Extraction

In [None]:
#access the most important features in the model

#depending on the results from the bias and variance tests, there may be need to assess which features
# are the most important in the ML model


def feature_importance(model,X):
    feature_importance = model.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(15, 15))
    plt.subplot(1, 2, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, X.columns[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

feature_importance(model,X_train)  



#OR
from yellowbrick.model_selection import FeatureImportances

viz = FeatureImportances(model, labels=labels, relative=False)
viz.fit(X, y)
viz.show()

In [None]:
# Feature Selection/Extraction (to fix high bias and high variance) - (apply to train, and then to test)

# having accessed the feature importance, you may then Add or Remove features, or make polynomial features

#Step 1: Manually adding or removing features

#AND/OR

#Step 2: Performing any of the following: VarianceThreshold, SelectKBest, Principal COmponent Analysis, 
# Independent Component Analysis (ICA), t-distributed Stochastic Neighbor Embedding (t-SNE)
# some common ones are these two transformers: PCA for feature extraction and SelectKBest for feature selection




In [None]:
#Adding Polynomial Features (to Fix High Variance) - do this only if there is high variance

from sklearn.preprocessing import PolynomialFeatures

def add_polynomial_features_sklearn(df, degree, columns=None):
    """
    Adds polynomial features up to a specified degree to a subset of columns in a Pandas DataFrame using Scikit-Learn's PolynomialFeatures.
    
    Parameters:
        df (Pandas DataFrame): The DataFrame to which the polynomial features will be added.
        degree (int): The maximum degree of polynomial features to add.
        columns (list of str): The names of the columns to which polynomial features will be added. If None, all columns will be used.
        
    Returns:
        Pandas DataFrame: A new DataFrame with the original columns and polynomial features up to the specified degree.
    """
    
    # Select the columns to which polynomial features will be added
    if columns is None:
        columns = df.columns
    df_subset = df[columns]
    
    # Create a copy of the original DataFrame to avoid modifying it
    new_df = df.copy()
    
    # Create a PolynomialFeatures transformer
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    
    # Transform the subset of the DataFrame with polynomial features
    poly_df = poly.fit_transform(df_subset)
    
    # Create column names for the new DataFrame
    col_names = poly.get_feature_names(df_subset.columns)
    
    # Create a new DataFrame with the polynomial features
    poly_df = pd.DataFrame(poly_df, columns=col_names, index=df_subset.index)
    
    # Merge the original DataFrame with the new DataFrame
    new_df = pd.concat([new_df, poly_df], axis=1)
    
    return new_df 

df_polynomial = add_polynomial_features_sklearn(df, degree, columns=None)


>> Performing Reglarization

In [None]:
#Another way to fix High Bias or Variance is to perform regularization on the model.
#this would involve increasing or decreasing the regularization parameter (lambda) to fix high variance or bias


# By tuning the hyperparameters of the model using cross-validation, 
# we would have effectively applied regularization to the model, which can help to reduce overfitting and improve 
# its generalization performance.

#Hence the next step is MODEL OPTIMIZATION. 

<a id="7"></a> <br>
## Model Optimization

In [None]:
#Steps to Model Optimization: 

#1.  Define the objective:
#     The objective of this example is to classify the iris flowers into three species (setosa, versicolor, and virginica) 
#     based on the four features (sepal length, sepal width, petal length, and petal width).

#2.  Choose hyperparameters to optimize:
    # In this example, we will optimize the hyperparameters of a Random Forest Classifier. We will tune the number of 
    # estimators, maximum depth of the tree, and minimum number of samples required to split a node.

#3.  Choose an optimization algorithm:
#     We will use Random Search to optimize the hyperparameters. Random Search randomly selects combinations of 
#     hyperparameters from a pre-defined search space.


In [398]:
# model.get_params()  #to get the parameters of the models in order to improve it

In [70]:
from sklearn.ensemble import RandomForestClassifier

RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
#5.  Train and evaluate the model:
    #     We will train the model using the training set and evaluate it on the validation set. We will use 
    #     RandomizedSearchCV to perform the random search.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV 
import numpy as np

rfc = RandomForestClassifier(random_state=42)

# rfc.get_params()  #to get the parameters of the models in order to improve it

# Define the search space (i.e., the hyperparameters to be tunes). This would vary depending on the model being used
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Perform random search
rfc_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

rfc_random.fit(X_train, y_train)

# Print the best hyperparameters
print(rfc_random.best_params_)


In [None]:
#After performing RandomSearchCV, it can be a good idea to perform GridSearchCV to get the best estimators
#we can then streamline the results as use it as an input for the GridSearchCV

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

rfc_grid = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'), random_grid, verbose=2, cv=3, refit=True)

rfc_grid.fit(X_train, y_train)
print(rfc_grid.best_params_) # Print the best hyperparameters


In [400]:
best_params = rfc_grid.best_estimator_

print(best_params)

RandomForestClassifier(max_depth=70, min_samples_leaf=2, min_samples_split=5,
                       n_estimators=1800, random_state=42)


In [385]:
# The best hyperparameters found by RandomizedSearchCV are:
#     n_estimators=1800, 
#     min_samples_split=5, 
#     min_samples_leaf=1, and 
#     max_depth=30.


# Print the best hyperparameters
print(rfc_grid.best_params_) 

{'n_estimators': 1800, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 70}


In [None]:
#6.  Fine-tune the model:
#        We will use the best hyperparameters found by RandomizedSearchCV to fine-tune the model.

rfc = RandomForestClassifier(n_estimators=1800,
                              min_samples_split=5,
                              min_samples_leaf=1,
                              max_depth=30,
                              random_state=42)

rfc.fit(X_train, y_train)

# Evaluate the model on the validation set
from sklearn.metrics import accuracy_score

y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")



In [None]:
#7.   Test the Model
  #     Finally, after optimizing and fine-tuning the model, we can use it to make predictions on new, unseen data

# Use the optimized model to make predictions on the new dataset
y_pred_new = rfc.predict(X_test)

# Print the predicted class labels and the actual class labels of the new dataset
print("Predicted class labels:", y_pred_new)
print("Actual class labels:   ", y_test) 

<a id="8"></a> <br>
## Model Deployment 

>> Save the ML model

In [None]:
import joblib

def save_model(model, filename):
    """
    Save a trained scikit-learn model to disk using joblib.
    """
    try:
        joblib.dump(model, filename)
        print(f"Model saved to {filename}")
    except Exception as e:
        print(f"Error saving model to {filename}: {e}")

save_model(model, 'model.joblib')

In [None]:
# #OR (save the model along with the preprcessor) - check the bottom to find out how preprocessor is used. 

# def save_model(model, preprocessor, filename):
#     """
#     Save a trained scikit-learn model and its preprocessor to disk using joblib.
#     """
#     try:
#         joblib.dump((model, preprocessor), filename)
#         print(f"Model saved to {filename}")
#     except Exception as e:
#         print(f"Error saving model to {filename}: {e}")


>> Deploy the Model

In [None]:
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    # Load the trained model from disk
    model = joblib.load('model.joblib')

    # Get the request data and convert to numpy array
    request_data = request.get_json()
    input_data = np.array(request_data['data'])

    # Make predictions using the trained model
    predictions = model.predict(input_data)

    # Return the predictions as a JSON response
    response = {'predictions': predictions.tolist()}
    return jsonify(response)

if __name__ == '__main__':
    app.run(debug=True)


In [None]:
#deploy the model here

# https://dashboard.render.com/
# sign-in with Github
#or use Heroku, GCP, AWS, AZUre, IBM Watson etc.

In [None]:
# # Train a scikit-learn model with preprocessor
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.datasets import load_iris

# iris = load_iris()
# X, y = iris.data, iris.target

# numeric_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])

# categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder())
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, [0, 1]),
#         ('cat', categorical_transformer, [2])
#     ])

# model = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression())
# ])

# model.fit(X, y)

# # Save the trained model and preprocessor to disk
# save_model(model, preprocessor, "model.joblib")


>>Tips

In [None]:
# Generate a synthetic dataset with 1000 samples, 20 features, and 2 classes
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, weights=[1,1], random_state=1)
feature_names = ['Feature_{}'.format(i) for i in range(X.shape[1])] # Create feature names


#you can use both RandomGridSearch and GridSearch to optimize a model. Use RandomGridSearch first, then GridSearch
rfc_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rfc_random = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'), random_grid, verbose=2, cv=3, refit=True)
        # estimator: This is the machine learning model or estimator that you want to optimize using hyperparameter tuning
        # param_distributions: This parameter specifies the hyperparameter space to be searched during the random search
        # n_iter: This specifies the number of random combinations of hyperparameter values to try during the search.
        # cv: This parameter determines the number of folds in the cross-validation process.
        # verbose: This controls the verbosity of the output during the hyperparameter search. 
        #     A higher value, such as verbose=2, means more detailed output will be displayed during the search.
        # random_state: This parameter sets the random seed for reproducibility
        # n_jobs: This specifies the number of CPU cores to use for parallelization during the hyperparameter search. 
        #     A value of -1 (n_jobs=-1) means that all available CPU cores will be used.
        # scoring: This parameter specifies the scoring metric used to evaluate the performance of the model 
        #     with different hyperparameter values. It can be set to a string representing a scoring metric, 
        #     such as 'accuracy', 'precision', 'recall', 'f1', etc., or an object of a custom scoring function
        # refit: This parameter determines whether the best hyperparameters found during the search should be used to 
        #     refit the model on the entire dataset after the search is complete.
        
# Cross-validation techniques (three of the most common):
#     K-fold Cross-Validation: It divides the data into k equally sized folds and performs training and testing on 
#         k iterations. It is widely used due to its simplicity and provides a good balance between bias and variance.
        from sklearn.model_selection import KFold
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

#     Stratified K-fold Cross-Validation: It is similar to K-fold cross-validation, but it ensures that each fold has an 
#         approximately equal distribution of target classes, making it suitable for imbalanced datasets.
        from sklearn.model_selection import StratifiedKFold
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     Time Series Cross-Validation: It is used for time series data, where the order of data points matters. 
#         It involves using a sliding time window to create overlapping train and test sets, taking into account 
#         temporal dependencies.
        from sklearn.model_selection import TimeSeriesSplit
        tscv = TimeSeriesSplit(n_splits=5)

#      Leave-One-Out Cross-Validation (LOOCV): It is a special case of K-fold cross-validation where k is set to the 
#         total number of samples, resulting in each sample being used as a test set once. It is computationally expensive 
#         but can be useful for small datasets.
        from sklearn.model_selection import LeaveOneOut
        loo = LeaveOneOut()


