# Init

In [None]:
import sys
sys.path.append("../src/")
from random_predictor import RandomPredictor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from  sklearn.metrics import classification_report, confusion_matrix
import joblib
import os
from dotenv import load_dotenv
from typing import Tuple, Dict, Any
import warnings 
warnings.filterwarnings("ignore")
plt.style.use("seaborn-v0_8-white")
load_dotenv()

In [None]:
# Both settings are expected to come from the environment (or the .env file
# loaded by load_dotenv() above). Fail early with an explicit message instead
# of an opaque TypeError from int(None) or a literal "None" data source.
_missing_settings = [
    name for name in ("RANDOM_STATE", "DATA_SOURCE_URL") if not os.getenv(name)
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        f"{', '.join(_missing_settings)}. "
        "Define them in the environment or in the .env file."
    )

# seed shared by the train/test split and the models
RANDOM_STATE = int(os.environ["RANDOM_STATE"])
# location of the CSV file read in the data gathering step
DATA_SOURCE_URL = os.environ["DATA_SOURCE_URL"]

# Data Wrangling

## Data Gathering

In [None]:
col_names = [
  "sepal_length", 
  "sepal_width", 
  "petal_length", 
  "petal_width", 
  "species"
]

raw_data = pd.read_csv(
  DATA_SOURCE_URL, 
  names=col_names
)
raw_data

## Assessing Data

In [None]:
# check dataset information
raw_data.info()

In [None]:
# check missing value(s) if any
raw_data.isnull().sum()

In [None]:
# check duplicate(s) data if any
raw_data.duplicated().sum()

In [None]:
# preview every row that belongs to a group of duplicates
# (keep=False marks all occurrences, not only the repeated ones)
raw_data[raw_data.duplicated(keep=False)]

## Data Cleaning

In [None]:
# copy dataframe to create new dataframe for cleaning purpose 
# and keep original raw data 
data = raw_data.copy()
data.info()

In [None]:
# from the previous steps, the only problem found is duplicated rows,
# so the cleaning step only removes them to keep the data clean.
data.drop_duplicates(keep="first", inplace=True)
data.reset_index(inplace=True, drop=True)
data.info()

In [None]:
# convert "species" to a proper datatype (category)
data["species"] = data["species"].astype("category")
data.info()

# Splitting

In [None]:
# split the dataset into train and test sets
# we specify the 'stratify' parameter to keep the class proportions balanced in both sets
# and specify the 'random_state' parameter to keep
# the split identical on every run of this cell
train_set, test_set = train_test_split(
                        data,
                        test_size=.2,
                        stratify=data["species"],
                        random_state=RANDOM_STATE
                      )

# preview data size
train_set.shape, test_set.shape

In [None]:
# quick preview data after splitting
display(train_set.sample(5))
display(test_set.sample(5))

# EDA

In [None]:
# Preview the original class distribution: absolute counts next to percentages
plt.figure(figsize=(14, 4))
plt.subplot(121)
data["species"]\
  .value_counts()\
  .sort_index()\
  .plot(
    kind="barh",
    title="Original Class Count",
    xlabel="count"
  );
plt.subplot(122)
# value_counts(normalize=True) returns fractions in [0, 1]; multiply by 100
# so the plotted values agree with the "percentage (%)" axis label
data["species"]\
  .value_counts(normalize=True)\
  .mul(100)\
  .sort_index()\
  .plot(
    kind="barh",
    title="Original Class Proportion",
    xlabel="percentage (%)"
  );
plt.show();

# Check the class proportion (should be balanced after the stratified split)
plt.figure(figsize=(14, 4))
for subplot_position, (subset_name, subset) in enumerate(
    [("Train", train_set), ("Test", test_set)], start=121
):
    plt.subplot(subplot_position)
    subset["species"]\
      .value_counts(normalize=True)\
      .mul(100)\
      .sort_index()\
      .plot(
        kind="barh",
        title=f"Class Proportion in {subset_name} Set",
        xlabel="percentage (%)"
      );
plt.show();

In [None]:
train_set.describe()

In [None]:
# check data distributions
train_set.hist(bins=20)
plt.show()

In [None]:
# check outliers
train_set.boxplot()

#### Checkpoints  

- Should we remove the outlier?

In [None]:
# check correlation
train_set.assign(species=train_set['species'].cat.codes).corr().style.background_gradient("bwr")

In [None]:
# check correlation in visualization for better understanding
sns.pairplot(data=train_set, hue="species");

# Preprocessing

## Feature Engineering

In [None]:
# Column names: every column except the last is a feature (X),
# the last column is the class / target (y)
features = data.columns[:-1].tolist()
target = [data.columns[-1]]

# Each engineered feature is the row-wise product of its source columns
engineered_features = {
    "sepal_size": ["sepal_length", "sepal_width"],
    "petal_size": ["petal_length", "petal_width"]
}

# Build a copy of the train set in which the engineered columns
# replace the raw measurements they are derived from
df_fe = train_set.copy()
for feature, source_columns in engineered_features.items():
    df_fe[feature] = train_set[source_columns].prod(axis=1)
    df_fe = df_fe.drop(columns=source_columns)

In [None]:
sns.scatterplot(data=df_fe, x="sepal_size", y="petal_size", hue="species");
plt.title("New Features from Feature Engineering");

In [None]:
df_fe.boxplot()

In [None]:
# check correlation
df_fe.assign(species=df_fe['species'].cat.codes).corr().style.background_gradient("bwr")

In [None]:
sns.pairplot(data=df_fe, hue="species")

Data Cases:
- Case 1: all original features, min-max scaled (`MinMaxScaler`, stored as `scaler`)
- Case 2: engineered features, standardized (`StandardScaler`, stored as `normalizer`)

In [None]:
# Define feature / X and class / target / y
X = train_set[features] #data[features]
y = train_set[target] # data[target]

X_fe = pd.DataFrame()
for feature in list(engineered_features.keys()):
    X_fe[feature] = X[engineered_features[feature]].prod(axis=1)

display(X.head(5))
display(X_fe.head(5))
display(y.head())

## Scaling & Encoding

In [None]:
scaler = MinMaxScaler()
normalizer = StandardScaler()
encoder = LabelEncoder()

scaler.fit(X)
normalizer.fit(X_fe)
encoder.fit(y)

In [None]:
X_train = scaler.transform(X)
display(X_train[:5])

X_train_fe = normalizer.transform(X_fe)
display(X_train_fe[:5])

In [None]:
plt.figure(figsize=(14,6))
plt.subplot(121)
sns.scatterplot(data=pd.concat([X_fe, y], axis=1), x="sepal_size", y="petal_size", hue="species")
plt.title("New Features from Feature Engineering")
plt.axvline(0, c='k', linestyle="--", linewidth=.5)
plt.axhline(0, c='k', linestyle="--", linewidth=.5)


tmp = pd.DataFrame(X_train_fe, columns=normalizer.feature_names_in_)
tmp['species'] = y.values
tmp.sort_values(by="species", inplace=True)

plt.subplot(122)
sns.scatterplot(data=tmp, x="sepal_size", y="petal_size", hue="species")
plt.title("New Features from Feature Engineering (Normalized)")
plt.axvline(0, c='k', linestyle="--", linewidth=.5)
plt.axhline(0, c='k', linestyle="--", linewidth=.5)
plt.show()

In [None]:
tmp = pd.DataFrame(X_train, columns=scaler.feature_names_in_)
tmp['species'] = y.values
tmp.sort_values(by="species", inplace=True)
tmp[tmp.columns[[0,2,3]]]

In [None]:
idx_x = 1
idx_y = 3

tmp = X.copy()
tmp['species'] = y.values
tmp.sort_values(by="species", inplace=True)
tmp = tmp[tmp.columns[[idx_x, idx_y, 4]]]
plt.figure(figsize=(14, 6))
plt.subplot(121)
sns.scatterplot(data=tmp, x=tmp.columns[0], y=tmp.columns[1], hue="species")
plt.title("Original Sepal")
plt.axvline(0, c='k', linestyle="--", linewidth=.5)
plt.axhline(0, c='k', linestyle="--", linewidth=.5)

tmp = pd.DataFrame(X_train, columns=scaler.feature_names_in_)
tmp['species'] = y.values
tmp.sort_values(by="species", inplace=True)
tmp = tmp[tmp.columns[[idx_x, idx_y, 4]]]
plt.subplot(122)
sns.scatterplot(data=tmp, x=tmp.columns[0], y=tmp.columns[1], hue="species")
plt.title("Scaled Sepal")
plt.axvline(0, c='k', linestyle="--", linewidth=.5)
plt.axhline(0, c='k', linestyle="--", linewidth=.5)
plt.axvline(1, c='k', linestyle="--", linewidth=.5)
plt.axhline(1, c='k', linestyle="--", linewidth=.5)

In [None]:
y_train = encoder.transform(y)
y_train[:5]

# Build Model  

Several input data cases and models will be used:
1. Random model, as the baseline model
2. Logistic Regression with all original features (min-max normalized) as model 1a
3. SVM with all original features (min-max normalized) as model 2a
4. Logistic Regression with the standardized engineered features as model 1b
5. SVM with the standardized engineered features as model 2b

In [None]:
# define models
# the random predictor is the baseline; it only receives the set of class labels
rnd = RandomPredictor(np.unique(y_train))
# NOTE: `multi_class` is intentionally left at its default. The parameter is
# deprecated in recent scikit-learn releases, and the default already fits a
# multinomial model for multiclass targets with the default solver.
lr = LogisticRegression(random_state=RANDOM_STATE)
lr_fe = LogisticRegression(random_state=RANDOM_STATE)
# probability=True is required so that predict_proba is available later on
svm = SVC(random_state=RANDOM_STATE, kernel='linear', probability=True)
svm_fe = SVC(random_state=RANDOM_STATE, kernel='linear', probability=True)

# fit/train models
# models without suffix use the scaled original features,
# "_fe" models use the transformed engineered features
lr.fit(X_train, y_train)
lr_fe.fit(X_train_fe, y_train)
svm.fit(X_train, y_train)
svm_fe.fit(X_train_fe, y_train)

# collect models into one variable for ease in later processes
# (the "_FE" suffix in the key is what later cells use to pick the input data)
trained_models = {
    "Random": rnd,
    "LogisticRegression": lr,
    "SVM": svm,
    "LogisticRegression_FE": lr_fe,
    "SVM_FE": svm_fe,
}

# Model Evaluation


In [None]:
y_test = test_set[target]
display(y_test)
y_test = encoder.transform(y_test)
y_test

In [None]:
X_test = scaler.transform((test_set[features]))
display(X_test[:5])

X_test_fe = pd.DataFrame()
for feature in list(engineered_features.keys()):
    X_test_fe[feature] = test_set[engineered_features[feature]].prod(axis=1)
display(X_test_fe[:5])
X_test_fe = normalizer.transform(X_test_fe)
display(X_test_fe[:5])

In [None]:
# Evaluate every trained model on the test set and print its report
for position, (model_name, estimator) in enumerate(trained_models.items(), start=1):
  # models whose name contains "FE" were trained on the engineered features
  if "FE" in model_name:
    model_input = X_test_fe
  else:
    model_input = X_test
  y_pred = estimator.predict(model_input)

  banner = f" {position}. Test Result from: '{model_name}' "
  print(f"{banner:=^55s}")
  print(classification_report(y_test, y_pred))
  print(confusion_matrix(y_test, y_pred))
  print("=" * 55, "\n\n")


# Test Use Model

The evaluation results show that the **SVM** model excels in both input data cases; therefore, let's try to do a prediction simulation here before creating an API.

## Local Test

In [None]:
# check species / class mapping
for iris in encoder.classes_:
  print(iris, "->", encoder.transform([iris]))

In [None]:
# Simulate input prediction
# A fixed row (index label 10) is used so the result is reproducible; replace
# the selection with `test_set.sample()` to try a random row instead.
# Selecting with a list of labels returns a one-row DataFrame and keeps the
# column dtypes (Series.to_frame().T would turn every column into `object`).
test_case = test_set.loc[[10]]
display(test_case)

# the model input must not contain the target column
test_case_input = test_case.drop(columns=["species"])
display(test_case_input)

In [None]:
# Try to make prediction from one data point
SVM = SVC()
SVM.fit(X_train, y_train)
SVM.predict(test_case_input)

#### Checkpoints  

- Is this prediction method correct?
- Why is the prediction result different from the evaluation? (in the evaluation, no mistakes in prediction for either the SVM or logistic regression model for the 'Iris-setosa' species)
- Does the model used to perform inference reflect the trained model?

In [None]:
# Try to make prediction with trained models.
# NOTE: the correct prediction should output [0], the Iris-setosa species

# predict one data point (sample) with every trained model,
# passing the test case exactly as it is
for model_name, estimator in trained_models.items():
    predicted_label = estimator.predict(test_case_input)
    print(f"{model_name} prediction: {predicted_label}")

#### Checkpoints  

- How to fix errors?
- How do we make predictions more meaningful to users?
- Why are the prediction results not correct?

In [None]:
# interpret / decode the class for better interpretation
for i in range(3):
  # YOUR CODE HERE 

In [None]:
# do data processing same as the dev. process before pass to the model
test_case_input_normalized = # YOUR CODE HERE (transform data) 
display(test_case_input_normalized)

# predict one data point (sample)
# the last two entries of trained_models are the "_FE" models, which need
# the engineered features and are therefore skipped here
for model in list(trained_models.keys())[:-2]:
    model_prediction = trained_models[model].predict(test_case_input_normalized)
    iris_species_prediction = encoder.inverse_transform(model_prediction)[0]
    # add probability as a confidence of prediction; it must be computed from
    # the same transformed input as the prediction itself
    prediction_proba = trained_models[model].predict_proba(test_case_input_normalized) * 100
    print(f"{model} prediction: {prediction_proba.max():.2f}% is {model_prediction} / {iris_species_prediction}")

In [None]:
# do data processing with feature engineering same as the dev. process before pass to the model
test_case_input_fe = pd.DataFrame() 
for feature in list(engineered_features.keys()):
    test_case_input_fe[feature] = test_case_input[engineered_features[feature]].prod(axis=1)
display(test_case_input_fe)

# predict one data point (sample)
test_case_input_fe_scaled = # YOUR CODE HERE (transform data) 
for model in list(trained_models.keys())[-2:]:
    model_prediction = trained_models[model].predict(test_case_input_fe_scaled)
    iris_species_prediction = encoder.inverse_transform(model_prediction)[0]
    prediction_proba = trained_models[model].predict_proba(test_case_input_fe_scaled) * 100
    print(f"{model} prediction: {prediction_proba.max():.2f}% is {model_prediction} / {iris_species_prediction}")

#### Checkpoints   

- If the model is to be used in API/Web, what kind of input can the user provide? Can the user directly input the dataframe?

## API Prep.

In [None]:
# save objects
saved_object_path = "../utils"
# make sure the target directory exists, otherwise joblib.dump fails
os.makedirs(saved_object_path, exist_ok=True)

# the random baseline is not persisted; pop() is a no-op when the cell is
# re-run and the entry has already been removed
trained_models.pop("Random", None)

# original feature names plus the recipe for every engineered feature
all_features = {
    "original": features,
    "engineered": engineered_features
}

joblib.dump(trained_models, f'{saved_object_path}/models.bin')
joblib.dump(df_fe, f'{saved_object_path}/data_feature_engineering.bin')

# save encoder, normalizer, scaler, and all_features
# (load_objects() reads them back from encoder.bin, normalizer.bin,
#  scaler.bin and features.bin inside saved_object_path)
# YOUR CODE HERE

In [None]:
# sample json input as a simulation input from user
raw_user_input = {
  "sepal_width": 3.7,
  "petal_width": 0.2,
  "petal_length": 1.5,
  "sepal_length": 5.4
}
raw_user_input

In [None]:
display(test_case_input)

In [None]:
def load_objects() -> Tuple[
        MinMaxScaler, 
        StandardScaler, 
        LabelEncoder, 
        Dict[str, Any], 
        Dict[str, object]
    ]:
    """
    Load the pre-saved preprocessing objects, feature definitions and models.

    Every object is read with joblib from the directory named by the
    module-level ``saved_object_path`` variable.

    Returns:
        Tuple[MinMaxScaler, StandardScaler, LabelEncoder, Dict[str, Any], Dict[str, object]]:
        - loaded_scaler: Content of ``scaler.bin``; used to scale the original features.
        - loaded_normalizer: Content of ``normalizer.bin``; used to transform the engineered features.
        - loaded_encoder: Content of ``encoder.bin``; used to decode model predictions.
        - loaded_features: Content of ``features.bin``; dictionary containing feature engineering instructions.
        - loaded_models: Content of ``models.bin``; dictionary of trained machine learning models.
    """
    # NOTE: joblib.load unpickles the files, so only load files from a trusted source.
    # The order of the names below defines the order of the returned tuple.
    loaded_scaler, loaded_normalizer, loaded_encoder, loaded_features, loaded_models = (
        joblib.load(f'{saved_object_path}/{object_name}.bin')
        for object_name in ("scaler", "normalizer", "encoder", "features", "models")
    )

    return loaded_scaler, loaded_normalizer, loaded_encoder, loaded_features, loaded_models

def data_pipeline_v1(
        raw_input_df: pd.DataFrame, 
        scaler: MinMaxScaler
    ) -> np.ndarray:
    """
    Preprocess raw input data by scaling it with the fitted scaler so that it
    is compatible with the model input requirements.

    The input columns may come in any order and may contain extra columns;
    only the columns the scaler was fitted on are used, in the fitted order.
    The input DataFrame itself is not modified.

    Args:
        raw_input_df (pd.DataFrame): The raw input data in DataFrame format.
        scaler (MinMaxScaler): Pre-trained scaler; must expose
            ``feature_names_in_`` (i.e. it was fitted on a DataFrame).

    Returns:
        np.ndarray: The output of ``scaler.transform`` for the selected
        columns, ready for model consumption.

    Raises:
        KeyError: If a feature the scaler was fitted on is missing from the input.
    """
    # rearrange dataframe columns to fit the scaler
    expected_features = list(scaler.feature_names_in_)

    # fail with an explicit message before pandas raises a generic KeyError
    missing_features = [
        name for name in expected_features if name not in raw_input_df.columns
    ]
    if missing_features:
        raise KeyError(f"Input is missing required feature(s): {missing_features}")

    # scale data
    return scaler.transform(raw_input_df[expected_features])

def data_pipeline_v2(
        raw_input_df: pd.DataFrame, 
        features: Dict[str, Any], 
        normalizer: StandardScaler
    ) -> np.ndarray:
    """
    Preprocess raw input data by applying feature engineering and the fitted
    transformer to prepare it for model input.

    Each engineered feature is the row-wise product of its source columns.
    The work is done on a copy, so the caller's DataFrame is left untouched.

    Args:
        raw_input_df (pd.DataFrame): The raw input data in DataFrame format.
        features (dict): Dictionary containing feature engineering instructions,
            mapping each new feature name to the list of columns to multiply.
        normalizer (StandardScaler): Pre-trained transformer; must expose
            ``feature_names_in_`` (i.e. it was fitted on a DataFrame).

    Returns:
        np.ndarray: The output of ``normalizer.transform`` for the engineered
        features, ready for model consumption.
    """
    # NOTE: Make sure the input is correct and feature names is correct
    # do feature engineering on a copy so the caller's data is not mutated
    engineered_df = raw_input_df.copy()
    for new_feature, source_columns in features.items():
        engineered_df[new_feature] = engineered_df[source_columns].prod(axis=1)

    # rearrange dataframe columns to fit the fitted transformer
    expected_features = list(normalizer.feature_names_in_)

    # scale data
    return normalizer.transform(engineered_df[expected_features])

def prediction_pipeline(
        raw_input_from_user: dict, 
        model_name: str
    ) -> Tuple[np.ndarray, float, str]:
    """
    Run the prediction pipeline, processing raw user input and making a prediction using the specified model.

    The saved objects are loaded through ``load_objects()`` on every call.
    The preprocessing pipeline is chosen from the number of features the model
    was trained on: models trained on as many features as the scaler use
    ``data_pipeline_v1``, all other models use ``data_pipeline_v2``.

    Args:
        raw_input_from_user (dict): Dictionary of user inputs formatted for model compatibility
            (one value per original feature name).
        model_name (str): The name of the model to use for prediction.

    Returns:
        Tuple[np.ndarray, float, str]: 
        - prediction: The raw prediction output from the model.
        - prediction_proba: The highest class probability from the model, in percent.
        - prediction_str: The human-readable string output decoded from the model prediction.

    Raises:
        KeyError: If ``model_name`` is not one of the saved models.
    """
    # 1. load objects
    (scaler, normalizer, encoder, features, models) = load_objects()
    try:
        model = models[model_name]
    except KeyError:
        raise KeyError(
            f"Unknown model '{model_name}'. Available models: {sorted(models)}"
        ) from None

    # 2. read input from user and convert to dataframe (a single-row frame)
    raw_input_df = pd.DataFrame([raw_input_from_user])

    # 3. process data / data pipeline
    # a model with as many inputs as the scaler was trained on the scaled
    # original features; any other model expects the engineered features
    if model.n_features_in_ == scaler.n_features_in_:
        input_data_to_model = data_pipeline_v1(raw_input_df, scaler)
    else:
        input_data_to_model = data_pipeline_v2(raw_input_df, features["engineered"], normalizer)
    
    # 4. prediction
    prediction = model.predict(input_data_to_model)
    # plain Python float, as promised by the return annotation
    prediction_proba = float(model.predict_proba(input_data_to_model).max() * 100)

    # 5. interpret output
    prediction_str = encoder.inverse_transform(prediction)[0]

    return prediction, prediction_proba, prediction_str

# (loaded_scaler, loaded_normalizer, loaded_encoder, loaded_features, loaded_models) = load_objects()
# raw_user_input_df = pd.DataFrame([raw_user_input])
# display(data_pipeline_v1(raw_user_input_df,loaded_scaler ))
# display(data_pipeline_v2(raw_user_input_df, loaded_features["engineered"], loaded_normalizer))

In [None]:
prediction_pipeline(raw_user_input, "SVM")