# Hands On

## 1 - Perkenalan

>Bab pengenalan harus diisi dengan identitas, gambaran besar dataset yang digunakan, dan objective yang ingin dicapai.

Nama:

Batch:

Problem Statement:  

Objective:

## 2 - Import Libraries

> Cell pertama pada notebook harus berisi dan hanya berisi semua library yang digunakan dalam project.

In [None]:
#Import Libraries

#import library
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#import FE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

#import model
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#import evaluation
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, recall_score

In [None]:
# Print the installed scikit-learn version so the environment used for this run is recorded
import sklearn
print(sklearn.__version__)

In [None]:
# Install feature_engine, pinned to version 1.6.2
!pip install feature_engine==1.6.2

## 3 - Data Loading

>Bagian ini berisi proses penyiapan data sebelum dilakukan eksplorasi data lebih lanjut. Proses Data Loading dapat berupa memberi nama baru untuk setiap kolom, mengecek ukuran dataset, dll.

In [None]:
# Read the beverages CSV straight from its raw GitHub URL and preview the frame
DATA_URL = 'https://raw.githubusercontent.com/Vincentim27/data/refs/heads/main/beverages.csv'
data = pd.read_csv(DATA_URL)
data

In [None]:
# Keep an independent copy of the loaded data
data_duplicate = data.copy(deep=True)

In [None]:
# Summarise the columns: dtype and non-null count of each one
data.info()

In [None]:
# Descriptive statistics of the numeric columns, one row per column
data.describe().transpose()

In [None]:
# Replace spaces in the column headers with underscores, so that the underscored
# names used by later cells (e.g. 'Sales_Volume_(L)') match the loaded frame.
data.columns = data.columns.str.replace(' ', '_')
data.columns

In [None]:
# Add 5 new rows to the dataset.
# Scalar values are repeated for every row; list values supply one entry per row.
new_data = {
    'Product_ID': [1001, 1002, 1003, 1004, 1005],
    'Sales_Volume_(L)': 2000,
    'Product_Category': 'Water',
    'Price_per_Liter_(IDR)': 5000,
    'Advertising_Spend_(USD)': 10000,
    'Number_of_Retailers': [300, 100, 200, 250, 50],
    'Temperature_(°C)': [22.2, 18.0, 21.5, 18.7, 24.3],
    'Holiday_Season': [1, 0, 0, 1, 1],
    'Market_Share_(%)': 2.2,
    'Competitor_Price_per_Liter_(IDR)': 7000,
}

# Stack the new rows under the loaded data; any column of `data` that is not
# listed in `new_data` is filled with NaN for these rows.
# NOTE(review): later cells split `data`, not `df2` — confirm which frame should carry the new rows.
df2 = pd.concat([data, pd.DataFrame(new_data)], ignore_index=True)
df2.tail()

## 4 - Exploratory Data Analysis (EDA)

>Bagian ini berisi eksplorasi data pada dataset di atas dengan menggunakan query, grouping, visualisasi sederhana, dan lain sebagainya.

In [None]:
# Draw a histogram (left) and a scatter plot (right) side by side
# NOTE(review): 'price', 'volume' and 'margin' do not match the column names used
# elsewhere in this notebook (e.g. 'Price_per_Liter_(IDR)') — confirm the intended columns.
fig, (ax_hist, ax_scatter) = plt.subplots(1, 2, figsize=(16, 5))

sns.histplot(data['price'], kde=True, bins=30, ax=ax_hist)
ax_hist.set_title('Histogram of price')

sns.scatterplot(x='volume', y='margin', data=data, ax=ax_scatter)
ax_scatter.set_title('volume Vs margin')

plt.show()

Statement ??

## 5 - Feature Engineering

> Bagian ini berisi proses penyiapan data untuk proses pelatihan model, seperti pembagian data menjadi train-test, transformasi data (normalisasi, encoding, dll.), dan proses-proses lain yang dibutuhkan.

### Split between X (Features) and y (target)

In [None]:
# Separate the features (X) from the target (y).
# The target header is 'Holiday_Season' once spaces have been replaced by
# underscores and 'Holiday Season' otherwise, so accept whichever is present.
target_col = 'Holiday_Season' if 'Holiday_Season' in data.columns else 'Holiday Season'

X = data.drop(columns=[target_col])
y = data[target_col]
X

### Splitting between Train-Set and Test-set

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

splits = train_test_split(X, y, test_size=0.2, random_state=40)
X_train, X_test, y_train, y_test = splits

# Report the shape of each feature split
for label, frame in (('Train Size: ', X_train), ('Test Size: ', X_test)):
    print(label, frame.shape)

In [None]:
# Preview the training features
X_train

### Handling Missing Value


In [None]:
# Average number of missing values per column in X_train (0 means nothing is missing)
X_train.isnull().sum().mean()

In [None]:
# Average number of missing values per column in X_test (0 means nothing is missing)
X_test.isnull().sum().mean()

In [None]:
# Number of missing target values in y_train; `.sum()` on a single column is already a scalar, so `.mean()` leaves it unchanged
y_train.isnull().sum().mean()

In [None]:
# Number of missing target values in y_test; `.sum()` on a single column is already a scalar, so `.mean()` leaves it unchanged
y_test.isnull().sum().mean()

In [None]:
# Show the training rows that contain at least one missing value
null = X_train[X_train.isnull().any(axis=1)]
null

In [None]:
# Check skewness of every numeric training feature
X_train.skew(numeric_only=True)

In [None]:
# Import library
from sklearn.impute import SimpleImputer

# Define imputers: median for numeric columns, most frequent value for the others
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

# Columns with a missing value in either split, grouped by dtype.
# Columns without missing values are left untouched.
cols_with_na = [col for col in X_train.columns
                if X_train[col].isnull().any() or X_test[col].isnull().any()]
numeric_cols = set(X_train.select_dtypes(include='number').columns)
num_na_cols = [col for col in cols_with_na if col in numeric_cols]
cat_na_cols = [col for col in cols_with_na if col not in numeric_cols]

# Apply imputation to numeric columns (fit on the training split only)
if num_na_cols:
    X_train[num_na_cols] = median_imputer.fit_transform(X_train[num_na_cols])
    X_test[num_na_cols] = median_imputer.transform(X_test[num_na_cols])

# Apply imputation to categorical columns (fit on the training split only)
if cat_na_cols:
    X_train[cat_na_cols] = mode_imputer.fit_transform(X_train[cat_na_cols])
    X_test[cat_na_cols] = mode_imputer.transform(X_test[cat_na_cols])

In [None]:
# Re-check: average number of missing values per column in X_train
X_train.isnull().sum().mean()

In [None]:
# Re-check: average number of missing values per column in X_test
X_test.isnull().sum().mean()

In [None]:
# Re-check: number of missing target values in y_train
y_train.isnull().sum().mean()

In [None]:
# Re-check: number of missing target values in y_test
y_test.isnull().sum().mean()

### Handling Outlier



In [None]:
# Helper that measures the skewness of selected columns
def check_skewness(df, *column_names):
    """Return a ``{column: skewness}`` dict for the requested columns.

    Args:
        df: DataFrame holding the data.
        *column_names: Names of the columns to measure.

    Returns:
        A dict, in the order the names were given, mapping each requested
        column that exists in ``df`` to ``df[column].skew()``. Names that are
        not columns of ``df`` are silently left out.
    """
    results = {}
    for name in column_names:
        if name not in df.columns:
            continue
        results[name] = df[name].skew()
    return results

In [None]:
# Numeric features whose skewness is measured on the training split
numeric_features = [
    'Sales_Volume_(L)',
    'Price_per_Liter_(IDR)',
    'Advertising_Spend_(USD)',
    'Number_of_Retailers',
    'Temperature_(°C)',
    'Market_Share_(%)',
    'Competitor_Price_per_Liter_(IDR)',
]
skewness_results = check_skewness(X_train, *numeric_features)

# Print one "column: skewness" line per measured feature
for col in skewness_results:
    skewness = skewness_results[col]
    print(f"{col}: {skewness}")

In [None]:
# Buckets for the three skewness categories
normal_columns, skewed_columns, extreme_skewed_columns = [], [], []

# |skewness| > 1 -> extreme, |skewness| <= 0.5 -> normal, anything else -> skewed
for col, skewness in skewness_results.items():
    magnitude = abs(skewness)
    if magnitude > 1.0:
        bucket = extreme_skewed_columns
    elif magnitude <= 0.5:
        bucket = normal_columns
    else:
        bucket = skewed_columns
    bucket.append(col)

# Print the columns in each category
print(f"Normal: {normal_columns}\nSkewed: {skewed_columns}\nExtreme Skewed: {extreme_skewed_columns}")

In [None]:
# Helper that reports how many rows fall outside the IQR fences of each column
def calculate_outlier_percentages(df, columns, distance):
    """Print the percentage of IQR-rule outliers for each listed column.

    A value is an outlier when it is below ``Q1 - distance * IQR`` or above
    ``Q3 + distance * IQR``, where Q1 and Q3 are the 0.25 and 0.75 quantiles
    of the column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame holding the data.
        columns: Iterable of column names to inspect.
        distance: Multiplier applied to the IQR to place the fences.

    Returns:
        None. One line per column is written to standard output.
    """
    total_rows = len(df)
    for variable in columns:
        series = df[variable]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = q3 - q1
        lower_boundary = q1 - (spread * distance)
        upper_boundary = q3 + (spread * distance)

        # Boolean mask of the rows lying outside either fence
        is_outlier = (series < lower_boundary) | (series > upper_boundary)
        outlier_percentage = int(is_outlier.sum()) / total_rows * 100

        print('Percentage of outliers in {}: {:.2f}%'.format(variable, outlier_percentage))

# Calculate outlier percentages before handling:
# 1.5 * IQR fences for skewed columns, 3 * IQR fences for extremely skewed ones
for cols, iqr_distance in ((skewed_columns, 1.5), (extreme_skewed_columns, 3)):
    calculate_outlier_percentages(X_train, cols, iqr_distance)

In [None]:
# One figure with two horizontal boxplot panels: skewed columns on the left,
# extremely skewed columns on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

panels = [
    (skewed_columns, "Boxplot for Skewed Columns"),
    (extreme_skewed_columns, "Boxplot for Extreme Skewed Columns"),
]
for ax, (cols, title) in zip(axes, panels):
    sns.boxplot(data=X_train[cols], orient="h", ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Values")
    ax.set_ylabel("Columns")

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
# Create function to apply winsorization
def apply_winsorization(train, variables, capping_method='iqr', tail='both', fold=3):
    """Cap outliers in the selected columns with feature_engine's ``Winsorizer``.

    Args:
        train: DataFrame to fit the capper on and to transform.
        variables: List of column names to cap.
        capping_method: Passed to ``Winsorizer`` (default ``'iqr'``).
        tail: Passed to ``Winsorizer`` (default ``'both'``).
        fold: Passed to ``Winsorizer`` (default ``3``).

    Returns:
        The frame returned by ``Winsorizer.fit_transform``, or an unmodified
        copy of ``train`` when ``variables`` is an empty list.
    """
    # Imported here because feature_engine is installed by a notebook cell
    # that runs after the top-of-notebook import cell.
    from feature_engine.outliers import Winsorizer

    # Nothing to cap when the selection is empty.
    # NOTE(review): Winsorizer is expected to reject an empty `variables` list — confirm against feature_engine 1.6.2.
    if variables is not None and len(variables) == 0:
        return train.copy()

    winsoriser = Winsorizer(capping_method=capping_method, tail=tail, fold=fold, variables=variables)
    train_capped = winsoriser.fit_transform(train)
    return train_capped

# Cap X_train in two passes: skewed columns with fold=1.5, then extremely
# skewed columns with the function's default fold
for cols, options in ((skewed_columns, {'fold': 1.5}), (extreme_skewed_columns, {})):
    X_train = apply_winsorization(X_train, cols, **options)

In [None]:
# Re-compute the outlier percentages after capping, using the same fences as before
for cols, iqr_distance in ((skewed_columns, 1.5), (extreme_skewed_columns, 3)):
    calculate_outlier_percentages(X_train, cols, iqr_distance)

In [None]:
# One figure with two horizontal boxplot panels of the capped data; the
# extremely-skewed panel draws its whiskers at 3 * IQR
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

panels = [
    (skewed_columns, "Boxplot for Skewed Columns", {}),
    (extreme_skewed_columns, "Boxplot for Extreme Skewed Columns", {'whis': 3}),
]
for ax, (cols, title, extra) in zip(axes, panels):
    sns.boxplot(data=X_train[cols], orient="h", ax=ax, **extra)
    ax.set_title(title)
    ax.set_xlabel("Values")
    ax.set_ylabel("Columns")

# Display the plot
plt.tight_layout()
plt.show()

### Feature Selection



In [None]:
# Install phik, then import it; `phik_matrix()` is called on a DataFrame further down
!pip install phik
import phik

In [None]:
# Put the training features and the training target side by side in one frame
concat_train = pd.concat(objs=[X_train, y_train], axis='columns')

# Preview the first five rows
concat_train.head(5)

In [None]:
# Create function to correlate variables with the target
def compute_phik_correlation(dataframe, columns, target=None):
    """Return the Phi-K correlation of each selected column with the target.

    Args:
        dataframe: DataFrame containing the feature columns and the target.
        columns: Feature column names to include.
        target: Name of the target column. Defaults to the last column of
            ``dataframe``.

    Returns:
        The ``target`` column of ``phik_matrix()`` computed on the selected
        columns plus the target.
    """
    if target is None:
        target = dataframe.columns[-1]

    # The target has to be part of the subset, otherwise the matrix has no column for it
    selected = list(columns)
    if target not in selected:
        selected.append(target)

    correlation_matrix = dataframe[selected].phik_matrix()
    return correlation_matrix[target]

# Feature columns whose Phi-K correlation is computed
columns = [
    'Sales_Volume_(L)',
    'Price_per_Liter_(IDR)',
    'Advertising_Spend_(USD)',
    'Product_Category',
    'Number_of_Retailers',
    'Temperature_(°C)',
    'Market_Share_(%)',
    'Competitor_Price_per_Liter_(IDR)',
]

# Compute the correlation on the combined training frame and show it
correlation = compute_phik_correlation(dataframe=concat_train, columns=columns)
print(correlation)

In [None]:
# Drop the columns whose correlation is below 0.05, from both splits
low_corr_columns = ['Sales_Volume_(L)', 'Price_per_Liter_(IDR)', 'Advertising_Spend_(USD)']
X_train = X_train.drop(columns=low_corr_columns)
X_test = X_test.drop(columns=low_corr_columns)
X_train

In [None]:
# Print the remaining column names of each split
for frame in (X_train, X_test):
    print(frame.columns)

### Split numerical and categorical columns

In [None]:
# Option 1 (not recommended): treat every non-object column as numerical
non_object = X_train.select_dtypes(exclude=['object'])
num_columns = list(non_object.columns)

# Option 2: list the columns of each group by hand
num_normal, num_skew = [], []
cat_encoded, cat_ordinal, cat_nominal = [], [], []

print(f'Numerical normal columns:\n{num_normal}')
print(f'Numerical skew columns:\n{num_skew}')
print(f'Categorical encoded columns:\n{cat_encoded}')

In [None]:
# Column-wise preprocessing: scale numerical columns, one-hot encode nominal
# columns, ordinal-encode ordinal columns
scaler = StandardScaler()
# sparse_output=False returns a dense array, so no `.toarray()` call is needed afterwards
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
# Named `ordinal_encoder` so the builtin `ord` is not shadowed
ordinal_encoder = OrdinalEncoder()

preprocess = ColumnTransformer(
    transformers=[
        ('num', scaler, num_columns),
        ('nom', ohe, cat_nominal),
        ('ord', ordinal_encoder, cat_ordinal),
    ],
    remainder='passthrough',  # leave the other columns as they are (for categoricals that are already encoded)
)

## 6 - Modeling

### Membuat pipeline

In [None]:
# One pipeline per candidate classifier, each starting with the shared preprocessing step
(pipe_log, pipe_svc, pipe_dt, pipe_rf, pipe_knn, pipe_nb, pipe_ada) = (
    make_pipeline(preprocess, estimator)
    for estimator in (
        LogisticRegression(max_iter=1000000),
        SVC(),
        DecisionTreeClassifier(random_state=10),
        RandomForestClassifier(random_state=10),
        KNeighborsClassifier(),
        GaussianNB(),
        AdaBoostClassifier(),
    )
)

### Cross validation untuk memilih best model

In [None]:
# 5-fold stratified splitter shared by every model
skfold = StratifiedKFold(n_splits=5)


def _cv_f1(pipeline):
    """Return the per-fold F1 scores of ``pipeline`` on the training split."""
    return cross_val_score(pipeline, X_train, y_train, cv=skfold, scoring='f1', n_jobs=-1)


# Cross-validated F1 scores for each candidate pipeline
cv_log_model = _cv_f1(pipe_log)
cv_svm_model = _cv_f1(pipe_svc)
cv_dt_model = _cv_f1(pipe_dt)
cv_rf_model = _cv_f1(pipe_rf)
cv_knn_model = _cv_f1(pipe_knn)
cv_nb_model = _cv_f1(pipe_nb)
cv_ada_model = _cv_f1(pipe_ada)

In [None]:
# Find the best model based on the mean cross-validation score
cv_results = {
    'log_model': cv_log_model,
    'svm_model': cv_svm_model,
    'dt_model': cv_dt_model,
    'rf_model': cv_rf_model,
    'knn_model': cv_knn_model,
    'nb_model': cv_nb_model,
    'ada_model': cv_ada_model,
}

# Start below any possible score so a model is selected even when every mean is 0
name_model = None
cv_scores = float('-inf')
for name, cv in cv_results.items():
    mean_score = cv.mean()
    std_score = cv.std()
    print(name)
    print('f1score - All - Cross Validation :', cv)
    print('f1score - Mean - Cross Validation :', mean_score)
    print('f1score - std - Cross Validation :', std_score)
    print('f1score - Range of Test Set :', (mean_score - std_score), '-', (mean_score + std_score))
    print('-' * 50)
    # Strict comparison: on a tie the model listed first is kept
    if mean_score > cv_scores:
        cv_scores = mean_score
        name_model = name
print('Best Model:', name_model)
print('Cross Val Mean from Best Model:', cv_scores)

In [None]:
# Fit the pipeline on the training data.
# `pipe_rf` is used because the evaluation cells below plot and report the
# Random Forest pipeline; swap in another pipeline if cross-validation selects a different model.
pipe_rf.fit(X_train, y_train)

In [None]:
# Get predictions for both training and test data.
# `pipe_rf` is used because the evaluation cells below plot and report the
# Random Forest pipeline; swap in another pipeline if cross-validation selects a different model.
y_pred_train = pipe_rf.predict(X_train)
y_pred_test = pipe_rf.predict(X_test)

### Model Evaluation before tuning

In [None]:
# Print recall score; the training predictions were made on X_train, so they are scored against y_train
print('Recall Score - Train Set  : ', recall_score(y_train, y_pred_train))
print('Recall Score - Test Set   : ', recall_score(y_test, y_pred_test))

In [None]:
# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Plot confusion matrix for training data (the split the pipeline is trained on)
train_matrix = ConfusionMatrixDisplay.from_estimator(pipe_rf, X_train, y_train, cmap='PuBu', ax=axes[0])
train_matrix.ax_.set_title('Confusion Matrix - Training Data')

# Plot confusion matrix for test data
test_matrix = ConfusionMatrixDisplay.from_estimator(pipe_rf, X_test, y_test, cmap='PuBu', ax=axes[1])
test_matrix.ax_.set_title('Confusion Matrix - Test Data')

plt.tight_layout()
plt.show()

### Simpan ke dalam table komparasi

In [None]:
# Helper that records recall and false-negative counts for one model
def performance_report(all_reports, y_resample, y_pred_train, y_test, y_pred_test, name):
    """Add one model's train/test recall and false-negative counts to ``all_reports``.

    Args:
        all_reports: Dict of reports keyed by model name; updated in place.
        y_resample: True labels of the training set.
        y_pred_train: Predicted labels of the training set.
        y_test: True labels of the test set.
        y_pred_test: Predicted labels of the test set.
        name: Key under which this model's report is stored.

    Returns:
        The same ``all_reports`` dict, now containing the entry for ``name``.
    """
    recall_train = recall_score(y_resample, y_pred_train)
    recall_test = recall_score(y_test, y_pred_test)

    # Element [1, 0] of each confusion matrix is reported as the false-negative count
    fn_train = confusion_matrix(y_resample, y_pred_train)[1, 0]
    fn_test = confusion_matrix(y_test, y_pred_test)[1, 0]

    all_reports[name] = {
        'Recall Train Set': recall_train,
        'Recall Test Set': recall_test,
        'False Negative Train': fn_train,
        'False Negative Test': fn_test,
    }
    return all_reports

# Start the comparison table with the untuned Random Forest, scored against y_train / y_test
all_reports = {}
all_reports = performance_report(all_reports, y_train, y_pred_train, y_test, y_pred_test, 'Random Forest without Tuning')

pd.DataFrame(all_reports)

### Model tuning pakai `GridSearchCV`

In [None]:
# Example: Random Forest is taken as the best model.
# Set up the parameter grid for Random Forest. `make_pipeline` names each step
# after its lowercased class name, so the forest's hyper-parameters use the
# 'randomforestclassifier__' prefix.
param_grid_rf = {
    'randomforestclassifier__n_estimators': [50, 100, 200, 300],
    'randomforestclassifier__max_depth': [1, 2, 3, 4],
    'randomforestclassifier__min_samples_split': [2, 3, 5, 7, 9],
    'randomforestclassifier__min_samples_leaf': [3, 5, 7, 9, 11],
    'randomforestclassifier__class_weight': ['balanced'],
}

# Set up the GridSearchCV object for Random Forest, reusing the stratified
# splitter defined for cross-validation
grid_search_rf = GridSearchCV(pipe_rf,
                              param_grid=param_grid_rf,
                              scoring='recall',
                              cv=skfold,
                              verbose=2,
                              n_jobs=-1)

# Fit GridSearchCV for Random Forest
grid_search_rf.fit(X_train, y_train)

# Best hyperparameters for Random Forest
print('Best hyperparameters for Random Forest:', grid_search_rf.best_params_)

# Best recall for Random Forest
print('Best recall for Random Forest:', grid_search_rf.best_score_)

# Save best Random Forest model to a variable
best_rf_model = grid_search_rf.best_estimator_

### Model evaluation after tuning

In [None]:
# Predict both splits with the tuned model
y_pred_train_tuned, y_pred_test_tuned = (
    best_rf_model.predict(features) for features in (X_train, X_test)
)

In [None]:
# Print recall score; the training predictions were made on X_train, so they are scored against y_train
print('Recall Score - Train Set  : ', recall_score(y_train, y_pred_train_tuned))
print('Recall Score - Test Set   : ', recall_score(y_test, y_pred_test_tuned))

In [None]:
# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Plot confusion matrix for training data (the split the grid search is fitted on)
train_matrix = ConfusionMatrixDisplay.from_estimator(best_rf_model, X_train, y_train, cmap='PuBu', ax=axes[0])
train_matrix.ax_.set_title('Confusion Matrix - Training Data')

# Plot confusion matrix for test data
test_matrix = ConfusionMatrixDisplay.from_estimator(best_rf_model, X_test, y_test, cmap='PuBu', ax=axes[1])
test_matrix.ax_.set_title('Confusion Matrix - Test Data')

plt.tight_layout()
plt.show()

### Compare before-after tuning

In [None]:
# Add the tuned model's results to the report, scored against y_train / y_test
all_reports = performance_report(all_reports, y_train, y_pred_train_tuned, y_test, y_pred_test_tuned, 'Random Forest with Tuning')
pd.DataFrame(all_reports)

### Model saving for Deployment

In [None]:
# Serialise the tuned model to 'best_rf_model.pkl' in the working directory
with open('best_rf_model.pkl', mode='wb') as model_file:
    pickle.dump(best_rf_model, model_file)

---
$$-- END --$$
___