In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


# Pre-Processing
from sklearn.model_selection import train_test_split # train-test-split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer # detect & handle NaNs
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder # Ordinal Encoding, Nominal Encoding
from category_encoders import BinaryEncoder # Nominal Encoding
from imblearn.under_sampling import RandomUnderSampler # undersampling
from imblearn.over_sampling import RandomOverSampler, SMOTE # oversampling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler # Scaling

# Modeling
## 1) Pipeline
from sklearn.pipeline import Pipeline, make_pipeline # to make pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector # apply pipeline to each column

## 2) Regression Models
from sklearn.linear_model import LinearRegression # if data is small and small_no_features
from sklearn.linear_model import SGDRegressor # if data is large: (can have penalty=constrains)
from sklearn.preprocessing import PolynomialFeatures # for polynomial regresion (then apply scaling after it)
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV # Regularization

## 2') Classfication Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB, MultinomialNB
from sklearn.svm import LinearSVC, SVC, LinearSVR, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier, VotingRegressor # Ensemble (Voting)
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor # Bagging & Pasting
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor # Boosting
from sklearn.ensemble import StackingClassifier, StackingRegressor # Stacking

## 3) Model Selection (Underfitting vs Overfitting) [bias variance tradeoff => perfect model complexity]
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV # (Train - Valid - Test) + hyperparameters tunning
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV # if data / features is large
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error # Evaluate Model: r2=> accuracy, L2-norm: if no outliers, L1-norm: if outliers
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, auc, confusion_matrix
from scipy import stats # Confidence Interval of Accuracy / Loss / Utility
import joblib # save model

# 4) Dimensionality reduction
from sklearn.decomposition import PCA, IncrementalPCA # till 20K features
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection # >20k features
from sklearn.manifold import LocallyLinearEmbedding, MDS, Isomap, TSNE # Manifold could be better than Projection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # for classfication problems (larg distance between diffrent classes)

# 5) Clustering
from sklearn.cluster import KMeans, MiniBatchKMeans # spherical dataset (n_cluster by (elbow / silhouette_score / silhoutette_samples))
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN # eps by K-distanceGraph
from sklearn.cluster import HDBSCAN # state of art
from sklearn.metrics import silhouette_score, silhouette_samples, davies_bouldin_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors

# Understand data

In [None]:
df = pd.read_csv('diabetic_data.csv')

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# !ls /content/drive/MyDrive/final project/diabetic_data.csv
# file_path = "/content/drive/MyDrive/final project/diabetic_data.csv"
# df = pd.read_csv(file_path)
# df

In [None]:
pd.set_option('display.max_columns', None)
df.columns

In [None]:
df.head(5)

In [None]:
df.replace('?', np.nan, inplace=True)
df.head(5)

In [None]:
df[['diag_1', 'diag_2', 'diag_3']] = df[['diag_1', 'diag_2', 'diag_3']].fillna(0)

In [None]:
df.info()
# drop weight, max_glu_serum, A1Cresult, encounter_id, patient_nbr, payer_code, medical_specialty
# check why num: diag_1, diag_2, diag_3

In [None]:
df.describe()
# number_inpatient	21
# number_emergency	76
# number_outpatient	42
# num_medications	81
# num_lab_procedures	132
# time_in_hospital  14
# gender = 'Unknown/Invalid'

In [None]:
df[df['number_diagnoses'] == 2]

In [None]:
df[df['discharge_disposition_id'] == 10]

In [None]:
df[df['time_in_hospital'] >= 10]

In [None]:
df[df['num_lab_procedures'] >= 100]

In [None]:
drop_1 = df[df['number_emergency'] >= 20].index

In [None]:
drop_2 = df[df['num_lab_procedures'] >= 100].index

In [None]:
drop_3 = df[df['num_medications'] > 60].index

In [None]:
drop_4 = df[df['gender'] == 'Unknown/Invalid'].index

In [None]:
drop_6 = df[df['number_inpatient'] >= 15].index

In [None]:
drop_7 = df[df['number_outpatient'] > 10].index
drop_7

In [None]:
# Gather the row labels flagged by every filter above; a label flagged by more
# than one filter simply appears more than once in the list.
drop_indices = [
    *drop_1, *drop_2, *drop_3,
    *drop_4, *drop_6, *drop_7,
]

# Remove the flagged rows in place, then renumber the remaining rows from 0.
df.drop(index=drop_indices, inplace=True)
df.reset_index(drop=True, inplace=True)
df

In [None]:
cat_col = df.select_dtypes(include='O').columns
for col in cat_col:
    print(f"the number of Uniques in {col} is {df[col].nunique()}")
    print(f"the uniques in {col}, is {df[col].unique()}")
    print()
    print("*" * 50)
    print()

# Extract New Features

In [None]:
def enhanced_conversion(code):
    """Split a raw diagnosis code into a numeric value plus prefix flags.

    Parameters
    ----------
    code : object
        Raw code, e.g. ``'250.83'``, ``'V45'``, ``'E909'``, ``0`` or a
        missing value (``None``, NaN, ``'None'``, empty string).

    Returns
    -------
    dict
        ``numeric_value`` (float), ``is_V``, ``is_E`` and ``is_numeric``
        (0/1 flags). Missing or unparseable codes return the all-zero
        default. For V/E codes the numeric part after the prefix is stored
        when it is a plain decimal number, otherwise 0.0.
    """
    conversion = {
        'numeric_value': 0.0,
        'is_V': 0,
        'is_E': 0,
        'is_numeric': 0
    }

    # A float NaN is a missing value; without this guard it would be parsed
    # as a "numeric" code whose value is NaN.
    if code is None or (isinstance(code, float) and code != code):
        return conversion

    code = str(code).strip()
    # Empty strings would otherwise fail on code[0].
    if not code or code == 'None':
        return conversion

    prefix, remainder = code[0].upper(), code[1:]
    if prefix in ('V', 'E'):
        conversion[f'is_{prefix}'] = 1
        # Accept digits with at most one decimal point, e.g. '45' or '45.1'.
        if remainder.replace('.', '', 1).isdigit():
            conversion['numeric_value'] = float(remainder)
        return conversion

    try:
        numeric_value = float(code)
    except ValueError:
        # Not a number and not a V/E code: keep the all-zero default.
        return conversion

    # Strings such as 'nan' or 'inf' parse as floats but are not usable codes.
    if not (float('-inf') < numeric_value < float('inf')):
        return conversion

    conversion['is_numeric'] = 1
    conversion['numeric_value'] = numeric_value
    return conversion


# Expand each diagnosis column into four derived columns named
# '<col>_numeric_value', '<col>_is_V', '<col>_is_E' and '<col>_is_numeric'.
for col in ['diag_1', 'diag_2', 'diag_3']:
    # Build the frame from the list of dicts in one step. Expanding with
    # `.apply(pd.Series)` constructs one Series per row and is far slower on
    # a large frame. The 0/1 flags keep their integer dtype this way.
    converted = pd.DataFrame(
        df[col].map(enhanced_conversion).tolist(),
        index=df.index,
    )
    df = pd.concat([df, converted.add_prefix(f'{col}_')], axis=1)

df[['diag_1_is_V', 'diag_1_is_E', 'diag_1_numeric_value']]

In [None]:
# (low, high, label) buckets for codes without a V/E prefix; bounds inclusive.
_NUMERIC_CODE_BUCKETS = (
    (1, 139, "001-139 (Infectious/Parasitic)"),
    (140, 239, "140-239 (Neoplasms)"),
    (240, 279, "240-279 (Endocrine/Metabolic)"),
    (280, 289, "280-289 (Blood Disorders)"),
    (290, 319, "290-319 (Mental Disorders)"),
    (320, 389, "320-389 (Nervous System)"),
    (390, 459, "390-459 (Circulatory)"),
    (460, 519, "460-519 (Respiratory)"),
    (520, 579, "520-579 (Digestive)"),
    (580, 629, "580-629 (Genitourinary)"),
    (630, 679, "630-679 (Pregnancy)"),
    (680, 709, "680-709 (Skin)"),
    (710, 739, "710-739 (Musculoskeletal)"),
    (740, 759, "740-759 (Congenital)"),
    (760, 779, "760-779 (Perinatal)"),
    (780, 799, "780-799 (Symptoms)"),
    (800, 999, "800-999 (Injury/Poisoning)"),
)


def map_numeric_to_range(is_v, is_e, numeric_value):
    """Return the category label for a diagnosis code.

    ``is_v`` / ``is_e`` flag a V- or E-prefixed code (the V flag is checked
    first). For those, ``numeric_value`` is compared as-is against the V
    (1-91) or E (800-999) range. For all other codes ``numeric_value`` is
    truncated with ``int()`` and matched against the numeric buckets;
    anything outside 1-999 yields ``"Other"``.
    """
    if is_v == 1:
        in_v_range = 1 <= numeric_value <= 91
        return "V01-V91 (Health Status Factors)" if in_v_range else "V_Other"

    if is_e == 1:
        in_e_range = 800 <= numeric_value <= 999
        return "E800-E999 (External Causes)" if in_e_range else "E_Other"

    value = int(numeric_value)
    for low, high, label in _NUMERIC_CODE_BUCKETS:
        if low <= value <= high:
            return label
    return "Other"

In [None]:
# Label every diagnosis column with its code range, walking the three derived
# columns in lockstep instead of materialising each row.
for col in ['diag_1', 'diag_2', 'diag_3']:
    v_flags = df[f'{col}_is_V']
    e_flags = df[f'{col}_is_E']
    code_values = df[f'{col}_numeric_value']
    df[f'{col}_range'] = [
        map_numeric_to_range(is_v=v_flag, is_e=e_flag, numeric_value=code_value)
        for v_flag, e_flag, code_value in zip(v_flags, e_flags, code_values)
    ]

df[['diag_1_numeric_value', 'diag_1_range', 'diag_2_numeric_value', 'diag_2_range']].head()

In [None]:
keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide',
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide',
        'metformin-pioglitazone', 'metformin-rosiglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin',
        'troglitazone', 'tolbutamide', 'acetohexamide']

# Per row, count how many of the listed columns hold a value other than 'No'
# or 'Steady'. `isin` is vectorized and avoids `DataFrame.applymap`, which is
# deprecated in recent pandas releases; the resulting counts are the same.
df['numchange'] = (~df[keys].isin(['No', 'Steady'])).sum(axis=1)

df['numchange'].value_counts()

In [None]:
df['service_utilization'] = df['number_outpatient'] + df['number_emergency'] + df['number_inpatient']

In [None]:
admission_type = df['admission_type_id']

# 1 when the admission type id is 1 or 2, otherwise 0.
df['is_emergency_admission'] = admission_type.isin([1, 2]).astype(int)

# Ids that are not keys of this mapping are left unchanged by `replace`.
# NOTE(review): confirm these labels against the dataset's id-mapping file
# (in particular the meaning of ids 5 and 7).
admission_labels = {
    1: 'emergency',
    2: 'urgent',
    3: 'elective',
    4: 'newborn',
    5: 'trauma',
    6: 'other',
    7: 'other',
    8: 'other',
}
df['admission_category'] = admission_type.replace(admission_labels)

In [None]:
discharge_id = df['discharge_disposition_id']

# 1 when the discharge disposition id is 1, otherwise 0.
df['discharge_to_home'] = discharge_id.eq(1).astype(int)

# Grouping discharge categories: each label lists the ids it absorbs.
# Ids not listed here are left unchanged by `replace`.
# NOTE(review): confirm the id -> label assignment against the dataset's
# id-mapping file.
discharge_groups = {
    'home': [1],
    'rehab': [3],
    'transfer': [5, 6, 10, 11, 14],
    'hospice': [22, 23],
    'short_hospital_stay': [2],
    'AMA': [7],
    'long_hospital_stay': [18],
    'death': [8],
}
discharge_labels = {
    disposition: label
    for label, dispositions in discharge_groups.items()
    for disposition in dispositions
}
df['discharge_care_level'] = discharge_id.replace(discharge_labels)

In [None]:
admission_source = df['admission_source_id']

# 1 when the admission source id is 7, otherwise 0.
df['admitted_from_emergency'] = admission_source.eq(7).astype(int)

# Grouping admission sources; ids outside the mapping are left unchanged.
# NOTE(review): confirm the id -> label assignment against the dataset's
# id-mapping file.
source_labels = {
    1: 'physician_referral',
    2: 'clinic_referral',
    3: 'HMO_referral',
    4: 'transfer_hospital',
    5: 'transfer_healthcare_facility',
    6: 'ER',
    7: 'ER',
    8: 'court_law',
    9: 'other',
}
df['referral_source'] = admission_source.replace(source_labels)

In [None]:
age_mapping = {f'[{i*10}-{(i+1)*10})': i*10+5 for i in range(10)}
df['age_midpoint'] = df['age'].map(age_mapping)

In [None]:
# Represent "no result" as the explicit string 'None' so that the encoding
# steps that follow (which replace 'None' with -99) actually match it.
# Depending on the pandas version, the literal 'None' in the CSV may already
# have been parsed as NaN; replacing NaN with the Python object None left
# those entries missing instead of giving them a category.
df['A1Cresult'] = df['A1Cresult'].fillna('None')
df['max_glu_serum'] = df['max_glu_serum'].fillna('None')

In [None]:
# Encode A1C results: '>7' and '>8' -> 1, 'Norm' -> 0, 'None' -> -99.
# The replacements are applied one after another, in this order.
a1c_encoded = df['A1Cresult']
for raw_value, code_value in (('>7', 1), ('>8', 1), ('Norm', 0), ('None', -99)):
    a1c_encoded = a1c_encoded.replace(raw_value, code_value)
df['A1Cresult'] = a1c_encoded

In [None]:
# Encode glucose serum results: '>200' and '>300' -> 1, 'Norm' -> 0,
# 'None' -> -99. The replacements are applied one after another, in this order.
glu_encoded = df['max_glu_serum']
for raw_value, code_value in (('>200', 1), ('>300', 1), ('Norm', 0), ('None', -99)):
    glu_encoded = glu_encoded.replace(raw_value, code_value)
df['max_glu_serum'] = glu_encoded

In [None]:
# Print the cardinality and the distinct values of every object-dtype column.
cat_col = df.select_dtypes(include='O').columns
# Iterate the columns selected on the previous line; the loop used to refer
# to a name that is not defined, which raises NameError.
for col in cat_col:
    print(f"the number of Uniques in {col} is {df[col].nunique()}")
    print(f"the uniques in {col}, is {df[col].unique()}")
    print()
    print("*" * 50)
    print()

# Uni-variate Analysis

In [None]:
# One row of plots per numeric column: box plot on the left, KDE on the right.
num_col = df.select_dtypes(include='number').columns
# squeeze=False keeps `axes` two-dimensional even when there is only one
# numeric column, so the `axes[i, 0]` indexing below always works.
fig, axes = plt.subplots(
    ncols=2,
    nrows=len(num_col),
    figsize=(12, len(num_col) * 5),
    squeeze=False,
)

for i, col in enumerate(num_col):
    sns.boxplot(x=df[col], ax=axes[i, 0])
    axes[i, 0].set_title(f"Box plot of {col}")

    sns.kdeplot(x=df[col], ax=axes[i, 1])
    axes[i, 1].set_title(f"KDE plot of {col}")

# Lay out after drawing so the titles are taken into account.
fig.tight_layout(pad=5.0)

In [None]:
# Pie chart for low-cardinality object columns, histogram for the rest.
cat_col = df.select_dtypes(include='object').columns
for col in cat_col:
    print(col)
    if df[col].nunique() < 7:
        # Kept in a local name of its own so the `dff` frame used by the
        # clustering section is never overwritten by this cell.
        category_counts = (
            df.groupby(col).size().reset_index(name="count")
            .sort_values(ascending=False, by="count")
        )
        cat_fig = px.pie(category_counts, names=col, values='count', title=f'distipution of {col}')
        cat_fig.show()
    else:
        # `else` rather than `> 7`: columns with exactly 7 distinct values
        # were previously skipped by both branches.
        cat_fig_2 = px.histogram(df, x=col , title=f'distribution of {col}')
        cat_fig_2.show()

In [None]:
# Drop unnecessary columns
unnecess_col = [
    'citoglipton', 'examide',
    'diag_1', 'diag_2', 'diag_3', 'age',
    'encounter_id', 'patient_nbr', 'weight',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'payer_code', 'medical_specialty'
]

In [None]:
df.drop(columns=unnecess_col, inplace=True)

In [None]:
df['readmitted'].value_counts()

In [None]:
df['readmitted'] = df['readmitted'].map({'NO': 0, '>30': 0, '<30': 1})

# Bi-Variate Analysis

In [None]:
num_df_col = df.select_dtypes(include='number')
corr_matrix = num_df_col.corr()

plt.figure(figsize=(16, 12))
sns.heatmap(corr_matrix,
            cmap='coolwarm',
            annot=True,
            fmt=".2f",
            linewidths=0.5,
)
plt.title('Correlation Matrix')
plt.show()

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(df, x='gender', hue='readmitted')
plt.title('distpution between Gender and readmitted')
plt.show()

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(df, x='race', hue='readmitted')
plt.title('distpution between race and readmitted')
plt.show()

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(df, x='diabetesMed', hue='readmitted')
plt.title('distpution between diabetesMed and readmitted')
plt.show()

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(df, x='age_midpoint', hue='readmitted')
plt.title('distpution between age_midpoint and readmitted')
plt.show()

# Multi-Variate Analysis

In [None]:
sns.pairplot(df)

# Clustering

In [None]:
unnecess_clust = ['readmitted']

dff = df.drop(columns=unnecess_clust)

In [None]:
# Column groups consumed by the clustering ColumnTransformer. Every column is
# listed in exactly one group: a column listed twice is encoded twice, and the
# duplicated features get double weight in distance-based clustering and PCA.

# One-hot encode nominal features.
# 'change', 'diabetesMed' and 'glipizide-metformin' are not listed here because
# they are already ordinal-encoded through `order_cat_spec` / `cat_order_no`.
categorical_ohe_cols = ['race', 'gender', 'admission_category']


# binary encode
cat_bin = ['diag_1_range', 'diag_2_range', 'diag_3_range', 'referral_source', 'discharge_care_level']


# Numerical features.
# 'num_lab_procedures' and 'num_medications' are handled by
# `numerical_cols_cont` below and therefore not repeated here.
numerical_cols = [
    'age_midpoint', 'time_in_hospital', 'num_procedures',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses',
    'diag_1_numeric_value', 'diag_2_numeric_value', 'diag_3_numeric_value', 'service_utilization',
    'numchange'
]

# Continuous numerical features (for log transformation)
numerical_cols_cont = ['num_lab_procedures', 'num_medications']

# Medication features (ordinal encoding)
medication_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide',
    'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'acarbose', 'miglitol', 'glyburide-metformin'
]

# Medication columns encoded with only the two levels 'No' / 'Steady'.
cat_order_no = [
    'metformin-pioglitazone', 'metformin-rosiglitazone', 'glimepiride-pioglitazone',
    'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide'
]

# Columns whose category order is given individually in `category_spec`.
order_cat_spec = ['tolazamide', 'change', 'diabetesMed']


# Define ordinal encoding categories for medications (one list per column)
ordinal_mapping_medication = [['No', 'Steady', 'Up', 'Down']] * len(medication_cols)

ordinal_map_no_steady = [['No', 'Steady']] * len(cat_order_no)

# Define ordinal encoding categories for `order_cat_spec`, in the same order
category_spec = [['No', 'Steady', 'Up'], ['No', "Ch"], ['No', 'Yes']]

In [None]:
# numerical pipeline: median imputation followed by standardisation
num_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='median')),
    ("scale", StandardScaler()),
])

# continuous numerical pipeline
# NOTE(review): currently identical to `num_pipe`; the column list it is
# applied to is described as "for log transformation" -- confirm whether a
# log step was intended here.
num_cont_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='median')),
    ("scale", StandardScaler()),
])

# one-hot encoding pipeline (first level of each feature dropped)
cat_oht_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("ohe", OneHotEncoder(drop='first'))
])

# binary encoding pipeline
cat_bin_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("Binary", BinaryEncoder())
])

# ordinal encoding for the columns in `order_cat_spec`
cat_order_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("ordinal", OrdinalEncoder(categories=category_spec))
])

# ordinal encoding for the medication columns
cat_order_med_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("ordinal", OrdinalEncoder(categories=ordinal_mapping_medication))
])

# ordinal encoding for the 'No' / 'Steady' medication columns
cat_order_no_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("ordinal", OrdinalEncoder(categories=ordinal_map_no_steady))
])

processor = ColumnTransformer(
    transformers=[
        ('num_pipe', num_pipe, numerical_cols),
        ('num_cont_pipe', num_cont_pipe, numerical_cols_cont),
        ('cat_oht_pipe', cat_oht_pipe, categorical_ohe_cols),
        ('cat_bin_pipe', cat_bin_pipe, cat_bin),
        ('cat_order_pipe', cat_order_pipe, order_cat_spec),
        ('cat_order_med_pipe', cat_order_med_pipe, medication_cols),
        ('cat_order_no_pipe', cat_order_no_pipe, cat_order_no),
    ],
    # Always return a dense array: the one-hot step produces sparse output,
    # and the code that consumes the result (len(), PCA, t-SNE) needs dense.
    sparse_threshold=0,
)

In [None]:
dff = dff.dropna(subset=['diag_1_numeric_value', 'diag_2_numeric_value', 'diag_3_numeric_value'])
dff = dff.reset_index(drop=True)

In [None]:
print(f"order_cat_spec: {len(order_cat_spec)}, category_spec: {len(category_spec)}")
print(f"medication_cols: {len(medication_cols)}, ordinal_mapping_medication: {len(ordinal_mapping_medication)}")
print(f"cat_order_no: {len(cat_order_no)}, ordinal_map_no_steady: {len(ordinal_map_no_steady)}")

In [None]:
X_processed = processor.fit_transform(dff)
X_processed

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_processed)
print(f"Reduced to {X_pca.shape[1]} dimensions")

In [None]:
np.sqrt(X_processed.shape[0] / 2)

In [None]:
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(X_processed)

# Visualize t-SNE
plt.figure(figsize=(12, 8))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], alpha=0.6)
plt.title('t-SNE Visualization of scaled Data')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

In [None]:
# # visualization using tnse 3d
# tsne = TSNE(n_components=3, perplexity=30, random_state=42)
# X_tsne = tsne.fit_transform(X_processed)

In [None]:
# tsne_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2', 'TSNE3'])
# tsne_df['readmitted'] = df['readmitted'].values

# # Plot
# fig = px.scatter_3d(
#     tsne_df,
#     x='TSNE1',
#     y='TSNE2',
#     z='TSNE3',
#     color='readmitted',
#     title='3D t-SNE Visualization of Patient Readmissions',
#     opacity=0.7,
#     color_discrete_sequence=px.colors.qualitative.Pastel,
#     template='plotly_white'
# )
# fig.update_layout(margin=dict(l=0, r=0, b=0, t=30))
# fig.show()


In [None]:
# # visualization using pca 3d
# pca = PCA(n_components=3)
# X_pca = pca.fit_transform(X_processed)

In [None]:
# pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2', 'PCA3'])
# pca_df['readmitted'] = df['readmitted'].values  # Color by target variable

# # Plot
# fig = px.scatter_3d(
#     pca_df,
#     x='PCA1',
#     y='PCA2',
#     z='PCA3',
#     color='readmitted',
#     title='3D PCA Visualization of Patient Readmissions',
#     opacity=0.7,
#     color_discrete_sequence=px.colors.qualitative.Vivid,
#     template='plotly_dark'
# )
# fig.update_layout(margin=dict(l=0, r=0, b=0, t=30))
# fig.show()


In [None]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=0.95)
# X_pca = pca.fit_transform(X_processed)
# print(f"Reduced to {X_pca.shape[1]} dimensions")

In [None]:
# Within-cluster sum of squares (inertia) for k = 2..9 on the processed data.
candidate_ks = range(2, 10)
ks = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, n_init=10).fit(X_processed)
    ks.append(kmeans.inertia_)

# Plot to find the "elbow"
plt.plot(candidate_ks, ks)
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()

In [None]:
# Fit one KMeans model per k in 2..11 and record its mean silhouette score.
# The fitted models are kept so later cells can reuse their labels.
n_clusters = range(2, 12)
clustering_models = []
silhouette_values = []
for k in n_clusters:
    kmeans_model = KMeans(n_clusters=k, init='k-means++', n_init=10).fit(X_processed)
    score = silhouette_score(X_processed, kmeans_model.labels_)
    silhouette_values.append(score)
    clustering_models.append(kmeans_model)

plt.plot(n_clusters, silhouette_values)
plt.xlabel("clusters")
plt.ylabel("silhouette")
plt.grid()
plt.show()

In [None]:
# 3) silhouette samples
# One silhouette diagram per k (2..11) on a 5x2 grid, using the models and the
# mean scores stored in `clustering_models` / `silhouette_values`.

from sklearn.metrics import silhouette_samples
from matplotlib.ticker import FixedLocator, FixedFormatter

plt.figure(figsize=(11, 9))

n_rows, n_cols = 5, 2
n_samples = X_processed.shape[0]

for k in range(2, 12):
    subplot_index = k - 1  # k = 2 goes to the first cell of the grid
    plt.subplot(n_rows, n_cols, subplot_index)

    y_pred = clustering_models[k - 2].labels_  # models are stored from k = 2
    silhouette_coefficients = silhouette_samples(X_processed, y_pred)

    # Vertical gap between the "knives" of consecutive clusters.
    padding = n_samples // 30
    pos = padding
    ticks = []
    for i in range(k):
        coeffs = silhouette_coefficients[y_pred == i]
        coeffs.sort()

        color = plt.cm.Spectral(i / k)
        plt.fill_betweenx(np.arange(pos, pos + len(coeffs)), 0, coeffs,
                          facecolor=color, edgecolor=color, alpha=0.7)
        ticks.append(pos + len(coeffs) // 2)
        pos += len(coeffs) + padding

    plt.gca().yaxis.set_major_locator(FixedLocator(ticks))
    plt.gca().yaxis.set_major_formatter(FixedFormatter(range(k)))

    # Label the y axis on the left column only.
    if subplot_index % n_cols == 1:
        plt.ylabel("Cluster")

    # Show x ticks and the x label on the bottom row only.
    if subplot_index > (n_rows - 1) * n_cols:
        plt.gca().set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        plt.xlabel("Silhouette Coefficient")
    else:
        plt.tick_params(labelbottom=False)

    # Dashed line: mean silhouette score for this k.
    plt.axvline(x=silhouette_values[k - 2], color="red", linestyle="--")
    plt.title(f"$k={k}$")

# save_fig("silhouette_analysis_plot")
plt.show()

In [None]:
# Step 2: Cluster the PCA-reduced data.
# n_init is set explicitly, like every other KMeans call in this notebook, so
# the result does not depend on the installed scikit-learn's default.
kmeans = KMeans(n_clusters=3, n_init=10)
clusters = kmeans.fit_predict(X_pca)

# Step 3: Visualize the first two principal components, coloured by cluster
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis')
plt.title('PCA + KMeans Clustering')
plt.show()

In [None]:
ks = []
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10)
    kmeans.fit(X_pca)
    ks.append(kmeans.inertia_)

# Plot to find the "elbow"
import matplotlib.pyplot as plt
plt.plot(range(2, 10), ks)
plt.xlabel("Number of Clusters")
plt.ylabel("ks")
plt.show()

In [None]:
n_clusters = range(2, 12)
clustering_models = []
silhouette_values = []
for k in n_clusters:
    kmeans_model = KMeans(n_clusters=k, init='k-means++', n_init=10)
    kmeans_model.fit(X_pca)
    silhouette_values.append(silhouette_score(X_pca, kmeans_model.labels_))
    clustering_models.append(kmeans_model)

plt.plot(n_clusters, silhouette_values)
plt.xlabel("clusters")
plt.ylabel("silhouette")
plt.grid()
plt.show()

In [None]:
# 3) silhouette samples
# One silhouette diagram per k (2..11) on a 5x2 grid for the PCA-reduced data,
# using the models and mean scores in `clustering_models` / `silhouette_values`.

from sklearn.metrics import silhouette_samples
from matplotlib.ticker import FixedLocator, FixedFormatter

plt.figure(figsize=(11, 9))

n_rows, n_cols = 5, 2
n_samples = X_pca.shape[0]

for k in range(2, 12):
    subplot_index = k - 1  # k = 2 goes to the first cell of the grid
    plt.subplot(n_rows, n_cols, subplot_index)

    y_pred = clustering_models[k - 2].labels_  # models are stored from k = 2
    silhouette_coefficients = silhouette_samples(X_pca, y_pred)

    # Vertical gap between the "knives" of consecutive clusters.
    padding = n_samples // 30
    pos = padding
    ticks = []
    for i in range(k):
        coeffs = silhouette_coefficients[y_pred == i]
        coeffs.sort()

        color = plt.cm.Spectral(i / k)
        plt.fill_betweenx(np.arange(pos, pos + len(coeffs)), 0, coeffs,
                          facecolor=color, edgecolor=color, alpha=0.7)
        ticks.append(pos + len(coeffs) // 2)
        pos += len(coeffs) + padding

    plt.gca().yaxis.set_major_locator(FixedLocator(ticks))
    plt.gca().yaxis.set_major_formatter(FixedFormatter(range(k)))

    # Label the y axis on the left column only.
    if subplot_index % n_cols == 1:
        plt.ylabel("Cluster")

    # Show x ticks and the x label on the bottom row only.
    if subplot_index > (n_rows - 1) * n_cols:
        plt.gca().set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        plt.xlabel("Silhouette Coefficient")
    else:
        plt.tick_params(labelbottom=False)

    # Dashed line: mean silhouette score for this k.
    plt.axvline(x=silhouette_values[k - 2], color="red", linestyle="--")
    plt.title(f"$k={k}$")

# save_fig("silhouette_analysis_plot")
plt.show()

In [None]:
kmeans = KMeans(n_clusters=3, n_init=10)
dff['cluster'] = kmeans.fit_predict(X_processed)

In [None]:
dff.groupby('cluster').size()

In [None]:
dff.columns

In [None]:
dff[['diag_1_range', 'diag_2_range', 'diag_3_range',]]

In [None]:
cluster_profile = dff.groupby('cluster').agg({
    'time_in_hospital': 'median',
    'num_medications': 'mean',
    'age_midpoint': 'median',
    'number_inpatient': 'median',
    'diag_1_numeric_value':'median',
    'diag_2_numeric_value':'median',
    'diag_3_numeric_value':'median',
    'diag_1_range': lambda x: x.mode()[0],
    'diag_2_range': lambda x: x.mode()[0],
    'diag_3_range': lambda x: x.mode()[0],
    'race': lambda x: x.mode()[0],
}).reset_index()


In [None]:
cluster_profile

In [None]:
neighbors = NearestNeighbors(n_neighbors=5)
neighbors_fit = neighbors.fit(X_pca)
# Query without passing X: the neighbours of the indexed points are returned
# and a point is not counted as its own neighbour. Passing the training data
# back in makes every point its own first neighbour (distance 0), so the last
# column would be the 4th nearest neighbour rather than the 5th.
distances, _ = neighbors_fit.kneighbors()
distances = np.sort(distances[:, -1])  # Sort the k-th nearest distances

# Plot k-distance graph
plt.plot(distances)
plt.xlabel("Data Points Sorted")
plt.ylabel("5th Nearest Neighbor Distance")
plt.title("k-NN Distance Plot (Choose Elbow Point for eps)")
plt.show()

In [None]:
# stop

In [None]:
dbscan = DBSCAN(eps=0.05, min_samples=5)
labels = dbscan.fit_predict(X_pca)

plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap="viridis", s=10)
plt.title("DBSCAN Clustering")
plt.show()

In [None]:
import umap
reducer = umap.UMAP(
    n_neighbors=10,       # Focus on local structure
    min_dist=0.05,        # Tighten clusters
    n_components=2,
    random_state=42,
    metric='euclidean'    # 'cosine' for high dimensional data
)

X_reduced = reducer.fit_transform(X_processed)

In [None]:
# Apply DBSCAN with chosen eps (adjust based on the elbow method)
dbscan = DBSCAN(eps=0.05, min_samples=5)
# Cluster the UMAP embedding `X_reduced`. `reducer` is the fitted UMAP
# estimator itself, not an array, so it can be neither clustered nor sliced.
labels = dbscan.fit_predict(X_reduced)

# Plot results
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=labels, cmap="viridis", s=10)
plt.title("DBSCAN Clustering")
plt.show()

In [None]:
import hdbscan

hdb = hdbscan.HDBSCAN(
    min_cluster_size=50,  # Smaller clusters
    min_samples=5,        # Fewer points to form a core point
    cluster_selection_epsilon=0.5,  # Merge nearby clusters
    alpha=1.0             # Balance cluster stability
)
labels = hdb.fit_predict(X_reduced)

plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=labels, cmap="viridis", s=10)
plt.title("HDBSCAN Clustering")
plt.show()

In [None]:
from sklearn.neighbors import NearestNeighbors

# Assign noise points to nearest cluster
noise_mask = labels == -1
# Only possible when there is at least one noise point AND at least one
# clustered point to borrow a label from; if everything is noise the labels
# are left untouched instead of fitting on an empty array.
if noise_mask.any() and not noise_mask.all():
    clustered_points = X_reduced[~noise_mask]
    clustered_labels = labels[~noise_mask]
    nn = NearestNeighbors(n_neighbors=1).fit(clustered_points)
    _, indices = nn.kneighbors(X_reduced[noise_mask])
    # Each noise point takes the label of its single nearest clustered point.
    labels[noise_mask] = clustered_labels[indices.flatten()]

In [None]:
plt.scatter(
    X_reduced[:, 0], X_reduced[:, 1],
    c=labels, cmap="viridis", s=10,
    edgecolor='none', alpha=0.8
)
plt.title("HDBSCAN Clustering (Noise Highlighted)")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.colorbar(label="Cluster Label")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

mask = labels != -1
if np.unique(labels[mask]).size > 1:  # Ensure at least two clusters exist
    silhouette = silhouette_score(X_reduced[mask], labels[mask])
    davies_bouldin = davies_bouldin_score(X_reduced[mask], labels[mask])
    calinski_harabasz = calinski_harabasz_score(X_reduced[mask], labels[mask])

    print(f"Silhouette Score (higher is better): {silhouette:.2f}")
    print(f"Davies-Bouldin Score (lower is better): {davies_bouldin:.2f}")
    print(f"Calinski-Harabasz Score (higher is better): {calinski_harabasz:.2f}")
else:
    print("Not enough clusters detected for meaningful evaluation.")


In [None]:
df_with_clusters = df.copy()
df_with_clusters["Cluster"] = labels

# Analyze the number of points in each cluster
cluster_counts = df_with_clusters["Cluster"].value_counts().sort_index()
print(cluster_counts)

In [None]:
noise_points = df_with_clusters[df_with_clusters["Cluster"] == -1]
noise_points.shape[0]

In [None]:
cluster_0 = df_with_clusters[df_with_clusters["Cluster"] == 0]
cluster_0.describe()

In [None]:
# Plot histogram of cluster counts
plt.bar(cluster_counts.index, cluster_counts.values, color="skyblue")
plt.xlabel("Cluster")
plt.ylabel("Number of Points")
plt.title("Cluster Distribution")
plt.show()

In [None]:
cluster_profile = df_with_clusters.groupby('Cluster').agg({
    'time_in_hospital': 'median',
    'num_medications': 'mean',
    'age_midpoint': 'median',
    'number_inpatient': 'median',
    'diag_1_numeric_value':'median',
    'diag_2_numeric_value':'median',
    'diag_3_numeric_value':'median',
    'diag_1_range': lambda x: x.mode()[0],
    'diag_2_range': lambda x: x.mode()[0],
    'diag_3_range': lambda x: x.mode()[0],
    'race': lambda x: x.mode()[0],
    'readmitted': lambda x: x.mode()[0],
}).reset_index()
cluster_profile

In [None]:
print(cluster_profile)

In [None]:
numeric_features = df_with_clusters.select_dtypes(include=[np.number]).columns

sns.set_style("whitegrid")

for feature in numeric_features:
    plt.figure(figsize=(10, 6))

    for cluster in sorted(df_with_clusters["Cluster"].unique()):
        cluster_data = df_with_clusters[df_with_clusters["Cluster"] == cluster][feature]
        sns.histplot(
            cluster_data, bins=20, kde=True, label=f"Cluster {cluster}", alpha=0.5
        )

    plt.xlabel(feature, fontsize=12)
    plt.ylabel("Density", fontsize=12)
    plt.title(f"Feature Distribution by Cluster: {feature}", fontsize=14)
    plt.legend(title="Cluster")
    plt.tight_layout()
    plt.show()

# Analyze noise points (if they exist)
if 'Cluster' in df_with_clusters.columns and -1 in df_with_clusters["Cluster"].values:
    noise_points = df_with_clusters[df_with_clusters["Cluster"] == -1]
    print("Noise point summary statistics:\n", noise_points.describe())


In [None]:
# stop

In [None]:
# from google.colab import files

# # Save DataFrame
# output_filename = "clustered_data.csv"
# df_with_clusters.to_csv(output_filename, index=False)

# # Download the file
# files.download(output_filename)


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# !ls /content/drive/MyDrive/final project/

In [None]:
# file_path_2 = "/content/drive/MyDrive/final project/clustered_data.csv"
# df_with_clusters = pd.read_csv(file_path_2)
# df_with_clusters

In [None]:
df_with_clusters = pd.read_csv('clustered_data.csv')
df_with_clusters

In [None]:
cat_col = df_with_clusters.select_dtypes(include='O').columns
for col in cat_col:
    print(f"the number of Uniques in {col} is {df_with_clusters[col].nunique()}")
    print(f"the uniques in {col}, is {df_with_clusters[col].unique()}")
    print()
    print("*" * 50)
    print()

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
df_with_clusters

In [None]:
print(df_with_clusters[['diag_2_numeric_value', 'diag_3_numeric_value', 'diag_1_numeric_value']].describe())

# Pre-Processing

Pre-Processing
* a) Detect & Handle Duplicates
* b) train_test_split
* c) Detect & Handle NaNs
* d) Detect & Handle Outliers
* e) Encoding: (Ordinal:[OrdinalEncoder, LabelEncoder] - Nominal: [< 7 uniques(OneHotEncoding), > 7 uniques (BinaryEncoder)])
* f) Imbalanced: X_train_resampled
* g) Scaling: StandardScaler, MinMaxScaler, RobustScaler: X_train_resampled_scaled

In [None]:
df_with_clusters.columns

a) Detect & Handle Duplicates

In [None]:
df_with_clusters.duplicated().sum()

In [None]:
df_with_clusters.drop_duplicates(inplace = True)
df_with_clusters.reset_index(inplace= True, drop= True)

b) train_test_split

In [None]:
X = df_with_clusters.drop('readmitted', axis=1)
y = df_with_clusters['readmitted']

In [None]:
y.value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class loge_transformer(BaseEstimator, TransformerMixin):
    """Element-wise natural-log transformer usable as a scikit-learn pipeline step.

    ``fit`` only records how many columns it saw; ``transform`` checks that the
    column count is unchanged and returns ``np.log(X)``.
    """

    def fit(self, X, y=None):
        """Record the number of columns in ``X`` and return ``self``.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        # scikit-learn's conventional attribute name for the fitted width.
        self.n_features_in_ = X.shape[1]
        # Original (misspelled) attribute kept so existing readers keep working.
        self.n_featuers_in_ = self.n_features_in_
        return self

    def transform(self, X, y=None):
        """Return ``np.log(X)``.

        Raises:
            ValueError: if ``X`` does not have the column count seen in ``fit``.
        """
        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
        if X.shape[1] != self.n_featuers_in_:
            raise ValueError(
                f"X has {X.shape[1]} features, but loge_transformer was "
                f"fitted with {self.n_featuers_in_} features."
            )
        return np.log(X)

In [None]:
# Column groups consumed by the ColumnTransformer defined in the next cell.
# NOTE(review): 'change' and 'diabetesMed' appear both here and in
# `order_cat_spec`, and 'glipizide-metformin' appears both here and in
# `cat_order_no`, so each of them is encoded twice -- confirm this is intended.

# One-hot encode categorical features
categorical_ohe_cols = [
    'race', 'gender', 'change', 'diabetesMed','glipizide-metformin', 'admission_category',

]


# binary encode
cat_bin = ['diag_1_range', 'diag_2_range', 'diag_3_range', 'referral_source', 'discharge_care_level']


# Numerical features
numerical_cols = [
    'age_midpoint', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses',
    'diag_1_numeric_value', 'diag_2_numeric_value', 'diag_3_numeric_value', 'service_utilization',
    'numchange'
]

# Continuous numerical features (for log transformation)
# NOTE(review): both columns are also listed in `numerical_cols` above.
numerical_cols_cont = ['num_lab_procedures', 'num_medications']

# Medication features (ordinal encoding)
medication_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide',
    'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'acarbose', 'miglitol', 'glyburide-metformin'

]

# Medication columns encoded with only the two levels 'No' / 'Steady'
cat_order_no = [
    'metformin-pioglitazone','metformin-rosiglitazone', 'glimepiride-pioglitazone',
    'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide'
]

# Columns that each get their own category list (see `category_spec` below)
order_cat_spec = ['tolazamide', 'change', 'diabetesMed']



# Define ordinal encoding categories for medications
# One identical category list per column in `medication_cols`.
# NOTE(review): this order ranks 'Down' above 'Up' -- confirm the intended ordering.
ordinal_mapping_medication = [['No', 'Steady', 'Up', 'Down']] * len(medication_cols)

# One ['No', 'Steady'] list per column in `cat_order_no`.
ordinal_map_no_steady = [['No', 'Steady']] * len(cat_order_no)

# Define ordinal encoding categories for `order_cat_spec`
# Position i holds the categories of order_cat_spec[i]: tolazamide, change, diabetesMed.
category_spec = [['No', 'Steady','Up'], ['No', "Ch"], ['No', 'Yes']]

In [None]:
# Per-column-group preprocessing pipelines, combined into `processor` below.
# Every pipeline first imputes missing values, then scales or encodes.

# numerical pipeline
num_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='median')),
    ("scale", StandardScaler()),
])

# continuous numerical pipeline
# NOTE(review): identical to `num_pipe` (no log step), and its columns are also
# in `numerical_cols`, so they are emitted twice -- confirm this is intended.
num_cont_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='median')),
    ("scale", StandardScaler()),
])

# one-hot encoding pipeline
cat_oht_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("ohe", OneHotEncoder(drop='first'))
])

# binary encoding pipeline
cat_bin_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("Binary", BinaryEncoder())
])

# ordinal encoding for `order_cat_spec` (per-column categories from `category_spec`)
cat_order_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("ordinal", OrdinalEncoder(categories=category_spec))
])

# ordinal encoding for `medication_cols`
cat_order_med_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("ordinal", OrdinalEncoder(categories=ordinal_mapping_medication))
])

# ordinal encoding for `cat_order_no` (two-level 'No' / 'Steady' columns)
cat_order_no_pipe = Pipeline(steps=[
    ("handle_nans", SimpleImputer(strategy='most_frequent')),
    ("ordinal", OrdinalEncoder(categories=ordinal_map_no_steady))
])

# Route each column list to its pipeline; columns not listed in any group are
# not passed to any of these pipelines.
processor = ColumnTransformer(transformers=[
    ('num_pipe', num_pipe, numerical_cols),
    ('num_cont_pipe', num_cont_pipe, numerical_cols_cont),
    ('cat_oht_pipe', cat_oht_pipe, categorical_ohe_cols),
    ('cat_bin_pipe', cat_bin_pipe, cat_bin),
    ('cat_order_pipe', cat_order_pipe, order_cat_spec),
    ('cat_order_med_pipe', cat_order_med_pipe, medication_cols),
    ('cat_order_no_pipe', cat_order_no_pipe, cat_order_no),
])

In [None]:
# Fit the preprocessing on the training rows only, then apply the fitted
# transformer to the test rows.
X_train_preprocessor = processor.fit_transform(X_train)
X_test_preprocessor = processor.transform(X_test)
X_test_preprocessor

In [None]:
# Column names of the raw (un-transformed) feature frame.
print(X.columns)

In [None]:
# The target needs no preprocessing; these names are plain aliases of y_train / y_test.
y_train_preprocessor = y_train
y_test_preprocessor = y_test
# Box plot of the training target values.
sns.boxplot(y_train_preprocessor)

In [None]:
# Sanity check: test features and test labels have the same number of rows.
print(len(X_test), len(y_test))

In [None]:
# Preprocess the training rows and oversample them with SMOTE in one pass.
# The result is the transformed, resampled training matrix used by the
# stand-alone model cells.
smote_pipeline = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
])
X_train_resampled, y_train_resampled = smote_pipeline.fit_resample(X_train, y_train)

In [None]:
# Carve a stratified 20% validation set out of the training rows.
# NOTE: X_val is a subset of X_train, so anything fitted on the full X_train
# (including X_train_resampled) has already seen these rows.
X_train_real, X_val, y_train_real, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True, stratify=y_train)

In [None]:
# Class counts after SMOTE.
print(y_train_resampled.value_counts())

# Modeling

## Simple Models (e.g. log_reg, linear_svm, knn, NB)

In [None]:
# Logistic-regression baseline with balanced class weights, trained on the
# resampled training matrix.
log_reg = LogisticRegression(
    C=1,
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
log_reg.fit(X_train_resampled, y_train_resampled)

In [None]:
# 5-fold cross-validated accuracy, then in-sample accuracy of the fitted model.
valid_acc = cross_val_score(
    log_reg,
    X_train_resampled,
    y_train_resampled,
    cv=5,
)
y_train_pred = log_reg.predict(X_train_resampled)
print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_train_pred)}")
print(f"Validation Accuracy: {valid_acc.mean()}")

In [None]:
# Out-of-fold predictions (5-fold) and the resulting confusion matrix.
y_vald_log_pred = cross_val_predict(log_reg, X_train_resampled, y_train_resampled, cv=5)
confusion_matrix(y_train_resampled, y_vald_log_pred)

In [None]:
# Precision / recall / F1 of the out-of-fold logistic-regression predictions.
percison_score_log = precision_score(y_true=y_train_resampled, y_pred=y_vald_log_pred)
recall_score_log = recall_score(y_true=y_train_resampled, y_pred=y_vald_log_pred)
f1_score_log = f1_score(y_true=y_train_resampled, y_pred=y_vald_log_pred)
print(f"Precision Score: {percison_score_log}")
# Print the computed values, not the metric functions themselves.
print(f"Recall Score: {recall_score_log}")
print(f"F1 Score: {f1_score_log}")

In [None]:
# Linear SVM scored with plain and with stratified 5-fold cross-validation.
Lin_svc_mod = LinearSVC(C=10, max_iter=10000, random_state=42)

# Standard K-Fold CV (Not ideal for imbalanced data)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Stratified K-Fold CV (Better for imbalanced data)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

kf_scores = cross_val_score(
    Lin_svc_mod, X_train_resampled, y_train_resampled,
    cv=kf, scoring='accuracy',
)
skf_scores = cross_val_score(
    Lin_svc_mod, X_train_resampled, y_train_resampled,
    cv=skf, scoring='accuracy',
)

print(f"K-Fold CV Scores: {kf_scores}, Mean: {kf_scores.mean():.4f}")
print(f"Stratified K-Fold CV Scores: {skf_scores}, Mean: {skf_scores.mean():.4f}")


In [None]:
# Gaussian Naive Bayes: 5-fold CV accuracy plus in-sample accuracy.
nb_model = GaussianNB()
valid_nb_acc = cross_val_score(nb_model, X_train_resampled, y_train_resampled, cv=5)

# fit() returns the estimator itself, so prediction can be chained.
y_nb_pred = nb_model.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_nb_pred)}")
print(f"Validation Accuracy: {valid_nb_acc.mean()}")


In [None]:
# Kernel SVC with a linear kernel: in-sample accuracy and 3-fold stratified CV.
svm_model = SVC(kernel='linear', C=1, random_state=42)

y_svm_pred = svm_model.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)
svm_train_acc = accuracy_score(y_train_resampled, y_svm_pred)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
valid_svm_acc = cross_val_score(
    svm_model,
    X_train_resampled,
    y_train_resampled,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)

print(f"Train Accuracy: {svm_train_acc:.4f}")
print(f"Validation Accuracy: {valid_svm_acc.mean():.4f}")

In [None]:
# Hinge-loss SGD classifier scored with stratified 5-fold CV.
sgd_model = SGDClassifier(
    loss="hinge",
    alpha=1e-4,
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf_sgd_scores = cross_val_score(
    sgd_model, X_train_resampled, y_train_resampled,
    cv=skf, scoring='accuracy', n_jobs=-1,
)

print(f"Optimized Stratified K-Fold CV Scores: {skf_sgd_scores}, Mean: {skf_sgd_scores.mean():.4f}")


In [None]:
# poly svc
# Degree-2 polynomial-kernel SVC with balanced class weights; fit on the
# resampled training matrix and keep its in-sample predictions.
poly_svm = SVC(C=1, kernel='poly', degree=2, random_state=42, class_weight='balanced')
poly_svm.fit(X_train_resampled, y_train_resampled)
poly_svm_pred = poly_svm.predict(X_train_resampled)

In [None]:
# 3-fold CV accuracy of the polynomial SVC next to its in-sample accuracy.
valid_acc = cross_val_score(
    poly_svm,
    X_train_resampled,
    y_train_resampled,
    cv=3,
)
print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=poly_svm_pred)}")
print(f"Validation Accuracy: {valid_acc.mean()}")

In [None]:
# 10-nearest-neighbours classifier (uniform weights): CV and in-sample accuracy.
knn_model = KNeighborsClassifier(weights='uniform', n_neighbors=10)
valid_knn_acc = cross_val_score(knn_model, X_train_resampled, y_train_resampled, cv=5)

y_knn_pred = knn_model.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_knn_pred)}")
print(f"Validation Accuracy: {valid_knn_acc.mean()}")

## Complex Model

In [None]:
# RBF-kernel SVC (C=10, gamma=1) with probability estimates enabled.
svm_rbf_model = SVC(
    kernel='rbf',
    C=10,
    gamma=1,
    probability=True,
    random_state=42,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf_scores = cross_val_score(
    svm_rbf_model, X_train_resampled, y_train_resampled,
    cv=skf, scoring='accuracy',
)

# In-sample predictions of the model fitted on the whole resampled set.
y_svm_rbf_pred = svm_rbf_model.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

print(f"train acc {accuracy_score(y_train_resampled, y_pred=y_svm_rbf_pred)}")
print(f"Stratified K-Fold CV Scores: {skf_scores}, Mean: {skf_scores.mean():.4f}")

In [None]:
# Grid search over C and gamma for the RBF SVC (5-fold CV, accuracy).
param_grid = {
    'C': [1, 5, 10, 20, 50],
    'gamma': ['scale', 0.1, 1, 5],
}

svm_rbf_model = SVC(kernel='rbf', probability=True, random_state=42)

grid_search = GridSearchCV(
    estimator=svm_rbf_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train_resampled, y_train_resampled)

print("Best parameters:", grid_search.best_params_)

# best_estimator_ is the winning configuration refitted on all the data passed to fit().
best_svm = grid_search.best_estimator_
print(f"Train Accuracy: {accuracy_score(y_train_resampled, best_svm.predict(X_train_resampled))}")
print(f"Validation Accuracy (CV): {grid_search.best_score_:.4f}")

In [None]:
# Shallow decision tree (max depth 5) on the resampled training matrix.
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=10, min_samples_leaf=5)
dt_model.fit(X_train_resampled, y_train_resampled)

# Predict on the resampled training set (in-sample), then score with 5-fold CV
y_dt_pred = dt_model.predict(X_train_resampled)
valid_dt_acc = cross_val_score(dt_model, X_train_resampled, y_train_resampled, cv=5)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_dt_pred)}")
print(f"Validation Accuracy: {valid_dt_acc.mean()}")

In [None]:
# Grid search over depth and split/leaf sizes of a decision tree (5-fold CV, accuracy).
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
}
grid_dt_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_dt_search.fit(X_train_resampled, y_train_resampled)

print("Best parameters:", grid_dt_search.best_params_)

best_dt = grid_dt_search.best_estimator_
print(f"Train Accuracy: {accuracy_score(y_train_resampled, best_dt.predict(X_train_resampled))}")
print(f"Validation Accuracy (CV): {grid_dt_search.best_score_:.4f}")

## Bagging

In [None]:
# Random forest #1: 1000 trees, depth capped at 7.
rfc_model_1 = RandomForestClassifier(
    n_estimators=1000,
    max_depth=7,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_rfc_1_acc = cross_val_score(rfc_model_1, X_train_resampled, y_train_resampled, cv=skf)

# In-sample predictions of the forest fitted on the whole resampled set.
y_rfc_model_1_pred = rfc_model_1.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_rfc_model_1_pred)}")
print(f"Validation Accuracy: {valid_rfc_1_acc.mean()}")

In [None]:
# Random forest #2: 1200 trees, depth capped at 10.
rfc_model = RandomForestClassifier(
    n_estimators=1200,
    max_depth=10,
    min_samples_split=4,
    min_samples_leaf=3,
    random_state=42
)

# Stratified 5-fold CV accuracy on the resampled training matrix.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_rfc_acc = cross_val_score(rfc_model, X_train_resampled, y_train_resampled, cv=skf)

# Fit on the whole resampled set and keep the in-sample predictions.
rfc_model.fit(X_train_resampled, y_train_resampled)
y_rfc_model_pred = rfc_model.predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_rfc_model_pred)}")
print(f"Validation Accuracy: {valid_rfc_acc.mean()}")

In [None]:
# Out-of-fold predictions of rfc_model (folds from the `skf` currently in scope)
# and their confusion matrix.
y_vald_rfc_pred = cross_val_predict(rfc_model, X_train_resampled, y_train_resampled, cv=skf)
confusion_matrix(y_train_resampled, y_vald_rfc_pred)

In [None]:
# Precision, recall and F1 of the out-of-fold random-forest predictions.
percison_rfc_score = precision_score(y_train_resampled, y_vald_rfc_pred)
print(f"Precision Score: {percison_rfc_score}")

recall_rfc_score = recall_score(y_train_resampled, y_vald_rfc_pred)
print(f"Recall Score: {recall_rfc_score}")

f1_rfc_score = f1_score(y_train_resampled, y_vald_rfc_pred)
print(f"F1 Score: {f1_rfc_score}")

In [None]:
# Randomized search (10 sampled configurations, 5-fold CV) over random-forest
# hyper-parameters. Candidates are ranked by ROC AUC, not accuracy.
param_dist = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)

# Randomized Search
random_search = RandomizedSearchCV(rf, param_dist, cv=5, scoring='roc_auc', n_jobs=-1, n_iter=10, random_state=42)
random_search.fit(X_train_resampled, y_train_resampled)

print("Best parameters:", random_search.best_params_)

best_rf = random_search.best_estimator_
print(f"Train Accuracy: {accuracy_score(y_train_resampled, best_rf.predict(X_train_resampled))}")
# best_score_ is the mean CV score of the `scoring` metric, i.e. ROC AUC here.
print(f"Validation ROC AUC (CV): {random_search.best_score_:.4f}")

In [None]:
# Random forest #4: 500 trees, depth 30, no bootstrap sampling.
rfc_model_4 = RandomForestClassifier(
    n_estimators=500,
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=False,
    random_state=42,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_rfc_4_acc = cross_val_score(rfc_model_4, X_train_resampled, y_train_resampled, cv=skf)

y_rfc_model_4_pred = rfc_model_4.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_rfc_model_4_pred)}")
print(f"Validation Accuracy: {valid_rfc_4_acc.mean()}")

In [None]:
# Random forest #3: same as #4 but with depth capped at 20.
rfc_model_3 = RandomForestClassifier(n_estimators=500, random_state=42, max_depth=20, min_samples_split=2, min_samples_leaf=4, bootstrap=False)

# Stratified 5-fold CV accuracy on the resampled training matrix.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_rfc_3_acc = cross_val_score(rfc_model_3, X_train_resampled, y_train_resampled, cv=skf)

# Fit on the whole resampled set and keep the in-sample predictions.
rfc_model_3.fit(X_train_resampled, y_train_resampled)
y_rfc_model_3_pred = rfc_model_3.predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_rfc_model_3_pred)}")
print(f"Validation Accuracy: {valid_rfc_3_acc.mean()}")

In [None]:
# Random forest #5: 400 trees, depth 15, no bootstrap sampling.
rfc_model_5 = RandomForestClassifier(
    n_estimators=400,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=False,
    random_state=42,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_rfc_5_acc = cross_val_score(rfc_model_5, X_train_resampled, y_train_resampled, cv=skf)

y_rfc_model_5_pred = rfc_model_5.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_rfc_model_5_pred)}")
print(f"Validation Accuracy: {valid_rfc_5_acc.mean()}")

In [None]:
# Out-of-fold predictions of rfc_model_5 and their confusion matrix.
y_vald_rfc5_pred = cross_val_predict(rfc_model_5, X_train_resampled, y_train_resampled, cv=skf)
confusion_matrix(y_train_resampled, y_vald_rfc5_pred)

In [None]:
# Precision, recall and F1 of the out-of-fold rfc_model_5 predictions.
percison_rfc5_score = precision_score(y_train_resampled, y_vald_rfc5_pred)
print(f"Precision Score: {percison_rfc5_score}")

recall_rfc5_score = recall_score(y_train_resampled, y_vald_rfc5_pred)
print(f"Recall Score: {recall_rfc5_score}")

f1_rfc5_score = f1_score(y_train_resampled, y_vald_rfc5_pred)
print(f"F1 Score: {f1_rfc5_score}")

In [None]:
# Manual nested cross-validation of a preprocess -> SMOTE -> random-forest
# pipeline: the outer loop holds out one fold of the raw training rows, the
# inner GridSearchCV tunes the forest on the remaining rows only.
param_dist = {
    'rfc__n_estimators': [100, 300, 500],
    'rfc__max_depth': [10, 20, 30, None],
    'rfc__min_samples_split': [2, 5, 10],
    'rfc__min_samples_leaf': [1, 2, 4],
    'rfc__bootstrap': [True, False]
}


train_pipeline = ImbPipeline([
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('rfc', RandomForestClassifier(random_state=42))
])

# The inner folds come from `skf`, which must already be defined by an earlier cell.
grid_rfc_search = GridSearchCV(train_pipeline, param_dist, cv=skf, scoring='accuracy', n_jobs=-1)

valid_accuracy_score = []
valid_f1_score = []
valid_precision_score = []
valid_recall_score = []

skfolds = StratifiedKFold(n_splits=5)
for i, (train_indx, valid_indx) in enumerate(skfolds.split(X_train, y_train), start=1):
    print(f"At fold {i}")

    X_fold_train, y_fold_train = X_train.iloc[train_indx], y_train.iloc[train_indx]
    X_fold_valid, y_fold_valid = X_train.iloc[valid_indx], y_train.iloc[valid_indx]

    # Each outer fold runs a full grid search: 3*4*3*3*2 = 216 candidate
    # settings, each fitted once per inner fold, plus the final refit.
    grid_rfc_search.fit(X_fold_train, y_fold_train)
    # Name kept for backward compatibility; it holds the best random-forest
    # pipeline of this fold, not an SVM.
    best_svm_clf_model = grid_rfc_search.best_estimator_
    y_valid_pred = grid_rfc_search.predict(X_fold_valid)

    valid_accuracy_score.append(accuracy_score(y_fold_valid, y_valid_pred))
    valid_f1_score.append(f1_score(y_fold_valid, y_valid_pred))
    valid_precision_score.append(precision_score(y_fold_valid, y_valid_pred))
    valid_recall_score.append(recall_score(y_fold_valid, y_valid_pred))


print(f"Average Valid Accuracy: {np.mean(valid_accuracy_score)}") # Valid accuracy
print(f"Average Valid F1 Score: {np.mean(valid_f1_score)}") # Valid F1
print(f"Average Valid Precsion: {np.mean(valid_precision_score)}") # Valid Precision
print(f"Average Valid Recall: {np.mean(valid_recall_score)}") # Valid Recall

In [None]:
# Nested cross-validation of the preprocess -> SMOTE -> random-forest pipeline:
# cross_validate drives the outer folds, GridSearchCV tunes inside each one.
param_dist = {
    'rfc__n_estimators': [100, 300, 500],
    'rfc__max_depth': [10, 20, 30, None],
    'rfc__min_samples_split': [2, 5, 10],
    'rfc__min_samples_leaf': [1, 2, 4],
    'rfc__bootstrap': [True, False],
}

train_pipeline = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('rfc', RandomForestClassifier(random_state=42)),
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_rfc_search = GridSearchCV(
    estimator=train_pipeline,
    param_grid=param_dist,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)

scoring = {
    'accuracy': 'accuracy',
    'f1': 'f1',
    'precision': 'precision',
    'recall': 'recall',
}

# Outer loop: every fold runs its own grid search on the fold's training part.
cv_results = cross_validate(
    grid_rfc_search,
    X_train,
    y_train,
    cv=skf,
    scoring=scoring,
    return_train_score=False,
    return_estimator=True,
)

# Final search on the whole training set to obtain the model to keep.
grid_rfc_search.fit(X_train, y_train)
best_rfc_model = grid_rfc_search.best_estimator_
best_params = grid_rfc_search.best_params_

valid_accuracy_score = cv_results['test_accuracy']
valid_f1_score = cv_results['test_f1']
valid_precision_score = cv_results['test_precision']
valid_recall_score = cv_results['test_recall']

print(f"Best Parameters: {best_params}")
print(f"Average Valid Accuracy: {np.mean(valid_accuracy_score):.4f}")
print(f"Average Valid F1 Score: {np.mean(valid_f1_score):.4f}")
print(f"Average Valid Precision: {np.mean(valid_precision_score):.4f}")
print(f"Average Valid Recall: {np.mean(valid_recall_score):.4f}")

print("\nDetailed fold results:")
per_fold_scores = zip(
    valid_accuracy_score, valid_f1_score, valid_precision_score, valid_recall_score
)
for fold_no, (fold_acc, fold_f1, fold_prec, fold_rec) in enumerate(per_fold_scores, start=1):
    print(f"Fold {fold_no}: "
          f"Accuracy={fold_acc:.4f}, "
          f"F1={fold_f1:.4f}, "
          f"Precision={fold_prec:.4f}, "
          f"Recall={fold_rec:.4f}")

In [None]:
# Random forest inside the full pipeline, so preprocessing and SMOTE are
# re-fitted on the training part of every CV fold.
rfc_model_5_pipe = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('rfc', RandomForestClassifier(
        n_estimators=300,
        max_depth=35,
        min_samples_split=2,
        min_samples_leaf=4,
        bootstrap=False,
        random_state=42,
    )),
])

scoring = ['accuracy', 'f1', 'precision', 'recall']
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    estimator=rfc_model_5_pipe,
    X=X_train,
    y=y_train,
    cv=skf,
    scoring=scoring,
    n_jobs=-1,
)

# In-sample predictions on the raw (un-resampled) training rows.
y_rfc_model_5_pred = rfc_model_5_pipe.fit(X_train, y_train).predict(X_train)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_true=y_train, y_pred=y_rfc_model_5_pred)}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

In [None]:
# Extra-trees ensemble: 1000 trees, depth capped at 5.
extra_trees_model = ExtraTreesClassifier(n_estimators=1000, random_state=42, max_depth=5, min_samples_leaf=2, min_samples_split=2)

# Stratified 5-fold CV accuracy on the resampled training matrix.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_extra_tree_acc = cross_val_score(extra_trees_model, X_train_resampled, y_train_resampled, cv=skf)

# Fit on the whole resampled set and keep the in-sample predictions.
extra_trees_model.fit(X_train_resampled, y_train_resampled)
y_extra_trees_pred = extra_trees_model.predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_extra_trees_pred)}")
print(f"Validation Accuracy: {valid_extra_tree_acc.mean()}")

In [None]:
# Extra-trees inside the full pipeline, so preprocessing and SMOTE are
# re-fitted on the training part of every CV fold.
extra_trees_model_pipe = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('et', ExtraTreesClassifier(
        n_estimators=1000,
        max_depth=5,
        min_samples_leaf=2,
        min_samples_split=2,
        random_state=42,
    )),
])

scoring = ['accuracy', 'f1', 'precision', 'recall']
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    estimator=extra_trees_model_pipe,
    X=X_train,
    y=y_train,
    cv=skf,
    scoring=scoring,
    n_jobs=-1,
)

# In-sample predictions on the raw (un-resampled) training rows.
y_ext_pred = extra_trees_model_pipe.fit(X_train, y_train).predict(X_train)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_true=y_train, y_pred=y_ext_pred)}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

## Boosting

In [None]:
# Gradient boosting baseline: 100 depth-3 trees, learning rate 0.1.
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_gb = cross_val_score(gb_model, X_train_resampled, y_train_resampled, cv=skf)

y_gb_pred = gb_model.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_gb_pred)}")
print(f"Validation Accuracy: {valid_gb.mean()}")

In [None]:
# Out-of-fold predictions of gb_model and their confusion matrix.
y_vald_gb_pred = cross_val_predict(gb_model, X_train_resampled, y_train_resampled, cv=skf)
confusion_matrix(y_train_resampled, y_vald_gb_pred)

In [None]:
# Precision, recall and F1 of the out-of-fold gradient-boosting predictions.
percison_gb_score = precision_score(y_train_resampled, y_vald_gb_pred)
print(f"Precision Score: {percison_gb_score}")

recall_gb_score = recall_score(y_train_resampled, y_vald_gb_pred)
print(f"Recall Score: {recall_gb_score}")

f1_gb_score = f1_score(y_train_resampled, y_vald_gb_pred)
print(f"F1 Score: {f1_gb_score}")

In [None]:
# Class proportions of the resampled training labels.
y_train_resampled.value_counts(normalize=True)


In [None]:
# Gradient boosting #3: more, slightly deeper trees with a smaller learning
# rate, row subsampling and per-split feature subsampling.
gb_model_3 = GradientBoostingClassifier(
    n_estimators=300,           # More than 100, but not as heavy as 1000
    learning_rate=0.05,         # Lower learning rate to improve generalization
    max_depth=4,                # Slightly deeper to capture more patterns
    subsample=0.8,              # Stochastic GBM to speed up training
    max_features='sqrt',        # Use a subset of features for efficiency
    random_state=42
)

# Stratified 5-fold CV accuracy on the resampled training matrix.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_gb_3 = cross_val_score(gb_model_3, X_train_resampled, y_train_resampled, cv=skf)

# Fit on the whole resampled set and keep the in-sample predictions.
gb_model_3.fit(X_train_resampled, y_train_resampled)
y_gb_3_pred = gb_model_3.predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_gb_3_pred)}")
print(f"Validation Accuracy: {valid_gb_3.mean()}")


In [None]:
# Out-of-fold predictions of gb_model_3 and their confusion matrix.
y_vald_gb3_pred = cross_val_predict(gb_model_3, X_train_resampled, y_train_resampled, cv=skf)
confusion_matrix(y_train_resampled, y_vald_gb3_pred)

In [None]:
# Precision, recall and F1 of the out-of-fold gb_model_3 predictions.
percison_gb3_score = precision_score(y_train_resampled, y_vald_gb3_pred)
print(f"Precision Score: {percison_gb3_score}")

recall_gb3_score = recall_score(y_train_resampled, y_vald_gb3_pred)
print(f"Recall Score: {recall_gb3_score}")

f1_gb3_score = f1_score(y_train_resampled, y_vald_gb3_pred)
print(f"F1 Score: {f1_gb3_score}")

In [None]:
# Gradient boosting inside the full pipeline, so preprocessing and SMOTE are
# re-fitted on the training part of every CV fold.
gb_model_pipe = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('gb', GradientBoostingClassifier(
        n_estimators=300,       # more trees than the 100-tree baseline
        learning_rate=0.05,     # smaller step per tree
        max_depth=4,            # slightly deeper trees
        subsample=0.8,          # each tree sees 80% of the rows
        max_features='sqrt',    # feature subset considered at each split
        random_state=42,
    )),
])

scoring = ['accuracy', 'f1', 'precision', 'recall']
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    estimator=gb_model_pipe,
    X=X_train,
    y=y_train,
    cv=skf,
    scoring=scoring,
    n_jobs=-1,
)

# In-sample predictions on the raw (un-resampled) training rows.
y_gb_model_pred = gb_model_pipe.fit(X_train, y_train).predict(X_train)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_true=y_train, y_pred=y_gb_model_pred)}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

In [None]:
# AdaBoost baseline: 100 estimators, learning rate 0.5.
ada_boost_cl = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.5,
    random_state=42,
)

y_ada_cl_pred = ada_boost_cl.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_c1_ada_acc = cross_val_score(
    ada_boost_cl,
    X_train_resampled,
    y_train_resampled,
    cv=skf,
)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_ada_cl_pred)}")
print(f"Validation Accuracy: {valid_c1_ada_acc.mean()}")

In [None]:
# AdaBoost #2: 200 estimators with a smaller learning rate (0.3).
ada_boost = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.3,
    random_state=42
)
# Fit on the whole resampled set and keep the in-sample predictions.
ada_boost.fit(X_train_resampled, y_train_resampled)
y_ada_pred = ada_boost.predict(X_train_resampled)

# Stratified 5-fold CV accuracy on the resampled training matrix.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_ada_acc = cross_val_score(ada_boost, X_train_resampled, y_train_resampled, cv=skf)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_ada_pred)}")
print(f"Validation Accuracy: {valid_ada_acc.mean()}")


In [None]:
# Grid search over AdaBoost size, learning rate and boosting algorithm
# (5-fold CV, accuracy).
# NOTE(review): 'SAMME.R' is believed to be deprecated in scikit-learn 1.4 and
# removed in 1.6 -- confirm the installed version still accepts it.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

# Rebinds `ada_boost_cl` to a fresh, unfitted estimator for the search.
ada_boost_cl = AdaBoostClassifier(random_state=42)

grid_search = GridSearchCV(ada_boost_cl, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

best_ada_boost = grid_search.best_estimator_

# In-sample accuracy of the best configuration found by the search.
y_pred = best_ada_boost.predict(X_train_resampled)
train_accuracy = accuracy_score(y_train_resampled, y_pred)
print(f"Train Accuracy: {train_accuracy}")


In [None]:
# AdaBoost with explicitly chosen settings: 200 estimators, learning rate 1.0, SAMME.
ada_boost_grid = AdaBoostClassifier(
    algorithm='SAMME',
    n_estimators=200,
    learning_rate=1.0,
    random_state=42,
)
y_ada_grid_pred = ada_boost_grid.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_ada_grid_acc = cross_val_score(
    ada_boost_grid,
    X_train_resampled,
    y_train_resampled,
    cv=skf,
)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_ada_grid_pred)}")
print(f"Validation Accuracy: {valid_ada_grid_acc.mean()}")


In [None]:
# Out-of-fold predictions of ada_boost_grid and their confusion matrix.
y_vald_ada_pred = cross_val_predict(ada_boost_grid, X_train_resampled, y_train_resampled, cv=skf)
confusion_matrix(y_train_resampled, y_vald_ada_pred)

In [None]:
# Precision, recall and F1 of the out-of-fold AdaBoost predictions.
percison_ada_score = precision_score(y_train_resampled, y_vald_ada_pred)
print(f"Precision Score: {percison_ada_score}")

recall_ada_score = recall_score(y_train_resampled, y_vald_ada_pred)
print(f"Recall Score: {recall_ada_score}")

f1_ada_score = f1_score(y_train_resampled, y_vald_ada_pred)
print(f"F1 Score: {f1_ada_score}")

In [None]:
# AdaBoost inside the full pipeline, so preprocessing and SMOTE are re-fitted
# on the training part of every CV fold.
ada_model_pipe = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('ada', AdaBoostClassifier(
        n_estimators=200,
        learning_rate=0.3,
        random_state=42,
    )),
])

scoring = ['accuracy', 'f1', 'precision', 'recall']
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    estimator=ada_model_pipe,
    X=X_train,
    y=y_train,
    cv=skf,
    scoring=scoring,
    n_jobs=-1,
)

# In-sample predictions on the raw (un-resampled) training rows.
y_ada_model_pred = ada_model_pipe.fit(X_train, y_train).predict(X_train)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_true=y_train, y_pred=y_ada_model_pred)}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# XGBoost baseline: 100 depth-3 trees, learning rate 0.1.
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

y_xgb_pred = xgb_model.fit(X_train_resampled, y_train_resampled).predict(X_train_resampled)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_xg_acc = cross_val_score(
    xgb_model,
    X_train_resampled,
    y_train_resampled,
    cv=skf,
)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_xgb_pred)}")
print(f"Validation Accuracy: {valid_xg_acc.mean()}")

In [None]:
# Regularised XGBoost (row/column subsampling, gamma, L1/L2 penalties),
# trained for a fixed 300 rounds without early stopping.
xgb_model_no_early_stop = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    eval_metric='logloss'
)

# Stratified 5-fold CV accuracy on the resampled training matrix.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_xg_no_acc = cross_val_score(xgb_model_no_early_stop, X_train_resampled, y_train_resampled, cv=skf)

# Fit on the whole resampled set and keep the in-sample predictions.
xgb_model_no_early_stop.fit(X_train_resampled, y_train_resampled)
y_xgb_no_pred = xgb_model_no_early_stop.predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_xgb_no_pred)}")
print(f"Validation Accuracy: {valid_xg_no_acc.mean()}")

In [None]:
# Test-set predictions: hard labels and the predicted probability of class 1
# (column index 1 of predict_proba), using the already-transformed test matrix.
y_test_pred = xgb_model_no_early_stop.predict(X_test_preprocessor)
y_test_proba = xgb_model_no_early_stop.predict_proba(X_test_preprocessor)[:, 1]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_curve, roc_curve, auc

# Test-set accuracy plus the points and areas of the PR and ROC curves.
# NOTE: pr_auc and roc_auc are computed here but not displayed in this cell.

# Accuracy
test_acc = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_acc:.3f}")

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_test_proba)
pr_auc = auc(recall, precision)

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
roc_auc = auc(fpr, tpr)

In [None]:
# Out-of-fold predictions of xgb_model_no_early_stop and their confusion matrix.
y_vald_xgb_pred = cross_val_predict(xgb_model_no_early_stop, X_train_resampled, y_train_resampled, cv=skf)
confusion_matrix(y_train_resampled, y_vald_xgb_pred)

In [None]:
# Precision, recall and F1 of the out-of-fold XGBoost predictions.
percison_xgb_score = precision_score(y_train_resampled, y_vald_xgb_pred)
print(f"Precision Score: {percison_xgb_score}")

recall_xgb_score = recall_score(y_train_resampled, y_vald_xgb_pred)
print(f"Recall Score: {recall_xgb_score}")

f1_xgb_score = f1_score(y_train_resampled, y_vald_xgb_pred)
print(f"F1 Score: {f1_xgb_score}")

In [None]:
import xgboost
from xgboost import XGBClassifier
from sklearn.base import clone

# XGBoost inside the full pipeline, so preprocessing and SMOTE are re-fitted
# on the training part of every CV fold.
xgb_model_pipe = ImbPipeline([
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('xgb', XGBClassifier(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    eval_metric='logloss'
))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1', 'precision', 'recall']
cv_results = cross_validate(
    xgb_model_pipe,
    X_train,
    y_train,
    cv=skf,
    scoring=scoring,
    n_jobs=-1
)

# Hold-out check: X_val / y_val were split out of X_train, so they must be
# scored by a model that never saw them. Train an independent copy of the
# pipeline on X_train_real only and predict the validation rows with it.
holdout_pipe = clone(xgb_model_pipe)
holdout_pipe.fit(X_train_real, y_train_real)
y_val_pred = holdout_pipe.predict(X_val)

# The pipeline kept for later use is still trained on the full training set.
xgb_model_pipe.fit(X_train, y_train)
y_xgb_model_pred = xgb_model_pipe.predict(X_train)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_true=y_train, y_pred=y_xgb_model_pred)}")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

In [None]:
import xgboost
from xgboost import XGBClassifier

# Same preprocessing -> SMOTE -> XGBoost pipeline as before, but with early
# stopping: boosting stops once the eval-set log-loss has not improved for
# 10 consecutive rounds.
xgb_model_pipe = ImbPipeline([
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('xgb', XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=1,
        reg_alpha=0.1,
        reg_lambda=1,
        random_state=42,
        eval_metric='logloss',
        early_stopping_rounds=10  # Early stopping enabled
    ))
])

# `eval_set` is forwarded straight to XGBoost and bypasses the pipeline's
# preprocessing step, so the validation features must be transformed by hand.
# The preprocessor is fitted on the training data here explicitly: this
# pipeline object is new, and relying on `processor` having been fitted by an
# earlier cell makes the result depend on cell execution order.
X_val_preprocessed = (
    xgb_model_pipe.named_steps['preprocessor']
    .fit(X_train, y_train)
    .transform(X_val)
)

# Fit the model with early stopping monitored on the preprocessed validation set.
xgb_model_pipe.fit(
    X_train,
    y_train,
    xgb__eval_set=[(X_val_preprocessed, y_val)]
)

# Predictions on the train and validation splits.
y_xgb_model_pred = xgb_model_pipe.predict(X_train)
y_val_pred = xgb_model_pipe.predict(X_val)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_true=y_train, y_pred=y_xgb_model_pred)}")
# NOTE: X_val also drives early stopping, so this figure is optimistic.
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
# CV metrics are intentionally not printed: this cell runs no cross-validation,
# so `cv_results` would still hold the scores of whichever model was
# cross-validated last, not of this early-stopping pipeline.

In [None]:
from lightgbm import LGBMClassifier

# LightGBM baseline, trained and cross-validated on the resampled training set.
lgb_model = LGBMClassifier(
    n_estimators=150, learning_rate=0.05,
    max_depth=15, num_leaves=33, min_child_samples=19,
    subsample=0.7, colsample_bytree=0.8,
    random_state=42, verbose=-1,
)

# No `scoring` is given, so the classifier's default score (accuracy) is used.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
valid_lgb_acc = cross_val_score(
    estimator=lgb_model,
    X=X_train_resampled,
    y=y_train_resampled,
    cv=skf,
)

# Refit on all resampled training data and predict that same data
# (training accuracy, useful only as an overfitting indicator).
lgb_model.fit(X_train_resampled, y_train_resampled)
y_lgb_best_pred = lgb_model.predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_lgb_best_pred)}")
print(f"Validation Accuracy: {valid_lgb_acc.mean()}")

In [None]:
import optuna

# Objective function for optimization
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of a LightGBM classifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the stratified folds of the resampled training
        set (`X_train_resampled`, `y_train_resampled`, read from module scope).
    """
    # Suggest hyperparameters
    # NOTE(review): LightGBM only performs row subsampling when
    # `subsample_freq` > 0; with the default left untouched the sampled
    # `subsample` value may have no effect — confirm against the LightGBM docs.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }

    # verbose=-1 matches every other LGBMClassifier in this notebook and keeps
    # the trials x folds fits from flooding the output with LightGBM logs.
    model = LGBMClassifier(**params, random_state=42, verbose=-1)

    # Perform cross-validation with a fixed, shuffled stratified split so that
    # every trial is scored on the same folds.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=skf, scoring='accuracy')
    return scores.mean()

# Run optimization.
# The sampler is seeded so that repeated runs propose the same trials; without
# a seed the "best" parameters differ on every execution and cannot be
# reproduced.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Print best parameters
print("Best parameters:", study.best_params)
print("Best score:", study.best_value)


In [None]:
# LightGBM with a second, hand-entered hyperparameter set, evaluated the same
# way as the baseline (5-fold stratified CV on the resampled training data).
lgb_model_grid = LGBMClassifier(
    n_estimators=500, learning_rate=0.05,
    max_depth=12, num_leaves=83, min_child_samples=28,
    subsample=0.8, colsample_bytree=0.5,
    random_state=42, verbose=-1,
)

# No `scoring` is given, so the classifier's default score (accuracy) is used.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
valid_lgb_grid_acc = cross_val_score(
    estimator=lgb_model_grid,
    X=X_train_resampled,
    y=y_train_resampled,
    cv=skf,
)

# Refit on the full resampled training data and predict it back.
lgb_model_grid.fit(X_train_resampled, y_train_resampled)
y_lgb_grid_pred = lgb_model_grid.predict(X_train_resampled)

print(f"Train Accuracy: {accuracy_score(y_true=y_train_resampled, y_pred=y_lgb_grid_pred)}")
print(f"Validation Accuracy: {valid_lgb_grid_acc.mean()}")

In [None]:
# Out-of-fold predictions for the tuned LightGBM model over the folds of `skf`.
# NOTE(review): if X_train_resampled was oversampled before this CV split,
# synthetic samples can leak into the validation folds — confirm upstream.
y_vald_lgb_grid_pred = cross_val_predict(lgb_model_grid, X_train_resampled, y_train_resampled, cv=skf)
# Last expression of the cell, so the matrix is rendered as the cell output.
confusion_matrix(y_train_resampled, y_vald_lgb_grid_pred)

In [None]:
# Precision / recall / F1 of the out-of-fold LightGBM predictions, measured
# against the resampled training labels (arguments are y_true, y_pred).
percison_lgb_grid_score = precision_score(y_train_resampled, y_vald_lgb_grid_pred)
recall_lgb_grid_score = recall_score(y_train_resampled, y_vald_lgb_grid_pred)
f1_lgb_grid_score = f1_score(y_train_resampled, y_vald_lgb_grid_pred)

print(f"Precision Score: {percison_lgb_grid_score}")
print(f"Recall Score: {recall_lgb_grid_score}")
print(f"F1 Score: {f1_lgb_grid_score}")

In [None]:
from lightgbm import LGBMClassifier

# Pipeline: preprocessing -> SMOTE oversampling -> LightGBM.
# SMOTE is a pipeline step, so resampling happens inside each training fit
# rather than on the data handed to cross-validation.
lgbm_model_pipe = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),
    ('lgbm', LGBMClassifier(
        n_estimators=500, learning_rate=0.05,
        max_depth=6, num_leaves=31, min_child_samples=28,
        subsample=0.8, colsample_bytree=0.5,
        random_state=42, verbose=-1,
    )),
])

# Stratified 5-fold CV on the raw training split, collecting four metrics per fold.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
scoring = ['accuracy', 'f1', 'precision', 'recall']
cv_results = cross_validate(
    estimator=lgbm_model_pipe,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=skf,
    n_jobs=-1,
)

# Refit on the full training split, then predict train and validation sets.
lgbm_model_pipe.fit(X_train, y_train)
y_lgbm_model_pred = lgbm_model_pipe.predict(X_train)
y_val_pred = lgbm_model_pipe.predict(X_val)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_true=y_train, y_pred=y_lgbm_model_pred)}")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

## Ensembling (voting, stacking)

In [None]:
# Base learners for the soft-voting ensemble.
rfc_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=False,
    random_state=42,
)

lgbm_model = LGBMClassifier(
    n_estimators=500, learning_rate=0.05,
    max_depth=6, num_leaves=31, min_child_samples=28,
    subsample=0.8, colsample_bytree=0.5,
    random_state=42, verbose=-1,
)

xgb_model = XGBClassifier(
    n_estimators=250, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8,
    gamma=1, reg_alpha=0.1, reg_lambda=1,
    random_state=42, eval_metric='logloss',
)

# Soft voting: the ensemble combines the predicted class probabilities of the
# three models (equal weights, since no `weights` are given).
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[('rf', rfc_model), ('lgbm', lgbm_model), ('xgb', xgb_model)],
)

# Pipeline: preprocessing -> SMOTE oversampling -> voting ensemble.
voting_pipeline = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),
    ('voting', voting_clf),
])

# Stratified 5-fold CV on the raw training split, collecting four metrics per fold.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
scoring = ['accuracy', 'f1', 'precision', 'recall']
cv_results = cross_validate(
    estimator=voting_pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=skf,
    n_jobs=-1,
)

# Refit on the full training split, then predict train and validation sets.
voting_pipeline.fit(X_train, y_train)
y_train_pred = voting_pipeline.predict(X_train)
y_val_pred = voting_pipeline.predict(X_val)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

In [None]:
# Base learners for the weighted soft-voting ensemble that is trained directly
# on the pre-resampled training data (no pipeline).
rfc_model_5 = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=False)

lgb_model_grid = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    random_state=42,
    max_depth=12,
    num_leaves=83,
    min_child_samples=28,
    subsample=0.8,
    colsample_bytree=0.5,
    verbose=-1
)

xgb_model_no_early_stop = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    eval_metric='logloss'
)

# Soft voting with the random forest weighted three times as heavily as each
# of the two boosting models.
voting_clf = VotingClassifier(estimators=[
    ('rf', rfc_model_5),
    ('lgbm', lgb_model_grid),
    ('xgb', xgb_model_no_early_stop)
], voting='soft', weights=[0.6, 0.2, 0.2])

# 5-fold stratified CV; the per-fold score is F1 (scoring='f1'), despite the
# historical `_acc` suffix of the variable name.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_voting_acc = cross_val_score(voting_clf, X_train_resampled, y_train_resampled,
                                   cv=skf, scoring='f1', n_jobs=-1)

# Refit on all resampled training data and predict it back.
voting_clf.fit(X_train_resampled, y_train_resampled)
y_voting_pred = voting_clf.predict(X_train_resampled)

train_acc = accuracy_score(y_train_resampled, y_voting_pred)

print(f"Train Accuracy: {train_acc:.4f}")
# The CV score is F1, so it is labelled as F1 rather than accuracy.
print(f"Validation F1 (CV mean): {valid_voting_acc.mean():.4f}")


In [None]:
# Out-of-fold predictions for the voting ensemble over the folds of `skf`.
# NOTE(review): if X_train_resampled was oversampled before this CV split,
# synthetic samples can leak into the validation folds — confirm upstream.
y_vald_voting_pred = cross_val_predict(voting_clf, X_train_resampled, y_train_resampled, cv=skf)
# Last expression of the cell, so the matrix is rendered as the cell output.
confusion_matrix(y_train_resampled, y_vald_voting_pred)

In [None]:
# Precision / recall / F1 of the out-of-fold voting-ensemble predictions,
# measured against the resampled training labels (arguments are y_true, y_pred).
percison_voting_score = precision_score(y_train_resampled, y_vald_voting_pred)
recall_voting_score = recall_score(y_train_resampled, y_vald_voting_pred)
f1_voting_score = f1_score(y_train_resampled, y_vald_voting_pred)

print(f"Precision Score: {percison_voting_score}")
print(f"Recall Score: {recall_voting_score}")
print(f"F1 Score: {f1_voting_score}")

In [None]:
# Base learners for the stacking ensemble.
rfc_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=False,
    random_state=42,
)

lgbm_model = LGBMClassifier(
    n_estimators=500, learning_rate=0.05,
    max_depth=6, num_leaves=31, min_child_samples=28,
    subsample=0.8, colsample_bytree=0.5,
    random_state=42, verbose=-1,
)

xgb_model = XGBClassifier(
    n_estimators=250, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8,
    gamma=1, reg_alpha=0.1, reg_lambda=1,
    random_state=42, eval_metric='logloss',
)

base_models = [('xgb', xgb_model), ('rf', rfc_model), ('lgb', lgbm_model)]

# The meta-model is itself a soft-voting ensemble built from the same three
# estimator definitions; it is used as the stacking final estimator.
meta_model = VotingClassifier(voting='soft', estimators=base_models)

stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    n_jobs=-1,
)

# Pipeline: preprocessing -> SMOTE oversampling -> stacking ensemble.
stacking_pipeline = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),
    ('stacking', stacking_clf),
])

# Stratified 5-fold CV on the raw training split, collecting four metrics per fold.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
scoring = ['accuracy', 'f1', 'precision', 'recall']
cv_results = cross_validate(
    estimator=stacking_pipeline,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=skf,
    n_jobs=-1,
)

# Refit on the full training split, then predict train and validation sets.
stacking_pipeline.fit(X_train, y_train)
y_train_pred = stacking_pipeline.predict(X_train)
y_val_pred = stacking_pipeline.predict(X_val)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

In [None]:
# Base learners for the stacking ensemble.
rfc_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=False,
    random_state=42,
)

lgbm_model = LGBMClassifier(
    n_estimators=500, learning_rate=0.05,
    max_depth=6, num_leaves=31, min_child_samples=28,
    subsample=0.8, colsample_bytree=0.5,
    random_state=42, verbose=-1,
)

xgb_model = XGBClassifier(
    n_estimators=250, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8,
    gamma=1, reg_alpha=0.1, reg_lambda=1,
    random_state=42, eval_metric='logloss',
)

base_models = [('xgb', xgb_model), ('rf', rfc_model), ('lgb', lgbm_model)]

# Meta-model: a small XGBoost classifier used as the stacking final estimator.
meta_model = XGBClassifier(
    n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42,
)

stacking_clf_xgb = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    n_jobs=-1,
)

# Pipeline: preprocessing -> SMOTE oversampling -> stacking ensemble.
stacking_pipeline_xgb = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),
    ('stacking', stacking_clf_xgb),
])

# Stratified 5-fold CV on the raw training split, collecting four metrics per fold.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
scoring = ['accuracy', 'f1', 'precision', 'recall']
cv_results = cross_validate(
    estimator=stacking_pipeline_xgb,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=skf,
    n_jobs=-1,
)

# Refit on the full training split, then predict train and validation sets.
stacking_pipeline_xgb.fit(X_train, y_train)
y_train_pred = stacking_pipeline_xgb.predict(X_train)
y_val_pred = stacking_pipeline_xgb.predict(X_val)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

In [None]:
# Base learners for the stacking ensemble.
rfc_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=False,
    random_state=42,
)

lgbm_model = LGBMClassifier(
    n_estimators=500, learning_rate=0.05,
    max_depth=6, num_leaves=31, min_child_samples=28,
    subsample=0.8, colsample_bytree=0.5,
    random_state=42, verbose=-1,
)

xgb_model = XGBClassifier(
    n_estimators=250, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8,
    gamma=1, reg_alpha=0.1, reg_lambda=1,
    random_state=42, eval_metric='logloss',
)

base_models = [('xgb', xgb_model), ('rf', rfc_model), ('lgb', lgbm_model)]

# Meta-model: a LightGBM classifier with the same settings as the LightGBM
# base learner, used as the stacking final estimator.
meta_model = LGBMClassifier(
    n_estimators=500, learning_rate=0.05,
    max_depth=6, num_leaves=31, min_child_samples=28,
    subsample=0.8, colsample_bytree=0.5,
    random_state=42, verbose=-1,
)

stacking_clf_lgb = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    n_jobs=-1,
)

# Pipeline: preprocessing -> SMOTE oversampling -> stacking ensemble.
stacking_pipeline_lgb = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),
    ('stacking', stacking_clf_lgb),
])

# Stratified 5-fold CV on the raw training split, collecting four metrics per fold.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
scoring = ['accuracy', 'f1', 'precision', 'recall']
cv_results = cross_validate(
    estimator=stacking_pipeline_lgb,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=skf,
    n_jobs=-1,
)

# Refit on the full training split, then predict train and validation sets.
stacking_pipeline_lgb.fit(X_train, y_train)
y_train_pred = stacking_pipeline_lgb.predict(X_train)
y_val_pred = stacking_pipeline_lgb.predict(X_val)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

In [None]:
# Base learners for the stacking ensemble.
rfc_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=False,
    random_state=42,
)

lgbm_model = LGBMClassifier(
    n_estimators=500, learning_rate=0.05,
    max_depth=6, num_leaves=31, min_child_samples=28,
    subsample=0.8, colsample_bytree=0.5,
    random_state=42, verbose=-1,
)

xgb_model = XGBClassifier(
    n_estimators=250, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8,
    gamma=1, reg_alpha=0.1, reg_lambda=1,
    random_state=42, eval_metric='logloss',
)

base_models = [('xgb', xgb_model), ('rf', rfc_model), ('lgb', lgbm_model)]

# Meta-model: logistic regression with library defaults, used as the stacking
# final estimator.
meta_model = LogisticRegression()

stacking_clf_log = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    n_jobs=-1,
)

# Pipeline: preprocessing -> SMOTE oversampling -> stacking ensemble.
stacking_pipeline_log = ImbPipeline(steps=[
    ('preprocessor', processor),
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),
    ('stacking', stacking_clf_log),
])

# Stratified 5-fold CV on the raw training split, collecting four metrics per fold.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
scoring = ['accuracy', 'f1', 'precision', 'recall']
cv_results = cross_validate(
    estimator=stacking_pipeline_log,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=skf,
    n_jobs=-1,
)

# Refit on the full training split, then predict train and validation sets.
stacking_pipeline_log.fit(X_train, y_train)
y_train_pred = stacking_pipeline_log.predict(X_train)
y_val_pred = stacking_pipeline_log.predict(X_val)

print("\nPerformance Metrics:")
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"CV Accuracy: {np.mean(cv_results['test_accuracy']):.4f}")
print(f"CV F1: {np.mean(cv_results['test_f1']):.4f}")
print(f"CV Precision: {np.mean(cv_results['test_precision']):.4f}")
print(f"CV Recall: {np.mean(cv_results['test_recall']):.4f}")

In [None]:
# Bare expression: renders the stacking pipeline's repr as the cell output.
stacking_pipeline_log

In [None]:
# Stacking ensemble trained directly on the pre-resampled training data
# (no pipeline), reusing the estimators defined for the weighted voting model.
base_models = [
    ('xgb', xgb_model_no_early_stop),
    ('rf', rfc_model_5),
    ('lgb', lgb_model_grid)
]

# Meta-model: a small LightGBM classifier used as the stacking final estimator.
meta_model = LGBMClassifier(n_estimators=100, learning_rate=0.05, random_state=42)


# Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, n_jobs=-1)

# 5-fold stratified CV; the per-fold score is F1 (scoring='f1'), despite the
# historical `_acc` suffix of the variable name.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_stacking_acc = cross_val_score(stacking_clf, X_train_resampled, y_train_resampled,
                                     cv=skf, scoring='f1', n_jobs=-1)

# Refit on all resampled training data and predict it back.
stacking_clf.fit(X_train_resampled, y_train_resampled)
y_stacking_pred = stacking_clf.predict(X_train_resampled)

train_stacking_lgb_acc = accuracy_score(y_train_resampled, y_stacking_pred)

print(f"Train Accuracy: {train_stacking_lgb_acc:.4f}")
# The CV score is F1, so it is labelled as F1 rather than accuracy.
print(f"Validation F1 (CV mean): {valid_stacking_acc.mean():.4f}")

In [None]:
# Out-of-fold predictions for the stacking ensemble over the folds of `skf`.
# NOTE(review): if X_train_resampled was oversampled before this CV split,
# synthetic samples can leak into the validation folds — confirm upstream.
y_vald_stacking_pred = cross_val_predict(stacking_clf, X_train_resampled, y_train_resampled, cv=skf)
# Last expression of the cell, so the matrix is rendered as the cell output.
confusion_matrix(y_train_resampled, y_vald_stacking_pred)

In [None]:
# Precision / recall / F1 of the out-of-fold stacking predictions, measured
# against the resampled training labels (arguments are y_true, y_pred).
percison_stacking_score = precision_score(y_train_resampled, y_vald_stacking_pred)
recall_stacking_score = recall_score(y_train_resampled, y_vald_stacking_pred)
f1_stacking_score = f1_score(y_train_resampled, y_vald_stacking_pred)

print(f"Precision Score: {percison_stacking_score}")
print(f"Recall Score: {recall_stacking_score}")
print(f"F1 Score: {f1_stacking_score}")

In [None]:
# Stacking ensemble with a logistic-regression meta-model, trained directly on
# the pre-resampled training data (no pipeline).
base_models = [
    ('xgb', xgb_model_no_early_stop),
    ('rf', rfc_model_5),
    ('lgb', lgb_model_grid)
]

meta_model = LogisticRegression(C=1, random_state=42, max_iter=1000)

# Stacking Classifier
stacking_clf_log = StackingClassifier(estimators=base_models, final_estimator=meta_model, n_jobs=-1)

# 5-fold stratified CV; the per-fold score is F1 (scoring='f1'), despite the
# historical `_acc` suffix of the variable name.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
valid_stacking_log_acc = cross_val_score(stacking_clf_log, X_train_resampled, y_train_resampled,
                                     cv=skf, scoring='f1', n_jobs=-1)

# Refit on all resampled training data and predict it back.
stacking_clf_log.fit(X_train_resampled, y_train_resampled)
y_stacking_log_pred = stacking_clf_log.predict(X_train_resampled)

train_stacking_log_acc = accuracy_score(y_train_resampled, y_stacking_log_pred)

print(f"Train Accuracy: {train_stacking_log_acc:.4f}")
# The CV score is F1, so it is labelled as F1 rather than accuracy.
print(f"Validation F1 (CV mean): {valid_stacking_log_acc.mean():.4f}")

In [None]:
# Out-of-fold predictions for the logistic-regression stacking ensemble over
# the folds of `skf`.
# NOTE(review): if X_train_resampled was oversampled before this CV split,
# synthetic samples can leak into the validation folds — confirm upstream.
y_vald_stacking_log_pred = cross_val_predict(stacking_clf_log, X_train_resampled, y_train_resampled, cv=skf)
# Last expression of the cell, so the matrix is rendered as the cell output.
confusion_matrix(y_train_resampled, y_vald_stacking_log_pred)

In [None]:
# Precision / recall / F1 of the out-of-fold predictions of the
# logistic-regression stacking ensemble (arguments are y_true, y_pred).
percison_stacking_log_score = precision_score(y_train_resampled, y_vald_stacking_log_pred)
recall_stacking_log_score = recall_score(y_train_resampled, y_vald_stacking_log_pred)
f1_stacking_log_score = f1_score(y_train_resampled, y_vald_stacking_log_pred)

print(f"Precision Score: {percison_stacking_log_score}")
print(f"Recall Score: {recall_stacking_log_score}")
print(f"F1 Score: {f1_stacking_log_score}")

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV F1 of a stacking ensemble.

    Samples hyperparameters for the three base models (XGBoost, random
    forest, LightGBM) and for the logistic-regression meta-model, builds a
    StackingClassifier from them and cross-validates it on the resampled
    training set (`X_train_resampled`, `y_train_resampled`, read from
    module scope).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean F1 score over the stratified folds.
    """
    # Base models hyperparameters
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0)
    }

    rf_params = {
        'n_estimators': trial.suggest_int('rf_n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('rf_max_depth', 5, 15),
        'min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 10)
    }

    lgb_params = {
        'n_estimators': trial.suggest_int('lgb_n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('lgb_num_leaves', 31, 150)
    }

    # Meta-model parameters
    logreg_params = {
        'C': trial.suggest_float('logreg_C', 0.01, 10.0),
        'max_iter': trial.suggest_int('logreg_max_iter', 100, 1000)
    }

    # Build models
    xgb = XGBClassifier(**xgb_params, random_state=42)
    rf = RandomForestClassifier(**rf_params, random_state=42)
    # verbose=-1 matches the other LGBMClassifier instances in this notebook
    # and keeps the many fits per trial from flooding the output with logs.
    lgb = LGBMClassifier(**lgb_params, random_state=42, verbose=-1)

    base_models = [
        ('xgb', xgb),
        ('rf', rf),
        ('lgb', lgb)
    ]

    meta_model = LogisticRegression(**logreg_params, random_state=42)

    stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, n_jobs=-1)

    # Fixed, shuffled stratified split so every trial is scored on the same folds.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1 = cross_val_score(stacking_clf, X_train_resampled, y_train_resampled, cv=skf, scoring='f1', n_jobs=-1).mean()

    return f1

In [None]:
# Search the stacking hyperparameters, maximising the objective's F1 score.
# The sampler is seeded so that repeated runs propose the same trials; without
# a seed the "best" parameters differ on every execution and cannot be
# reproduced.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Print best results
print("Best F1 Score:", study.best_value)
print("Best Params:", study.best_params)

In [None]:
# Rebuild each model from the best trial of the stacking study. Parameter
# names in `best_params` carry a per-model prefix (xgb_, rf_, lgb_, logreg_).
best_params = study.best_params

xgb_best = XGBClassifier(
    n_estimators=best_params['xgb_n_estimators'],
    max_depth=best_params['xgb_max_depth'],
    learning_rate=best_params['xgb_learning_rate'],
    subsample=best_params['xgb_subsample'],
    random_state=42,
)

rf_best = RandomForestClassifier(
    n_estimators=best_params['rf_n_estimators'],
    max_depth=best_params['rf_max_depth'],
    min_samples_split=best_params['rf_min_samples_split'],
    random_state=42,
)

lgb_best = LGBMClassifier(
    n_estimators=best_params['lgb_n_estimators'],
    learning_rate=best_params['lgb_learning_rate'],
    num_leaves=best_params['lgb_num_leaves'],
    random_state=42,
)

meta_best = LogisticRegression(
    C=best_params['logreg_C'],
    max_iter=best_params['logreg_max_iter'],
    random_state=42,
)

# Final stacking model: tuned base learners plus the tuned logistic-regression
# meta-model, fitted on the whole resampled training set.
final_stacking_clf = StackingClassifier(
    estimators=[('xgb', xgb_best), ('rf', rf_best), ('lgb', lgb_best)],
    final_estimator=meta_best,
    n_jobs=-1,
)
final_stacking_clf.fit(X_train_resampled, y_train_resampled)

print("Final Model Trained with Optimized Hyperparameters!")

In [None]:
# Predictions of the fitted final model on its own training data
# (training accuracy, useful only as an overfitting indicator).
y_stacking_final_pred = final_stacking_clf.predict(X_train_resampled)
# Per-fold F1 (scoring='f1') over the folds of `skf`, despite the historical
# `_acc` suffix of the variable name.
valid_stacking_final_acc = cross_val_score(final_stacking_clf, X_train_resampled, y_train_resampled,
                                     cv=skf, scoring='f1', n_jobs=-1)
# Out-of-fold predictions, used for the pooled precision / recall / F1 below.
y_vald_stacking_final_pred = cross_val_predict(final_stacking_clf, X_train_resampled, y_train_resampled, cv=skf)
train_stacking_final_acc = accuracy_score(y_train_resampled, y_stacking_final_pred)
percison_stacking_final_score = precision_score(y_true=y_train_resampled, y_pred=y_vald_stacking_final_pred)
recall_stacking_final_score = recall_score(y_true=y_train_resampled, y_pred=y_vald_stacking_final_pred)
f1_stacking_final_score = f1_score(y_true=y_train_resampled, y_pred=y_vald_stacking_final_pred)

print(f"Precision Score: {percison_stacking_final_score}")
print(f"Recall Score: {recall_stacking_final_score}")
print(f"F1 Score: {f1_stacking_final_score}")
print(f"Train Accuracy: {train_stacking_final_acc:.4f}")
# The CV score is F1, so it is labelled as F1 rather than accuracy.
print(f"Validation F1 (CV mean): {valid_stacking_final_acc.mean():.4f}")

# Oversampling vs. undersampling

In [None]:
def evaluate_model(model, sampler, X_train, y_train, X_test, y_test):
    """Train preprocessing + sampler + classifier and score it on the test split.

    An ``ImbPipeline`` is assembled from the module-level ``processor``, the
    given ``sampler`` and the given ``model`` (in that order), fitted on
    ``X_train``/``y_train`` and then evaluated on ``X_test``/``y_test``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)``. ROC-AUC is
        computed from column 1 of ``predict_proba``; the other metrics use
        the hard predictions.
    """
    steps = [
        ('preprocessor', processor),
        ('sampler', sampler),
        ('classifier', model),
    ]
    fitted = ImbPipeline(steps).fit(X_train, y_train)

    # Hard labels for the threshold-based metrics, scores for ROC-AUC.
    test_labels = fitted.predict(X_test)
    test_scores = fitted.predict_proba(X_test)[:, 1]

    return (
        accuracy_score(y_test, test_labels),
        precision_score(y_test, test_labels),
        recall_score(y_test, test_labels),
        f1_score(y_test, test_labels),
        roc_auc_score(y_test, test_scores),
    )

In [None]:
# Candidate models and the two resampling strategies to compare.
models = [xgb_model_no_early_stop, rfc_model_5, lgb_model_grid, voting_clf, stacking_clf]
sampler_names = ['SMOTE', 'Undersampling']
samplers = [
    SMOTE(random_state=42, sampling_strategy=0.5),
    RandomUnderSampler(random_state=42)
]

# Column names for the metrics, in the order evaluate_model returns them.
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC']

# One result row per (model, sampler) combination.
results = []
for model in models:
    for sampler_name, sampler in zip(sampler_names, samplers):
        scores = evaluate_model(model, sampler, X_train, y_train, X_test, y_test)
        row = {'Model': type(model).__name__, 'Sampler': sampler_name}
        row.update(zip(metric_columns, scores))
        results.append(row)

# Tabulate the comparison; the bare name displays it in the notebook.
results_df = pd.DataFrame(results)
results_df

# models comparison

In [None]:
# Models to compare, keyed by the name shown in the plot legends.
models = {
    'Random Forest': rfc_model_5,
    'lgbm': lgb_model_grid,
    'XGBoost': xgb_model_no_early_stop,
    'Voting': voting_clf,
    'Stacking_lgb': stacking_clf_log

}

# (Re)fit every model on the resampled training data so that all curves
# below come from models trained on the same data.
for name, model in models.items():
    if not hasattr(model, "fit"):  # Skip anything that cannot be fitted
        continue
    print(f"Training {name}...")
    model.fit(X_train_resampled, y_train_resampled)

print("All models trained. Proceeding with evaluation.")

# Colors for plotting (one per model, matched by position)
colors = ['b', 'g', 'r', 'c', 'm']

# Score the test set once per model; both subplots reuse these
# positive-class probabilities instead of calling predict_proba twice.
test_probas = {
    name: model.predict_proba(X_test_preprocessor)[:, 1]
    for name, model in models.items()
}

# Figure setup: ROC on the left, precision-recall on the right.
plt.figure(figsize=(12, 5))

### Step 1: Plot ROC Curves ###
plt.subplot(1, 2, 1)
for (name, y_proba), color in zip(test_probas.items(), colors):
    # Compute ROC curve and the area under it
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, color=color, lw=2, label=f"{name} (AUC = {roc_auc:.3f})")

# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()

### Step 2: Plot PR Curves ###
plt.subplot(1, 2, 2)
for (name, y_proba), color in zip(test_probas.items(), colors):
    # Compute Precision-Recall curve and the area under it
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    pr_auc = auc(recall, precision)

    plt.plot(recall, precision, color=color, lw=2, label=f"{name} (AUC = {pr_auc:.3f})")

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.show()

In [None]:
# Out-of-fold class probabilities (3-fold CV) for each model, computed on the
# resampled training data.
proba_cv_kwargs = dict(cv=3, method='predict_proba')
y_valid_voting_prob = cross_val_predict(voting_clf, X_train_resampled, y_train_resampled, **proba_cv_kwargs)
y_valid_stacking_prob = cross_val_predict(stacking_clf_log, X_train_resampled, y_train_resampled, **proba_cv_kwargs)
y_valid_xgb_prob = cross_val_predict(xgb_model_no_early_stop, X_train_resampled, y_train_resampled, **proba_cv_kwargs)
y_valid_rf_prob = cross_val_predict(rfc_model_5, X_train_resampled, y_train_resampled, **proba_cv_kwargs)
y_valid_lgb_prob = cross_val_predict(lgb_model_grid, X_train_resampled, y_train_resampled, **proba_cv_kwargs)
y_valid_gb_prob = cross_val_predict(gb_model_3, X_train_resampled, y_train_resampled, **proba_cv_kwargs)

# Precision-recall curves from the positive-class column of each matrix.
precision_rf, recall_rf, rf_thresholds = precision_recall_curve(
    y_train_resampled, y_valid_rf_prob[:, 1])
precision_xgb, recall_xgb, xgb_thresholds = precision_recall_curve(
    y_train_resampled, y_valid_xgb_prob[:, 1])
precision_lgb, recall_lgb, lgb_thresholds = precision_recall_curve(
    y_train_resampled, y_valid_lgb_prob[:, 1])
precision_voting, recall_voting, voting_thresholds = precision_recall_curve(
    y_train_resampled, y_valid_voting_prob[:, 1])
precision_stacking, recall_stacking, stacking_thresholds = precision_recall_curve(
    y_train_resampled, y_valid_stacking_prob[:, 1])
precision_gb, recall_gb, gb_thresholds = precision_recall_curve(
    y_train_resampled, y_valid_gb_prob[:, 1])

# (recall, precision, legend label, line colour) per model, in legend order.
pr_series = [
    (recall_rf, precision_rf, 'Random Forest', 'red'),
    (recall_xgb, precision_xgb, 'XGBoost', 'green'),
    (recall_lgb, precision_lgb, 'LightGBM', 'purple'),
    (recall_voting, precision_voting, 'Voting Classifier', 'orange'),
    (recall_stacking, precision_stacking, 'Stacking Classifier', 'brown'),
    (recall_gb, precision_gb, 'GB Classifier', 'yellow'),
]

plt.figure(figsize=(8, 6))
for curve_recall, curve_precision, curve_label, curve_color in pr_series:
    # The final curve point has no associated threshold, so it is not drawn.
    plt.plot(curve_recall[:-1], curve_precision[:-1], label=curve_label, color=curve_color)

plt.title('Precision-Recall Curves for Different Models')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.grid()
plt.show()

In [None]:
# ROC curve and area under it for each model, from the out-of-fold
# positive-class probabilities on the resampled training data.
fpr_rf, tpr_rf, _ = roc_curve(y_train_resampled, y_valid_rf_prob[:, 1])
auc_rf = auc(fpr_rf, tpr_rf)

fpr_xgb, tpr_xgb, _ = roc_curve(y_train_resampled, y_valid_xgb_prob[:, 1])
auc_xgb = auc(fpr_xgb, tpr_xgb)

fpr_lgb, tpr_lgb, _ = roc_curve(y_train_resampled, y_valid_lgb_prob[:, 1])
auc_lgb = auc(fpr_lgb, tpr_lgb)

fpr_voting, tpr_voting, _ = roc_curve(y_train_resampled, y_valid_voting_prob[:, 1])
auc_voting = auc(fpr_voting, tpr_voting)

fpr_stacking, tpr_stacking, _ = roc_curve(y_train_resampled, y_valid_stacking_prob[:, 1])
auc_stacking = auc(fpr_stacking, tpr_stacking)

fpr_gb, tpr_gb, _ = roc_curve(y_train_resampled, y_valid_gb_prob[:, 1])
auc_gb = auc(fpr_gb, tpr_gb)

# (legend name, FPR, TPR, AUC, line colour) per model, in legend order.
roc_series = [
    ('Random Forest', fpr_rf, tpr_rf, auc_rf, 'red'),
    ('XGBoost', fpr_xgb, tpr_xgb, auc_xgb, 'green'),
    ('LightGBM', fpr_lgb, tpr_lgb, auc_lgb, 'purple'),
    ('Voting', fpr_voting, tpr_voting, auc_voting, 'orange'),
    ('Stacking', fpr_stacking, tpr_stacking, auc_stacking, 'brown'),
    ('Gb', fpr_gb, tpr_gb, auc_gb, 'yellow'),
]

plt.figure(figsize=(8, 6))
for curve_name, curve_fpr, curve_tpr, curve_auc, curve_color in roc_series:
    plt.plot(curve_fpr, curve_tpr, label=f'{curve_name} (AUC = {curve_auc:.3f})', color=curve_color)

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')

plt.title('ROC Curves for Different Models')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()


In [None]:
# Out-of-fold class probabilities (3-fold CV) for each full pipeline on the
# original (un-resampled) training split.
y_valid_voting_prob = cross_val_predict(voting_pipeline, X_train, y_train, cv=3, method='predict_proba')
y_valid_stacking_xgb_prob = cross_val_predict(stacking_pipeline_xgb, X_train, y_train, cv=3, method='predict_proba')
y_valid_stacking_log_prob = cross_val_predict(stacking_pipeline_log, X_train, y_train, cv=3, method='predict_proba')
y_valid_xgb_prob = cross_val_predict(xgb_model_pipe, X_train, y_train, cv=3, method='predict_proba')
y_valid_rf_prob = cross_val_predict(rfc_model_5_pipe, X_train, y_train, cv=3, method='predict_proba')
y_valid_lgb_prob = cross_val_predict(lgbm_model_pipe, X_train, y_train, cv=3, method='predict_proba')
y_valid_gb_prob = cross_val_predict(gb_model_pipe, X_train, y_train, cv=3, method='predict_proba')

# Precision-recall curves from the positive-class column of each matrix.
precision_rf, recall_rf, rf_thresholds = precision_recall_curve(y_train, y_valid_rf_prob[:, 1])
precision_xgb, recall_xgb, xgb_thresholds = precision_recall_curve(y_train, y_valid_xgb_prob[:, 1])
precision_lgb, recall_lgb, lgb_thresholds = precision_recall_curve(y_train, y_valid_lgb_prob[:, 1])
precision_voting, recall_voting, voting_thresholds = precision_recall_curve(y_train, y_valid_voting_prob[:, 1])
# Each stacking curve must use its own pipeline's probabilities: the "_xgb"
# variables come from stacking_pipeline_xgb and the plain "stacking"
# variables from stacking_pipeline_log (same mapping as the ROC cell).
precision_stacking_xgb, recall_stacking_xgb, stacking_thresholds_xgb = precision_recall_curve(y_train, y_valid_stacking_xgb_prob[:, 1])
precision_stacking, recall_stacking, stacking_thresholds = precision_recall_curve(y_train, y_valid_stacking_log_prob[:, 1])
precision_gb, recall_gb, gb_thresholds = precision_recall_curve(y_train, y_valid_gb_prob[:, 1])

# The final curve point has no associated threshold, hence the [:-1] slices.
plt.figure(figsize=(8, 6))
plt.plot(recall_rf[:-1], precision_rf[:-1], label='Random Forest', color='red')
plt.plot(recall_xgb[:-1], precision_xgb[:-1], label='XGBoost', color='green')
plt.plot(recall_lgb[:-1], precision_lgb[:-1], label='LightGBM', color='purple')
plt.plot(recall_voting[:-1], precision_voting[:-1], label='Voting Classifier', color='orange')
# Distinct legend entries so the two stacking variants can be told apart.
plt.plot(recall_stacking[:-1], precision_stacking[:-1], label='Stacking Classifier (log)', color='brown')
plt.plot(recall_stacking_xgb[:-1], precision_stacking_xgb[:-1], label='Stacking Classifier (xgb)', color='blue')
plt.plot(recall_gb[:-1], precision_gb[:-1], label='GB Classifier', color='yellow')


plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves for Different Models')
plt.legend()
plt.grid()
plt.show()

In [None]:
# ROC curves from the out-of-fold positive-class probabilities of each
# pipeline on the original training split.
fpr_rf, tpr_rf, _ = roc_curve(y_train, y_valid_rf_prob[:, 1])
fpr_xgb, tpr_xgb, _ = roc_curve(y_train, y_valid_xgb_prob[:, 1])
fpr_lgb, tpr_lgb, _ = roc_curve(y_train, y_valid_lgb_prob[:, 1])
fpr_voting, tpr_voting, _ = roc_curve(y_train, y_valid_voting_prob[:, 1])
fpr_stacking, tpr_stacking, _ = roc_curve(y_train, y_valid_stacking_log_prob[:, 1])
fpr_stacking_xgb, tpr_stacking_xgb, _ = roc_curve(y_train, y_valid_stacking_xgb_prob[:, 1])
fpr_gb, tpr_gb, _ = roc_curve(y_train, y_valid_gb_prob[:, 1])


# Area under each ROC curve, shown in the legend.
auc_rf = auc(fpr_rf, tpr_rf)
auc_xgb = auc(fpr_xgb, tpr_xgb)
auc_lgb = auc(fpr_lgb, tpr_lgb)
auc_voting = auc(fpr_voting, tpr_voting)
auc_stacking = auc(fpr_stacking, tpr_stacking)
auc_stacking_xgb = auc(fpr_stacking_xgb, tpr_stacking_xgb)
auc_gb = auc(fpr_gb, tpr_gb)


plt.figure(figsize=(8, 6))
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {auc_rf:.3f})', color='red')
plt.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC = {auc_xgb:.3f})', color='green')
plt.plot(fpr_lgb, tpr_lgb, label=f'LightGBM (AUC = {auc_lgb:.3f})', color='purple')
plt.plot(fpr_voting, tpr_voting, label=f'Voting (AUC = {auc_voting:.3f})', color='orange')
# The two stacking pipelines get distinct legend names; previously both were
# labelled just "Stacking" and could only be told apart by colour.
plt.plot(fpr_stacking, tpr_stacking, label=f'Stacking-log (AUC = {auc_stacking:.3f})', color='brown')
plt.plot(fpr_stacking_xgb, tpr_stacking_xgb, label=f'Stacking-xgb (AUC = {auc_stacking_xgb:.3f})', color='blue')
plt.plot(fpr_gb, tpr_gb, label=f'Gb (AUC = {auc_gb:.3f})', color='yellow')

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Different Models')
plt.legend()
plt.grid()
plt.show()


# ROC, precision and recall Curve

In [None]:
# Out-of-fold hard predictions of the voting pipeline (3-fold CV; 'predict'
# is cross_val_predict's default method), summarised as a confusion matrix.
y_valid_voting_pred = cross_val_predict(voting_pipeline, X_train, y_train, cv=3)
confusion_matrix(y_true=y_train, y_pred=y_valid_voting_pred)

In [None]:
# Cross-validated precision / recall / F1 of the voting pipeline.
# These metrics need hard class labels, so they use the out-of-fold
# predictions (y_valid_voting_pred), not the two-column probability matrix
# y_valid_voting_prob, which is not a valid y_pred for these scorers.
percison_rfc5_score = precision_score(y_true=y_train, y_pred=y_valid_voting_pred)
recall_rfc5_score = recall_score(y_true=y_train, y_pred=y_valid_voting_pred)
f1_rfc5_score = f1_score(y_true=y_train, y_pred=y_valid_voting_pred)
print(f"Precision Score: {percison_rfc5_score}")
print(f"Recall Score: {recall_rfc5_score}")
print(f"F1 Score: {f1_rfc5_score}")

In [None]:
# Shuffled, stratified 5-fold splitter.
# NOTE(review): skf is defined here, but the cross_val_predict call below
# still uses cv=3 rather than cv=skf - confirm which was intended.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold class probabilities of the voting pipeline.
y_valid_voting_prob = cross_val_predict(
    voting_pipeline, X_train, y_train, cv=3, method='predict_proba'
)

# Precision-recall curve from the positive-class column.
precision_voting, recall_voting, voting_thresholds = precision_recall_curve(
    y_train, y_valid_voting_prob[:, 1]
)

plt.figure(figsize=(8, 6))
plt.plot(recall_voting, precision_voting, color='orange', label='voting Classifier')
plt.title('Precision-Recall Curve for voting Model')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Positive-class scores from the out-of-fold voting probabilities.
y_scores = y_valid_voting_prob[:, 1]

# Compute precision, recall, and thresholds
precision_voting, recall_voting, voting_thresholds = precision_recall_curve(y_train, y_scores)

# Calculate the F1 score for each threshold in one vectorised pass.
# precision_voting[i] / recall_voting[i] describe the predictions
# `y_scores >= voting_thresholds[i]`; the extra trailing curve point has no
# threshold, so it is dropped. This replaces one f1_score call per threshold
# (quadratic in the number of samples) with simple array arithmetic.
precision_at_t = precision_voting[:-1]
recall_at_t = recall_voting[:-1]
pr_sum = precision_at_t + recall_at_t
f1_scores = np.divide(
    2 * precision_at_t * recall_at_t,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),  # F1 = 0 where precision + recall = 0
    where=pr_sum > 0,
)

# Find the threshold that maximizes the F1 score
best_threshold_idx = np.argmax(f1_scores)
best_threshold = voting_thresholds[best_threshold_idx]
best_f1 = f1_scores[best_threshold_idx]

print(f"Best Threshold (F1 Score): {best_threshold:.4f}")
print(f"Best F1 Score: {best_f1:.4f}")

In [None]:
# Hard predictions obtained by cutting the scores at the F1-optimal threshold.
y_pred_best = (y_scores >= best_threshold).astype(int)

# Compute precision and recall at the best threshold.
# Recall must compare the labels with the predictions; scoring y_train
# against itself would always report 1.0.
precision_best = precision_score(y_train, y_pred_best)
recall_best = recall_score(y_train, y_pred_best)

print(f"Precision at Best Threshold: {precision_best:.4f}")
print(f"Recall at Best Threshold: {recall_best:.4f}")

In [None]:
# Precision-recall curve of the voting model.
plt.figure(figsize=(8, 6))
plt.plot(recall_voting, precision_voting, label='Precision-Recall Curve')
plt.title('Precision-Recall Curve for Voting Model')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid(True)
plt.legend()

# Mark the curve point that corresponds to the F1-maximising threshold.
best_point_recall = recall_voting[best_threshold_idx]
best_point_precision = precision_voting[best_threshold_idx]
plt.scatter(
    [best_point_recall],
    [best_point_precision],
    color='red',
    s=100,
    label=f'Best Threshold (F1={best_f1:.2f})',
)
plt.legend()  # redraw the legend so it includes the marker
plt.show()

In [None]:
# Display the recall value of the PR curve at index best_threshold_idx.
recall_voting[best_threshold_idx]

In [None]:
# Pick the decision threshold whose recall is closest to a target value and
# report the resulting metrics on the out-of-fold predictions.
precision_voting, recall_voting, thresholds = precision_recall_curve(y_train, y_scores)

target_recall = 0.42
# recall_voting has one more entry than thresholds (the final curve point has
# no threshold). Searching only recall_voting[:-1] guarantees the index is
# valid for thresholds.
threshold_index = np.argmin(np.abs(recall_voting[:-1] - target_recall))
best_recal_threshold = thresholds[threshold_index]

print(f"Threshold for recall ~ {target_recall}: {best_recal_threshold:.4f}")

# Hard predictions at the selected threshold.
y_pred = (y_valid_voting_prob[:, 1] >= best_recal_threshold).astype(int)

conf_matrix = confusion_matrix(y_train, y_pred)
precision = precision_score(y_train, y_pred)
recall = recall_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
conf_matrix

In [None]:
# Among the thresholds whose recall is at least min_recall, pick the one with
# the highest precision.
min_recall = 0.42

# Only the first len(thresholds) curve points have a threshold, so the
# trailing point is excluded to keep optimal_idx valid for thresholds.
# Ineligible points score 0, so if nothing qualifies index 0 is selected.
eligible = recall_voting[:-1] >= min_recall
optimal_idx = int(np.argmax(np.where(eligible, precision_voting[:-1], 0.0)))
optimal_threshold = thresholds[optimal_idx]

# The message reports the recall floor actually used for the search.
print(f"Optimal Threshold for Recall ≥ {min_recall}: {optimal_threshold:.2f}")
print(f"Precision at this threshold: {precision_voting[optimal_idx]:.2f}")
print(f"Recall at this threshold: {recall_voting[optimal_idx]:.2f}")

In [None]:
# Render the confusion matrix computed at the target-recall threshold.
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=[0, 1],
)
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix at Recall = {target_recall}')
plt.show()

# Test Performance

In [None]:
# Final pipeline used for test evaluation and deployment:
# preprocessing -> SMOTE oversampling -> voting ensemble.
model_pipeline = ImbPipeline([
    ('preprocessor', processor),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('best_model', voting_clf)
])

# Fit the pipeline end-to-end on the raw training split. Without this, the
# predictions and the pickled artifact depend on whatever state earlier cells
# happened to leave in the shared `processor` and `voting_clf` objects.
model_pipeline.fit(X_train, y_train)

In [None]:
# Test-set performance of the final pipeline at its default decision rule.
y_voting_test_pred = model_pipeline.predict(X_test)

print(f"Test Accuracy: {accuracy_score(y_true=y_test, y_pred=y_voting_test_pred)}")
print(f"Test F1 Score: {f1_score(y_true=y_test, y_pred=y_voting_test_pred)}")
print(f"Test Precsion: {precision_score(y_true=y_test, y_pred=y_voting_test_pred)}")
print(f"Test Recall: {recall_score(y_true=y_test, y_pred=y_voting_test_pred)}")

# Displayed as the cell output.
confusion_matrix(y_true=y_test, y_pred=y_voting_test_pred)

In [None]:
# Test-set performance of the final pipeline at a custom decision threshold.
best_recall_threshold = 0.175

# Positive-class probability, converted to 0/1 labels at the threshold.
y_test_proba = model_pipeline.predict_proba(X_test)[:, 1]
y_test_pred = np.where(y_test_proba >= best_recall_threshold, 1, 0)

test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print("Confusion Matrix:", test_conf_matrix, sep="\n")

# save model

In [None]:
# Persist the final pipeline (uncompressed) to model_pipeline.pkl.
joblib.dump(model_pipeline, 'model_pipeline.pkl')

In [None]:
# Persist the final pipeline to model_pipeline.pkl with compress=3.
# This writes to the same path, so it replaces any existing file there.
joblib.dump(model_pipeline, 'model_pipeline.pkl', compress=3)

In [None]:
import json

# Store the chosen decision threshold as JSON in best_threshold.json.
threshold_payload = {"best_threshold": best_recall_threshold}
with open("best_threshold.json", "w") as f:
    f.write(json.dumps(threshold_payload))

In [None]:
# Thanks for being here till the end of this presentation.

# Check our LinkedIn: https://www.linkedin.com/in/mostafa-saad-7a6b8a30b/

# Check my GitHub: https://github.com/mostafa-s-mostafa/Final-project

# Check the deployment: https://mostafa-final-project-deployment.streamlit.app/