In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset is read from this mount later on.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the project dataset from the Drive folder into a DataFrame.
DATA_PATH = '/content/drive/My Drive/cosc522/finalproject/data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Separate the label column from the predictor columns.
TARGET_COLUMN = 'Bankrupt?'
y = df[TARGET_COLUMN]
X = df.drop(TARGET_COLUMN, axis=1)

print(X.shape)

Split the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Original class distribution in training set:", y_train.value_counts(), sep="\n")

# Pre-process

## Correlation analysis

In [None]:
# Analyze the pairwise correlation between features.
X_corr = X.corr()

# Keep only strongly correlated entries (|r| >= 0.8); every other cell becomes NaN.
# NOTE(review): this filtered matrix is not what is plotted below - the heatmap
# shows the full correlation matrix. Confirm which of the two was intended.
filtered_X_corr = X_corr[(X_corr >= 0.8) | (X_corr <= -0.8)]

plt.figure(figsize=(14, 12))
sns.heatmap(X_corr, annot=True,
            cmap='coolwarm',
            vmin = -1,
            vmax = 1,
            linewidths= 0.5,
            annot_kws={"size":1},
            cbar = True,
            xticklabels=2,
            yticklabels=2)
# Save BEFORE showing: once plt.show() has rendered the figure it is no longer
# the current figure in a notebook, so a later savefig() writes an empty image.
plt.savefig('/content/drive/My Drive/cosc522/finalproject/corr.png')
plt.show()

In [None]:
import pandas as pd

threshold = 0.8

# Keep only entries above the threshold that are not exactly 1 (which removes the
# diagonal), then flatten the surviving cells into one row per (row, column) pair.
# NOTE(review): only positive correlations pass this mask, and each pair shows up
# twice - once as (A, B) and once as (B, A).
strong_positive = (X_corr > threshold) & (X_corr != 1)
high_corr_pairs = X_corr.where(strong_positive).stack().reset_index()
high_corr_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']

print(high_corr_pairs, high_corr_pairs.shape, sep="\n")

## Data Standardization

### StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min/max range on the training split only and reuse it for the test split.
scaler = MinMaxScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

### RobustScaler

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the robust scaler on the training split only and reuse it for the test split.
scaler = RobustScaler().fit(X_train)
X_train_robust = scaler.transform(X_train)
X_test_robust = scaler.transform(X_test)

## Feature processing

### PCA

In [None]:
from sklearn.decomposition import PCA

# Every feature that takes part in at least one highly correlated pair.
# sorted() pins the column order: a plain set of strings iterates in an order
# that can change between kernel restarts, which would feed PCA a differently
# ordered matrix on each run.
high_corr_features = sorted(set(high_corr_pairs['Feature_1']) | set(high_corr_pairs['Feature_2']))
print(len(high_corr_features))

# Component counts to try as a replacement for the correlated feature block.
PCA_list = [1, 2, 4, 8, 16]
X_pca_list = []   # one training frame per entry of PCA_list
XT_pca_list = []  # matching test frames, in the same order

# The inputs are the same for every component count, so copy them once.
X_for_pca = X_train.copy()
XT_for_pca = X_test.copy()

for n_comp in PCA_list:
    pca = PCA(n_components=n_comp)

    # Fit on the training split only; the test split is projected with the
    # components learned from the training data.
    X_train_pca = pca.fit_transform(X_for_pca[high_corr_features])
    X_test_pca = pca.transform(XT_for_pca[high_corr_features])

    # Wrap the projections in frames that share the original row index so they
    # can be joined back onto the untouched features.
    X_train_pca_df = pd.DataFrame(X_train_pca, columns=[f'PCA_{i+1}' for i in range(X_train_pca.shape[1])], index=X_for_pca.index)
    X_test_pca_df = pd.DataFrame(X_test_pca, columns=[f'PCA_{i+1}' for i in range(X_test_pca.shape[1])], index=XT_for_pca.index)

    # Replace the correlated block with its principal components.
    X_train_final = X_for_pca.drop(columns=high_corr_features).join(X_train_pca_df)
    X_test_final = XT_for_pca.drop(columns=high_corr_features).join(X_test_pca_df)

    print(f"PCA with {n_comp} components:")
    print("Training set shape after PCA:", X_train_final.shape)
    print("Test set shape after PCA:", X_test_final.shape)

    X_pca_list.append(X_train_final)
    XT_pca_list.append(X_test_final)

### PCA retaining 95% of the variance on the highly correlated features

In [None]:
from sklearn.decomposition import PCA

# Every feature that takes part in at least one highly correlated pair.
# sorted() pins the column order so PCA sees the same matrix on every run.
high_corr_features = sorted(set(high_corr_pairs['Feature_1']) | set(high_corr_pairs['Feature_2']))
print(len(high_corr_features))

X_for_pca = X_train.copy()
XT_for_pca = X_test.copy()

# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction of the variance.
pca = PCA(n_components=0.95)

# Fit on the training split only; project the test split with the same components.
X_train_pca = pca.fit_transform(X_for_pca[high_corr_features])
X_test_pca = pca.transform(XT_for_pca[high_corr_features])

X_train_pca_df = pd.DataFrame(X_train_pca, columns=[f'PCA_{i+1}' for i in range(X_train_pca.shape[1])], index=X_for_pca.index)
X_test_pca_df = pd.DataFrame(X_test_pca, columns=[f'PCA_{i+1}' for i in range(X_test_pca.shape[1])], index=XT_for_pca.index)

# Replace the correlated block with its principal components.
X_pca_list_95 = X_for_pca.drop(columns=high_corr_features).join(X_train_pca_df)
XT_pca_list_95 = XT_for_pca.drop(columns=high_corr_features).join(X_test_pca_df)

# Report the number of components PCA actually kept. The loop variable n_comp
# from the fixed-size PCA cell is unrelated to this fit.
print(f"PCA with {pca.n_components_} components:")
print("Training set shape after PCA:", X_pca_list_95.shape)
print("Test set shape after PCA:", XT_pca_list_95.shape)

In [None]:
# Cumulative share of variance explained by the first k components of the
# PCA object currently bound to `pca`.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
component_counts = range(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
# Reference lines: the 95% target and the number of components that were kept.
ax.axhline(y=0.95, color='r', linestyle='--', label="95% Variance")
ax.axvline(x=pca.n_components_, color='r', linestyle='--', label=f"{pca.n_components_} Components")
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Explained Variance')
ax.legend()
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
# Imported here so the cell runs on a fresh kernel: XGBClassifier is used below,
# but the notebook's other import of it sits in a much later cell.
from xgboost import XGBClassifier
import pandas as pd

class_counts = y_train.value_counts()
# Ratio of class-0 rows to class-1 rows in the training labels.
# NOTE(review): computed but not passed to the model in this cell.
scale_pos_weight = class_counts[0] / class_counts[1]

smote = SMOTE(random_state=42)

# Standardise the 95%-variance PCA feature set using training-split statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_pca_list_95)
X_test_pca = scaler.transform(XT_pca_list_95)

# Oversample the training data only; the test data is left untouched.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Resampled class distribution in training set:")
print(y_train_resampled.value_counts())

model = XGBClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_pca)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Imported here so the cell runs on a fresh kernel: XGBClassifier is used below,
# but the notebook's other import of it sits in a much later cell.
from xgboost import XGBClassifier
import pandas as pd

class_counts = y_train.value_counts()
# Ratio of class-0 rows to class-1 rows in the training labels.
# NOTE(review): computed but not passed to the model in this cell.
scale_pos_weight = class_counts[0] / class_counts[1]

smote = SMOTE(random_state=42)

# Min-max scale the 95%-variance PCA feature set using the training-split range.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_pca_list_95)
X_test_pca = scaler.transform(XT_pca_list_95)

# Oversample the training data only; the test data is left untouched.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Resampled class distribution in training set:")
print(y_train_resampled.value_counts())

model = XGBClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_pca)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import RobustScaler, StandardScaler
# Imported here so the cell runs on a fresh kernel: XGBClassifier is used below,
# but the notebook's other import of it sits in a much later cell.
from xgboost import XGBClassifier
import pandas as pd

class_counts = y_train.value_counts()
# Ratio of class-0 rows to class-1 rows in the training labels.
# NOTE(review): computed but not passed to the model in this cell.
scale_pos_weight = class_counts[0] / class_counts[1]

smote = SMOTE(random_state=42)

# Robust-scale the 95%-variance PCA feature set using training-split statistics.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_pca_list_95)
X_test_pca = scaler.transform(XT_pca_list_95)

# Oversample the training data only; the test data is left untouched.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Resampled class distribution in training set:")
print(y_train_resampled.value_counts())

model = XGBClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_pca)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

### StandardScaler PCA on the full feature set

In [None]:
# Standardise all features using training-split statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# First pass: keep every component so the full variance spectrum can be inspected.
pca = PCA(n_components=None)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = explained_variance_ratio.cumsum()

print("Explained Variance Ratio for each component:")
print(explained_variance_ratio)

print("Cumulative Explained Variance:")
print(cumulative_variance)

# Smallest number of leading components whose cumulative variance reaches 95%.
# This must be computed here; otherwise the name is undefined on a fresh kernel.
n_components = sum(cumulative_variance < 0.95) + 1
print(f"Number of components to retain 95% variance: {n_components}")

# Second pass: refit PCA with only the retained components.
pca = PCA(n_components=n_components)
X_train_pca_full = pca.fit_transform(X_train_scaled)
X_test_pca_full = pca.transform(X_test_scaled)

# Report the shape of the reduced arrays, not the untruncated first-pass ones.
print(f"Shape after PCA: X_train {X_train_pca_full.shape}, X_test {X_test_pca_full.shape}")

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
# Imported here so the cell runs on a fresh kernel: XGBClassifier is used below,
# but the notebook's other import of it sits in a much later cell.
from xgboost import XGBClassifier
import pandas as pd

class_counts = y_train.value_counts()
# Ratio of class-0 rows to class-1 rows in the training labels.
# NOTE(review): computed but not passed to the model in this cell.
scale_pos_weight = class_counts[0] / class_counts[1]
smote = SMOTE(random_state=42)

# Oversample the PCA-reduced training data produced by the preceding cell.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_pca_full, y_train)

print("Resampled class distribution in training set:")
print(y_train_resampled.value_counts())

model = XGBClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_pca_full)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

###

### MinMaxScaler PCA on the full feature set

In [None]:
# Min-max scale all features using the training-split range.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# First pass: keep every component so the full variance spectrum can be inspected.
pca = PCA(n_components=None)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = explained_variance_ratio.cumsum()

print("Explained Variance Ratio for each component:")
print(explained_variance_ratio)

print("Cumulative Explained Variance:")
print(cumulative_variance)

# Smallest number of leading components whose cumulative variance reaches 95%.
n_components = sum(cumulative_variance < 0.95) + 1
print(f"Number of components to retain 95% variance: {n_components}")

# Second pass: refit PCA with only the retained components.
pca = PCA(n_components=n_components)
X_train_pca_full = pca.fit_transform(X_train_scaled)
X_test_pca_full = pca.transform(X_test_scaled)

# Report the shape of the reduced arrays, not the untruncated first-pass ones.
print(f"Shape after PCA: X_train {X_train_pca_full.shape}, X_test {X_test_pca_full.shape}")

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
# Imported here so the cell runs on a fresh kernel: XGBClassifier is used below,
# but the notebook's other import of it sits in a much later cell.
from xgboost import XGBClassifier
import pandas as pd

class_counts = y_train.value_counts()
# Ratio of class-0 rows to class-1 rows in the training labels.
# NOTE(review): computed but not passed to the model in this cell.
scale_pos_weight = class_counts[0] / class_counts[1]

smote = SMOTE(random_state=42)

# Oversample the PCA-reduced training data produced by the preceding cell.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_pca_full, y_train)

print("Resampled class distribution in training set:")
print(y_train_resampled.value_counts())

model = XGBClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_pca_full)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

### RobustScaler PCA on the full feature set

In [None]:
# Robust-scale all features using training-split statistics.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# First pass: keep every component so the full variance spectrum can be inspected.
pca = PCA(n_components=None)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = explained_variance_ratio.cumsum()

print("Explained Variance Ratio for each component:")
print(explained_variance_ratio)

print("Cumulative Explained Variance:")
print(cumulative_variance)

# Smallest number of leading components whose cumulative variance reaches 95%.
n_components = sum(cumulative_variance < 0.95) + 1
print(f"Number of components to retain 95% variance: {n_components}")

# Second pass: refit PCA with only the retained components.
pca = PCA(n_components=n_components)
X_train_pca_full = pca.fit_transform(X_train_scaled)
X_test_pca_full = pca.transform(X_test_scaled)

# Report the shape of the reduced arrays, not the untruncated first-pass ones.
print(f"Shape after PCA: X_train {X_train_pca_full.shape}, X_test {X_test_pca_full.shape}")

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
# Imported here so the cell runs on a fresh kernel: XGBClassifier is used below,
# but the notebook's other import of it sits in a much later cell.
from xgboost import XGBClassifier
import pandas as pd

class_counts = y_train.value_counts()
# Ratio of class-0 rows to class-1 rows in the training labels.
# NOTE(review): computed but not passed to the model in this cell.
scale_pos_weight = class_counts[0] / class_counts[1]
smote = SMOTE(random_state=42)

# Oversample the PCA-reduced training data produced by the preceding cell.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_pca_full, y_train)

print("Resampled class distribution in training set:")
print(y_train_resampled.value_counts())

model = XGBClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_pca_full)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classify

## XGBoost

### only XGBoost

In [None]:
from xgboost import XGBClassifier

# Baseline: XGBoost on the raw training features, with no scaling or resampling.
xgb_model = XGBClassifier(random_state=42).fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred_xgb), sep="\n")

### SMOTE+XGBoost

In [None]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# Oversample the raw training features, then fit XGBoost on the resampled set.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

xgb_model = XGBClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
y_pred_xgb = xgb_model.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred_xgb), sep="\n")

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Hyper-parameter grid. The "xgb__" prefix routes each entry to the XGBoost
# step of the pipeline defined below.
param_grid = {
    'xgb__n_estimators': [100, 200, 300],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__max_depth': [3, 5, 7],
    'xgb__min_child_weight': [1, 3, 5],
    'xgb__gamma': [0, 0.1, 0.2],
}

# SMOTE sits inside the pipeline so it is re-fitted on the training part of
# every CV fold. Cross-validating data that was oversampled beforehand puts
# synthetic points interpolated from validation rows into the training folds,
# which inflates the F1 score used to pick the hyper-parameters.
search_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')),
])

grid_search = GridSearchCV(estimator=search_pipeline,
                           param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

# Fit on the original (un-resampled) training split; the pipeline does the oversampling.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
# The refit pipeline trained its classifier on SMOTE-resampled X_train; keep
# best_model bound to that fitted XGBClassifier.
best_model = grid_search.best_estimator_.named_steps['xgb']

y_pred_best = best_model.predict(X_test)
print("Classification Report with Best Parameters:")
print(classification_report(y_test, y_pred_best))

### StandardScaler+SMOTE+XGBoost

In [None]:
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Build the standardised features locally. X_train_scaled / X_test_scaled are
# reassigned by several earlier cells (MinMaxScaler, RobustScaler, PCA feature
# sets), so by the time this cell runs they no longer hold StandardScaler output.
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)

smote = SMOTE(random_state=42)

# Oversample the training data only.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_std, y_train)
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

y_pred_xgb = xgb_model.predict(X_test_std)
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb))

### MinMaxScaler+SMOTE+XGBoost

In [None]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# Oversample the min-max-normalised training features, then fit XGBoost.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_normalized, y_train)

xgb_model = XGBClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
y_pred_xgb = xgb_model.predict(X_test_normalized)

print("Classification Report:", classification_report(y_test, y_pred_xgb), sep="\n")

### RobustScaler+SMOTE+XGBoost

In [None]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# Oversample the robust-scaled training features, then fit XGBoost.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_robust, y_train)

xgb_model = XGBClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
y_pred_xgb = xgb_model.predict(X_test_robust)

print("Classification Report:", classification_report(y_test, y_pred_xgb), sep="\n")

### PCA+SMOTE+XGBoost

In [None]:

smote = SMOTE(random_state=42)

# Evaluate XGBoost on every PCA-reduced feature set; no scaling is applied here.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Oversample the training frame only.
    X_train_resampled, y_train_resampled = smote.fit_resample(X_pca_output, y_train)
    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = XGBClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)

    # XT_pca_list[i] is the test frame built alongside X_pca_list[i].
    y_pred = model.predict(XT_pca_list[i])

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+StandardScaler+SMOTE+XGBoost

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Ratio of class-0 rows to class-1 rows in the training labels.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

smote = SMOTE(random_state=42)

# Evaluate XGBoost on every PCA-reduced feature set after standardisation.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Fit the scaler on the training frame and reuse it for the matching test frame.
    scaler = StandardScaler().fit(X_pca_output)
    X_train_scaled = scaler.transform(X_pca_output)
    X_test_pca = scaler.transform(XT_pca_list[i])

    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = XGBClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_pca)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+MinMaxScaler+SMOTE+XGBoost

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

smote = SMOTE(random_state=42)

# Evaluate XGBoost on every PCA-reduced feature set after min-max scaling.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Fit the scaler on the training frame and reuse it for the matching test frame.
    scaler = MinMaxScaler().fit(X_pca_output)
    X_train_scaled = scaler.transform(X_pca_output)
    X_test_pca = scaler.transform(XT_pca_list[i])

    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = XGBClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_pca)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+RobustScaler+SMOTE+XGBoost

In [None]:

smote = SMOTE(random_state=42)

# Evaluate XGBoost on every PCA-reduced feature set after robust scaling.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Fit the scaler on the training frame and reuse it for the matching test frame.
    scaler = RobustScaler().fit(X_pca_output)
    X_train_scaled = scaler.transform(X_pca_output)
    X_test_pca = scaler.transform(XT_pca_list[i])

    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = XGBClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_pca)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### SMOTEENN

In [None]:
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Note: despite its name, `smote` holds a SMOTEENN resampler in this cell.
smote = SMOTEENN(random_state=42)

# Evaluate XGBoost on every PCA-reduced feature set after standardisation.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Fit the scaler on the training frame and reuse it for the matching test frame.
    scaler = StandardScaler().fit(X_pca_output)
    X_train_scaled = scaler.transform(X_pca_output)
    X_test_pca = scaler.transform(XT_pca_list[i])

    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = XGBClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_pca)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

## RandomForestClassifier

### Only RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Baseline: random forest on the raw training features, with no scaling or resampling.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### SMOTE + RandomForestClassifier

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Oversample the raw training features, then fit a random forest.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

model = RandomForestClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### StandardScaler+SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Build the standardised features locally. X_train_scaled was last reassigned
# inside a PCA loop and X_test_scaled by a RobustScaler cell, so at this point
# they describe different feature sets and cannot be used together.
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)

smote = SMOTE(random_state=42)

# Oversample the training data only.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_std, y_train)

print("Resampled class distribution in training set:")
print(y_train_resampled.value_counts())

model = RandomForestClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_std)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

### MinMaxScaler+SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Oversample the min-max-normalised training features, then fit a random forest.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_normalized, y_train)
print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

model = RandomForestClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test_normalized)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### RobustScaler+SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Oversample the robust-scaled training features, then fit a random forest.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_robust, y_train)
print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

model = RandomForestClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test_robust)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+SMOTE

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

smote = SMOTE(random_state=42)

# Evaluate a random forest on every PCA-reduced feature set; no scaling is applied.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Oversample the training frame only.
    X_train_resampled, y_train_resampled = smote.fit_resample(X_pca_output, y_train)
    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = RandomForestClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)

    # The test frame built alongside this training frame.
    X_test_pca = XT_pca_list[i]
    y_pred = model.predict(X_test_pca)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### StandardScaler+SMOTE+RandomForestClassifier

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Standardise with training-split statistics and apply the same scaler to the
# test split, so the model is trained and scored on identically scaled features.
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)

smote = SMOTE(random_state=42)

# Oversample the training data only.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_std, y_train)

print("Resampled class distribution in training set:")
print(y_train_resampled.value_counts())

model = RandomForestClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_std)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

## Logistic Regression

### LR

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Create the logistic-regression model explicitly. Without this line `model`
# is whatever estimator the previously executed cell left behind.
# NOTE(review): the settings mirror the other logistic-regression cells in this
# notebook; confirm class_weight='balanced' is wanted for the plain baseline.
model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

### NONE+SMOTE+Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

smote = SMOTE(random_state=42)

# Oversample the raw (unscaled) training features only.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Create the logistic-regression model explicitly. Without this line `model`
# is whatever estimator the previously executed cell left behind. The settings
# mirror the other logistic-regression cells in this notebook.
model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

### StandardScaler+SMOTE+LR

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Build the standardised features locally. X_train_scaled was last reassigned
# inside a PCA loop and X_test_scaled by a RobustScaler cell, so at this point
# they describe different feature sets and cannot be used together.
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)

smote = SMOTE(random_state=42)

# Oversample the training data only.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_std, y_train)

model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')

model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_std)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

### MinMaxScaler+SMOTE+LR

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Oversample the min-max-normalised training features, then fit logistic regression.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_normalized, y_train)

model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
model = model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test_normalized)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### RobustScaler+SMOTE+LR

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Oversample the robust-scaled training features, then fit logistic regression.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_robust, y_train)

model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
model = model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test_robust)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+SMOTE+LR

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

smote = SMOTE(random_state=42)

# Evaluate logistic regression on every PCA-reduced feature set; no scaling is applied.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Oversample the training frame only.
    X_train_resampled, y_train_resampled = smote.fit_resample(X_pca_output, y_train)
    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
    model = model.fit(X_train_resampled, y_train_resampled)

    # The test frame built alongside this training frame.
    X_test_pca = XT_pca_list[i]
    y_pred = model.predict(X_test_pca)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+StandardScaler+SMOTE+LR

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

smote = SMOTE(random_state=42)

# Evaluate logistic regression on every PCA-reduced feature set after standardisation.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Fit the scaler on the training frame and reuse it for the matching test frame.
    scaler = StandardScaler().fit(X_pca_output)
    X_pca_output_scaled = scaler.transform(X_pca_output)
    X_test_pca_scaled = scaler.transform(XT_pca_list[i])

    X_train_resampled, y_train_resampled = smote.fit_resample(X_pca_output_scaled, y_train)
    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
    model = model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_pca_scaled)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+MinMaxScaler+SMOTE+LR

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

smote = SMOTE(random_state=42)

# Evaluate logistic regression on every PCA-reduced feature set after min-max scaling.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Fit the scaler on the training frame and reuse it for the matching test frame.
    scaler = MinMaxScaler().fit(X_pca_output)
    X_pca_output_scaled = scaler.transform(X_pca_output)
    X_test_pca_scaled = scaler.transform(XT_pca_list[i])

    X_train_resampled, y_train_resampled = smote.fit_resample(X_pca_output_scaled, y_train)
    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
    model = model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_pca_scaled)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+RobustScaler+SMOTE+LR

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd

# PCA + RobustScaler + SMOTE + logistic regression.
# Same experiment loop as the other PCA/LR cells, but using RobustScaler.
smote = SMOTE(random_state=42)

for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Fit scaling statistics on the training projection, then apply them to
    # both the training projection and the matching test projection.
    scaler = RobustScaler().fit(X_pca_output)
    X_pca_output_scaled = scaler.transform(X_pca_output)
    X_test_pca_scaled = scaler.transform(XT_pca_list[i])

    # Oversample the scaled training data only.
    X_train_resampled, y_train_resampled = smote.fit_resample(X_pca_output_scaled, y_train)

    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    model = LogisticRegression(
        random_state=42,
        solver='liblinear',
        class_weight='balanced',
    ).fit(X_train_resampled, y_train_resampled)

    y_pred = model.predict(X_test_pca_scaled)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

## MLP

### MLP

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# MLP baseline: trained directly on X_train (this cell applies no scaling,
# PCA, or resampling) and evaluated on X_test.
mlp_params = {
    "hidden_layer_sizes": (128, 32),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 300,
    "random_state": 42,
}
model = MLPClassifier(**mlp_params).fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### StandardScaler+mlp

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# StandardScaler + MLP (no resampling).
# Scaling statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# Despite its name, X_test_pca holds the *scaled* test features here;
# no PCA is applied in this cell.
X_test_pca = scaler.transform(X_test)

mlp_params = {
    "hidden_layer_sizes": (128, 32),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 300,
    "random_state": 42,
}
model = MLPClassifier(**mlp_params).fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_pca)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### MinMaxScaler+mlp

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# MinMaxScaler + MLP (no resampling).
# The scaler is fitted on the training split and reused for the test split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# Despite its name, X_test_pca holds the *scaled* test features here;
# no PCA is applied in this cell.
X_test_pca = scaler.transform(X_test)

mlp_params = {
    "hidden_layer_sizes": (128, 32),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 300,
    "random_state": 42,
}
model = MLPClassifier(**mlp_params).fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_pca)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### RobustScaler+mlp

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# RobustScaler + MLP (no resampling).
# The scaler is fitted on the training split and reused for the test split.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# Despite its name, X_test_pca holds the *scaled* test features here;
# no PCA is applied in this cell.
X_test_pca = scaler.transform(X_test)

mlp_params = {
    "hidden_layer_sizes": (128, 32),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 300,
    "random_state": 42,
}
model = MLPClassifier(**mlp_params).fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_pca)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+mlp

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# PCA + MLP (no resampling).
# This is the unresampled counterpart of the "PCA+SMOTE+mlp" experiment:
# each PCA projection is fed to the MLP as-is, with the original training
# labels, so the effect of SMOTE can be measured against it.
for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    model = MLPClassifier(
        hidden_layer_sizes=(128, 32),
        activation='relu',
        solver='adam',
        max_iter=300,
        random_state=42
    )
    # Fit on the PCA-projected training data with the original (not
    # oversampled) class distribution.
    model.fit(X_pca_output, y_train)

    # XT_pca_list is indexed in parallel with X_pca_list; predictions on its
    # i-th entry are scored against y_test.
    y_pred = model.predict(XT_pca_list[i])

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

### SMOTE+mlp

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# SMOTE + MLP (no scaling).
# Oversample the training split with SMOTE, then train and evaluate the MLP.
smote = SMOTE(random_state=42)

# Only the training data is resampled; X_test / y_test are used unchanged.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

mlp_params = {
    "hidden_layer_sizes": (128, 32),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 300,
    "random_state": 42,
}
model = MLPClassifier(**mlp_params).fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### StandardScaler+SMOTE+mlp

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# StandardScaler + SMOTE + MLP.
# Order of operations: scale (fitted on train only) -> oversample train -> fit.
smote = SMOTE(random_state=42)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE runs on the already-scaled training data; the test set is not resampled.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

mlp_params = {
    "hidden_layer_sizes": (128, 32),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 300,
    "random_state": 42,
}
model = MLPClassifier(**mlp_params).fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_scaled)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### MinMaxScaler+SMOTE+MLP

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# MinMaxScaler + SMOTE + MLP.
# Order of operations: scale (fitted on train only) -> oversample train -> fit.
smote = SMOTE(random_state=42)

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE runs on the already-scaled training data; the test set is not resampled.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

mlp_params = {
    "hidden_layer_sizes": (128, 32),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 300,
    "random_state": 42,
}
model = MLPClassifier(**mlp_params).fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_scaled)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### RobustScaler+SMOTE+MLP

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# RobustScaler + SMOTE + MLP.
# Order of operations: scale (fitted on train only) -> oversample train -> fit.
smote = SMOTE(random_state=42)

scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE runs on the already-scaled training data; the test set is not resampled.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

mlp_params = {
    "hidden_layer_sizes": (128, 32),
    "activation": 'relu',
    "solver": 'adam',
    "max_iter": 300,
    "random_state": 42,
}
model = MLPClassifier(**mlp_params).fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test_scaled)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+SMOTE+mlp

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# PCA + SMOTE + MLP (no scaling after PCA).
# For every PCA projection: oversample the training projection with SMOTE,
# fit the MLP, and evaluate on the test projection at the same index.
smote = SMOTE(random_state=42)

for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # Only the training projection is resampled.
    X_train_resampled, y_train_resampled = smote.fit_resample(X_pca_output, y_train)

    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    mlp_params = {
        "hidden_layer_sizes": (128, 32),
        "activation": 'relu',
        "solver": 'adam',
        "max_iter": 300,
        "random_state": 42,
    }
    model = MLPClassifier(**mlp_params).fit(X_train_resampled, y_train_resampled)

    y_pred = model.predict(XT_pca_list[i])

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+StandardScaler+SMOTE+mlp

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# PCA + StandardScaler + SMOTE + MLP.
# For every PCA projection: standardize, oversample the training data with
# SMOTE, fit the MLP, and evaluate on the matching test projection.
smote = SMOTE(random_state=42)

for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # A fresh scaler per projection, fitted on the training projection only.
    scaler = StandardScaler().fit(X_pca_output)
    X_train_scaled = scaler.transform(X_pca_output)
    X_test_pca = scaler.transform(XT_pca_list[i])

    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    mlp_params = {
        "hidden_layer_sizes": (128, 32),
        "activation": 'relu',
        "solver": 'adam',
        "max_iter": 300,
        "random_state": 42,
    }
    model = MLPClassifier(**mlp_params).fit(X_train_resampled, y_train_resampled)

    y_pred = model.predict(X_test_pca)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+MinMaxScaler+SMOTE+MLP

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# PCA + MinMaxScaler + SMOTE + MLP.
# For every PCA projection: min-max scale, oversample the training data with
# SMOTE, fit the MLP, and evaluate on the matching test projection.
smote = SMOTE(random_state=42)

for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # A fresh scaler per projection, fitted on the training projection only.
    scaler = MinMaxScaler().fit(X_pca_output)
    X_train_scaled = scaler.transform(X_pca_output)
    X_test_pca = scaler.transform(XT_pca_list[i])

    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    mlp_params = {
        "hidden_layer_sizes": (128, 32),
        "activation": 'relu',
        "solver": 'adam',
        "max_iter": 300,
        "random_state": 42,
    }
    model = MLPClassifier(**mlp_params).fit(X_train_resampled, y_train_resampled)

    y_pred = model.predict(X_test_pca)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### PCA+RobustScaler+SMOTE+MLP

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# PCA + RobustScaler + SMOTE + MLP.
# For every PCA projection: robust-scale, oversample the training data with
# SMOTE, fit the MLP, and evaluate on the matching test projection.
smote = SMOTE(random_state=42)

for i, X_pca_output in enumerate(X_pca_list):
    print(f"\n=== Processing PCA Result with {X_pca_output.shape[1]} Features ===")

    # A fresh scaler per projection, fitted on the training projection only.
    scaler = RobustScaler().fit(X_pca_output)
    X_train_scaled = scaler.transform(X_pca_output)
    X_test_pca = scaler.transform(XT_pca_list[i])

    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

    print("Resampled class distribution in training set:", y_train_resampled.value_counts(), sep="\n")

    mlp_params = {
        "hidden_layer_sizes": (128, 32),
        "activation": 'relu',
        "solver": 'adam',
        "max_iter": 300,
        "random_state": 42,
    }
    model = MLPClassifier(**mlp_params).fit(X_train_resampled, y_train_resampled)

    y_pred = model.predict(X_test_pca)

    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

## SVM

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# StandardScaler + SMOTE + RBF-kernel SVM.
# Order of operations: scale (fitted on train only) -> oversample train -> fit.
smote = SMOTE(random_state=42)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Only the scaled training data is resampled; the test set keeps its
# original class distribution.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# NOTE(review): class_weight='balanced' is applied on top of SMOTE
# resampling -- presumably redundant once classes are balanced; confirm.
svm_model = SVC(
    kernel='rbf',
    C=1,
    gamma='scale',
    random_state=42,
    class_weight='balanced',
).fit(X_train_resampled, y_train_resampled)

y_pred_svm = svm_model.predict(X_test_scaled)

print("Classification Report:", classification_report(y_test, y_pred_svm), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_svm), sep="\n")