In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Locate the CSV relative to the notebook's working directory:
# root_dir is two levels above it and the file lives in <root_dir>/data.
current_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))
data_dir = os.path.join(root_dir, 'data')
data_file_path = os.path.join(data_dir, 'breast_cancer_data.csv')

# Comma-separated file; the first row supplies the column names.
df_breast_cancer = pd.read_csv(
    data_file_path,
    header=0,
    sep=",",
)

# With Oversampled Dataset

## Random Forest Classifier

In [None]:
# Scoring callable handed to RandomForestClassifier(oob_score=...)
def custom_metric(y_true, y_pred):
    """Return the support-weighted recall of ``y_pred`` against ``y_true``.

    Thin wrapper around ``recall_score(..., average='weighted')`` so the metric
    can be passed around as a two-argument callable.
    """
    weighted_recall = recall_score(y_true, y_pred, average='weighted')
    return weighted_recall

# Grid search over random-forest hyperparameters, scored on the out-of-bag samples
def tune_random_forest(X_train_best, y_train, n_estimators_list, max_samples_list, max_features_list,
                       max_depth_list=None):
    """Exhaustively search random-forest hyperparameters by out-of-bag weighted recall.

    One ``RandomForestClassifier`` is fitted per combination of the supplied
    grids. Each forest is built with ``oob_score=custom_metric``, so its
    ``oob_score_`` attribute is whatever ``custom_metric`` returns for the
    out-of-bag predictions (weighted recall, not F1).

    Parameters
    ----------
    X_train_best : array-like
        Training features, passed unchanged to ``fit``.
    y_train : array-like
        Training labels, passed unchanged to ``fit``.
    n_estimators_list, max_samples_list, max_features_list : iterable
        Candidate values for the ``RandomForestClassifier`` arguments of the
        same name (minus the ``_list`` suffix).
    max_depth_list : iterable, optional
        Candidate ``max_depth`` values. When omitted, the notebook-level
        variable ``max_depth_list`` is used, which is what this function read
        implicitly before the parameter existed. If no such variable exists,
        only ``None`` (no depth limit) is tried.

    Returns
    -------
    tuple
        ``(best_params, best_score)``. ``best_params`` is a dict with the keys
        ``n_estimators``, ``max_samples``, ``max_features`` and ``max_depth``;
        ``best_score`` is the matching OOB score. ``({}, 0)`` is returned only
        when the grid is empty.
    """
    from itertools import product

    if max_depth_list is None:
        # Backward compatibility: existing calls do not pass the depth grid and
        # rely on the variable defined at notebook level.
        max_depth_list = globals().get('max_depth_list', [None])

    best_recall = 0
    best_params = {}
    # Kept from the original: reseeds NumPy's global RNG. The forests themselves
    # are seeded through random_state below.
    np.random.seed(42)

    # product() walks the grid in the same order as four nested for-loops.
    for n_estimators, max_samples, max_features, max_depth in product(
        n_estimators_list, max_samples_list, max_features_list, max_depth_list
    ):
        rf_classifier = RandomForestClassifier(
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            max_depth=max_depth,
            criterion='entropy',
            class_weight='balanced_subsample',
            random_state=42,
            oob_score=custom_metric
        )

        # Fit the model; the OOB score is computed as part of fit()
        rf_classifier.fit(X_train_best, y_train)
        oob_score = rf_classifier.oob_score_

        # Always record the first candidate so best_params is never left empty
        # (e.g. when every score is 0); afterwards keep strict improvements only.
        if not best_params or oob_score > best_recall:
            best_recall = oob_score
            best_params = {
                'n_estimators': n_estimators,
                'max_samples': max_samples,
                'max_features': max_features,
                'max_depth': max_depth,
            }

    return best_params, best_recall

# Hyperparameter ranges to search (oversampled training data)
n_estimators_list = [100, 150]
max_samples_list = [0.6, 0.7, 0.8]
max_features_list = ['sqrt', 'log2', None]
max_depth_list = [10, None]  # read by tune_random_forest as a notebook-level variable

# Tune hyperparameters
best_params, best_recall_score = tune_random_forest(
    X_train_pca_best,
    y_train_resampled,
    n_estimators_list,
    max_samples_list,
    max_features_list,
)

print(f"Best parameters: {best_params}")
# The search is scored with custom_metric (weighted recall), so label it as such.
print(f"Best Recall-weighted score: {best_recall_score}")

In [None]:
# Train the final model with the best parameters.
# criterion and class_weight repeat the fixed settings used inside
# tune_random_forest, so the refit forest is the same kind of model whose
# OOB score selected these hyperparameters.
rf_classifier = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_samples=best_params['max_samples'],
    max_features=best_params['max_features'],
    max_depth=best_params['max_depth'],
    criterion='entropy',
    class_weight='balanced_subsample',
    random_state=42
)

rf_classifier.fit(X_train_pca_best, y_train_resampled)

y_pred = rf_classifier.predict(X_test_pca_best)

# Integer class code -> human-readable label
reverse_price_mapping = {0: 'Alive', 1: 'Dead'}

# Map the predictions and actual values using the reverse mapping
y_pred_dec = pd.Series(y_pred).map(reverse_price_mapping)
y_test_dec = pd.Series(y_test).map(reverse_price_mapping)

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test_dec, y_pred_dec))

In [None]:
# Take the class order from the label mapping rather than from the order in
# which labels happen to appear in the test set: the axes are then the same on
# every run, and a class that is predicted but missing from y_test still gets
# its own row/column.
classes = list(reverse_price_mapping.values())

# Confusion Matrix
conf_matrix = confusion_matrix(y_test_dec, y_pred_dec, labels=classes)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=classes, yticklabels=classes, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix - Random Forest (oversampled training data)')
plt.show()

# With Original Dataset

## Random Forest Classifier

In [None]:
# Grid search for the original dataset. NOTE: this redefines (and shadows) the
# tune_random_forest from the oversampled section; unlike that version it
# forwards per-sample weights to fit().
def tune_random_forest(X_train_best, y_train, n_estimators_list, max_samples_list, max_features_list,
                       max_depth_list=None, sample_weight=None):
    """Exhaustively search random-forest hyperparameters by out-of-bag weighted recall.

    One ``RandomForestClassifier`` is fitted (with sample weights) per
    combination of the supplied grids. Each forest is built with
    ``oob_score=custom_metric``, so its ``oob_score_`` attribute is whatever
    ``custom_metric`` returns for the out-of-bag predictions (weighted recall,
    not F1).

    Parameters
    ----------
    X_train_best : array-like
        Training features, passed unchanged to ``fit``.
    y_train : array-like
        Training labels, passed unchanged to ``fit``.
    n_estimators_list, max_samples_list, max_features_list : iterable
        Candidate values for the ``RandomForestClassifier`` arguments of the
        same name (minus the ``_list`` suffix).
    max_depth_list : iterable, optional
        Candidate ``max_depth`` values. When omitted, the notebook-level
        variable ``max_depth_list`` is used, which is what this function read
        implicitly before the parameter existed. If no such variable exists,
        only ``None`` (no depth limit) is tried.
    sample_weight : array-like, optional
        Per-sample weights forwarded to ``fit``. When omitted, the
        notebook-level variable ``sample_weights`` is used, as before; a
        ``NameError`` is raised if it is not defined.

    Returns
    -------
    tuple
        ``(best_params, best_score)``. ``best_params`` is a dict with the keys
        ``n_estimators``, ``max_samples``, ``max_features`` and ``max_depth``;
        ``best_score`` is the matching OOB score. ``({}, 0)`` is returned only
        when the grid is empty.
    """
    from itertools import product

    if max_depth_list is None:
        # Backward compatibility: existing calls do not pass the depth grid and
        # rely on the variable defined at notebook level.
        max_depth_list = globals().get('max_depth_list', [None])
    if sample_weight is None:
        # Backward compatibility: existing calls rely on the notebook-level
        # `sample_weights`; fail loudly rather than silently fit unweighted.
        sample_weight = sample_weights

    best_recall_score = 0
    best_params = {}
    # Kept from the original: reseeds NumPy's global RNG. The forests themselves
    # are seeded through random_state below.
    np.random.seed(42)

    # product() walks the grid in the same order as four nested for-loops.
    for n_estimators, max_samples, max_features, max_depth in product(
        n_estimators_list, max_samples_list, max_features_list, max_depth_list
    ):
        rf_classifier = RandomForestClassifier(
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            max_depth=max_depth,
            criterion='entropy',
            class_weight='balanced_subsample',
            random_state=42,
            oob_score=custom_metric
        )

        # Fit the model; the OOB score is computed as part of fit()
        rf_classifier.fit(X_train_best, y_train, sample_weight=sample_weight)
        oob_score = rf_classifier.oob_score_

        # Always record the first candidate so best_params is never left empty
        # (e.g. when every score is 0); afterwards keep strict improvements only.
        if not best_params or oob_score > best_recall_score:
            best_recall_score = oob_score
            best_params = {
                'n_estimators': n_estimators,
                'max_samples': max_samples,
                'max_features': max_features,
                'max_depth': max_depth,
            }

    return best_params, best_recall_score

# Candidate values explored in the original-dataset section
max_features_list = ['sqrt', 'log2', None]
max_depth_list = [10, None]  # read by tune_random_forest as a notebook-level variable
max_samples_list = [0.6, 0.7, 0.8]
n_estimators_list = [100, 150, 200]

# Run the search.
# NOTE: despite its name, best_f1_score holds the weighted-recall OOB score
# (tune_random_forest scores every forest with custom_metric).
best_params, best_f1_score = tune_random_forest(
    X_train_pca_best,
    y_train,
    n_estimators_list,
    max_samples_list,
    max_features_list,
)

print(f"Best parameters: {best_params}")
print(f"Best Recall-weighted score: {best_f1_score}")

In [None]:
# Fit the final forest with the selected hyperparameters; the fixed settings
# (criterion, class_weight, random_state) match those used during the search.
rf_classifier = RandomForestClassifier(
    criterion='entropy',
    class_weight='balanced_subsample',
    random_state=42,
    n_estimators=best_params['n_estimators'],
    max_samples=best_params['max_samples'],
    max_features=best_params['max_features'],
    max_depth=best_params['max_depth'],
).fit(X_train_pca_best, y_train, sample_weight=sample_weights)  # fit() returns the estimator itself

y_pred = rf_classifier.predict(X_test_pca_best)

# Integer class code -> human-readable label
reverse_price_mapping = {0: 'Alive', 1: 'Dead'}

# Decode both the ground truth and the predictions to their label names
y_test_dec, y_pred_dec = (
    pd.Series(encoded).map(reverse_price_mapping) for encoded in (y_test, y_pred)
)

# Per-class precision / recall / F1 summary
print("Classification Report:")
print(classification_report(y_test_dec, y_pred_dec))

In [None]:
# Take the class order from the label mapping rather than from the order in
# which labels happen to appear in the test set: the axes are then the same on
# every run, and a class that is predicted but missing from y_test still gets
# its own row/column.
classes = list(reverse_price_mapping.values())

# Confusion Matrix
conf_matrix = confusion_matrix(y_test_dec, y_pred_dec, labels=classes)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=classes, yticklabels=classes, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix - Random Forest (original training data)')
plt.show()