> by: Lucas Martinez

In [1]:
import pandas as pd
import numpy as np

# Step 1: Preprocessing [15]


> Look through your data for outliers, perform standardization/normalization and handle missing values. Use dimensionality reduction if your dataset has a lot of features.

In [2]:
headers = [
   'Age','Race','Marital Status','T Stage' ,'N Stage','6th Stage','differentiate','Grade','A Stage','Tumor Size','Estrogen Status',
   'Progesterone Status','Regional Node Examined','Regional Node Positive','Survival Months','Status'
]

raw_breastcancer_df = pd.read_csv('Breast_Cancer_dataset.csv', names=headers, header=0)

# These lines below just moves the status label to the first column
# I just did this because I started working with the hepatitis dataset (which has the status as first column)
# and so by doing this I do not need to refactor much of the code 
columns = ['Status'] + [col for col in raw_breastcancer_df.columns if col != 'Status']
raw_breastcancer_df = raw_breastcancer_df[columns]

# Visualize data
raw_breastcancer_df

Unnamed: 0,Status,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Regional Node Positive,Survival Months
0,Alive,68.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24.0,1,60
1,Alive,50.0,White,,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14.0,5,62
2,Alive,58.0,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14.0,7,75
3,Alive,58.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,,Positive,Positive,2.0,1,84
4,Alive,47.0,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,,Positive,3.0,1,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,Alive,62.0,,Married,T1,N1,IIA,Moderately differentiated,2,Regional,9.0,Positive,Positive,1.0,1,49
4020,Alive,56.0,White,,T2,N2,IIIA,Moderately differentiated,2,Regional,46.0,Positive,Positive,14.0,8,69
4021,Alive,68.0,White,Married,T2,N1,IIB,Moderately differentiated,2,Regional,22.0,Positive,Negative,11.0,3,69
4022,Alive,58.0,Black,Divorced,T2,N1,IIB,Moderately differentiated,2,Regional,44.0,Positive,Positive,11.0,1,72


## Encoding Categorical (string) Data

In [3]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = [
    'Race', 'Marital Status', 'T Stage', 'N Stage', '6th Stage', 'differentiate',
    'Grade', 'A Stage', 'Estrogen Status', 'Progesterone Status', 'Status'
]
numerical_cols = raw_breastcancer_df.columns.difference(categorical_cols)

breastcancer_df = raw_breastcancer_df.copy()

label_encoders = {}

# Encoding not NULLs categorical values
for col in categorical_cols:
    encoder = LabelEncoder()
    non_null_values = breastcancer_df[col].dropna()
    encoder.fit(non_null_values)
    
    breastcancer_df[col] = breastcancer_df[col].apply(
        lambda x: encoder.transform([x])[0] if pd.notnull(x) else x
    )
    label_encoders[col] = encoder

breastcancer_df

Unnamed: 0,Status,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Regional Node Positive,Survival Months
0,0,68.0,2.0,1.0,0,0,0,1,3,1,4.0,1.0,1,24.0,1,60
1,0,50.0,2.0,,1,1,2,0,2,1,35.0,1.0,1,14.0,5,62
2,0,58.0,2.0,0.0,2,2,4,0,2,1,63.0,1.0,1,14.0,7,75
3,0,58.0,2.0,1.0,0,0,0,1,3,1,,1.0,1,2.0,1,84
4,0,47.0,2.0,1.0,1,0,1,1,3,1,41.0,,1,3.0,1,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,0,62.0,,1.0,0,0,0,0,2,1,9.0,1.0,1,1.0,1,49
4020,0,56.0,2.0,,1,1,2,0,2,1,46.0,1.0,1,14.0,8,69
4021,0,68.0,2.0,1.0,1,0,1,0,2,1,22.0,1.0,0,11.0,3,69
4022,0,58.0,0.0,0.0,1,0,1,0,2,1,44.0,1.0,1,11.0,1,72


Notice that now Alive = 0 and Dead = 1

In [4]:
# Lets look at the labels just for fun
for col, encoder in label_encoders.items():
    print(f"{col}: {encoder.classes_}")

Race: ['Black' 'Other' 'White']
Marital Status: ['Divorced' 'Married' 'Separated' 'Single ' 'Widowed']
T Stage: ['T1' 'T2' 'T3' 'T4']
N Stage: ['N1' 'N2' 'N3']
6th Stage: ['IIA' 'IIB' 'IIIA' 'IIIB' 'IIIC']
differentiate: ['Moderately differentiated' 'Poorly differentiated' 'Undifferentiated'
 'Well differentiated']
Grade: [' anaplastic; Grade IV' '1' '2' '3']
A Stage: ['Distant' 'Regional']
Estrogen Status: ['Negative' 'Positive']
Progesterone Status: ['Negative' 'Positive']
Status: ['Alive' 'Dead']


## Filling missing values

In [5]:
from sklearn.impute import KNNImputer

In [6]:
# Replace NaN with None
breastcancer_df = breastcancer_df.where(pd.notnull(breastcancer_df), None)

# Apply KNN Imputer: fill missing values with the mean of their k-nearest neighbors
imputer = KNNImputer(n_neighbors=5)
breastcancer_array = imputer.fit_transform(breastcancer_df)
breastcancer_df = pd.DataFrame(breastcancer_array, columns=breastcancer_df.columns)

# Round categorical columns to the nearest integer
breastcancer_df[categorical_cols] = breastcancer_df[categorical_cols].round()

# Visualize the data
breastcancer_df

Unnamed: 0,Status,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Regional Node Positive,Survival Months
0,0.0,68.0,2.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,4.0,1.0,1.0,24.0,1.0,60.0
1,0.0,50.0,2.0,2.0,1.0,1.0,2.0,0.0,2.0,1.0,35.0,1.0,1.0,14.0,5.0,62.0
2,0.0,58.0,2.0,0.0,2.0,2.0,4.0,0.0,2.0,1.0,63.0,1.0,1.0,14.0,7.0,75.0
3,0.0,58.0,2.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,15.0,1.0,1.0,2.0,1.0,84.0
4,0.0,47.0,2.0,1.0,1.0,0.0,1.0,1.0,3.0,1.0,41.0,1.0,1.0,3.0,1.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,0.0,62.0,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,9.0,1.0,1.0,1.0,1.0,49.0
4020,0.0,56.0,2.0,1.0,1.0,1.0,2.0,0.0,2.0,1.0,46.0,1.0,1.0,14.0,8.0,69.0
4021,0.0,68.0,2.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,22.0,1.0,0.0,11.0,3.0,69.0
4022,0.0,58.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0,44.0,1.0,1.0,11.0,1.0,72.0


## Dealing with outliers

In [7]:
from sklearn.neighbors import LocalOutlierFactor

In [8]:
# Use LOF model
X = breastcancer_df.drop(columns=["Status"])
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
lof_pred = lof.fit_predict(X)

# Add 'outlier' attribure
breastcancer_df['outlier'] = lof_pred
lof_outliers = breastcancer_df[breastcancer_df['outlier'] == -1]
lof_outliers

Unnamed: 0,Status,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Regional Node Positive,Survival Months,outlier
0,0.0,68.0,2.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,4.0,1.0,1.0,24.0,1.0,60.0,-1
46,1.0,42.0,1.0,1.0,0.0,2.0,4.0,0.0,2.0,1.0,9.0,0.0,0.0,15.0,2.0,39.0,-1
54,0.0,57.0,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,1.0,0.0,16.0,1.0,74.0,-1
57,0.0,50.0,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,18.0,1.0,0.0,49.0,1.0,106.0,-1
66,1.0,31.0,2.0,1.0,2.0,2.0,4.0,1.0,3.0,1.0,70.0,1.0,1.0,23.0,23.0,44.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3934,0.0,39.0,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,6.0,1.0,1.0,27.0,3.0,77.0,-1
3942,0.0,44.0,2.0,1.0,1.0,2.0,4.0,1.0,3.0,1.0,40.0,1.0,0.0,31.0,16.0,84.0,-1
3950,0.0,47.0,2.0,1.0,1.0,2.0,4.0,0.0,2.0,1.0,25.0,1.0,1.0,60.0,19.0,70.0,-1
3957,0.0,53.6,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,12.2,1.0,1.0,37.0,2.0,67.0,-1


Since there are few outliers, lets just remove them

In [9]:
# Keep only non-outliers and then drop the 'outlier' attribute
breastcancer_df = breastcancer_df[breastcancer_df['outlier'] != -1].drop(columns=["outlier"])
breastcancer_df

Unnamed: 0,Status,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Regional Node Positive,Survival Months
1,0.0,50.0,2.0,2.0,1.0,1.0,2.0,0.0,2.0,1.0,35.0,1.0,1.0,14.0,5.0,62.0
2,0.0,58.0,2.0,0.0,2.0,2.0,4.0,0.0,2.0,1.0,63.0,1.0,1.0,14.0,7.0,75.0
3,0.0,58.0,2.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,15.0,1.0,1.0,2.0,1.0,84.0
4,0.0,47.0,2.0,1.0,1.0,0.0,1.0,1.0,3.0,1.0,41.0,1.0,1.0,3.0,1.0,50.0
5,0.0,51.0,2.0,3.0,0.0,0.0,0.0,0.0,2.0,1.0,20.0,1.0,1.0,18.0,2.0,89.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,0.0,62.0,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,9.0,1.0,1.0,1.0,1.0,49.0
4020,0.0,56.0,2.0,1.0,1.0,1.0,2.0,0.0,2.0,1.0,46.0,1.0,1.0,14.0,8.0,69.0
4021,0.0,68.0,2.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,22.0,1.0,0.0,11.0,3.0,69.0
4022,0.0,58.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0,44.0,1.0,1.0,11.0,1.0,72.0


## Standardization

In [10]:
def standardize_numerical_cols(dataframe, numerical_cols):
    for col in numerical_cols:
        mean = dataframe[col].mean()
        std_dev = dataframe[col].std()
        dataframe[col] = (dataframe[col] - mean) / std_dev

In [11]:
# Lets visualize our continuous (numerical) columns
numerical_cols

Index(['Age', 'Regional Node Examined', 'Regional Node Positive',
       'Survival Months', 'Tumor Size'],
      dtype='object')

In [12]:
standardize_numerical_cols(breastcancer_df, numerical_cols)
breastcancer_df

Unnamed: 0,Status,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Regional Node Positive,Survival Months
1,0.0,-0.475269,2.0,2.0,1.0,1.0,2.0,0.0,2.0,1.0,0.226391,1.0,1.0,0.042996,0.260534,-0.418027
2,0.0,0.457047,2.0,0.0,2.0,2.0,4.0,0.0,2.0,1.0,1.611400,1.0,1.0,0.042996,0.704476,0.153349
3,0.0,0.457047,2.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,-0.762902,1.0,1.0,-1.724633,-0.627349,0.548917
4,0.0,-0.824888,2.0,1.0,1.0,0.0,1.0,1.0,3.0,1.0,0.523178,1.0,1.0,-1.577331,-0.627349,-0.945450
5,0.0,-0.358730,2.0,3.0,0.0,0.0,0.0,0.0,2.0,1.0,-0.515579,1.0,1.0,0.632206,-0.405378,0.768676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,0.0,0.923206,2.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,-1.059689,1.0,1.0,-1.871935,-0.627349,-0.989402
4020,0.0,0.223968,2.0,1.0,1.0,1.0,2.0,0.0,2.0,1.0,0.770501,1.0,1.0,0.042996,0.926446,-0.110363
4021,0.0,1.622443,2.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,-0.416649,1.0,0.0,-0.398911,-0.183408,-0.110363
4022,0.0,0.457047,0.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0,0.671572,1.0,1.0,-0.398911,-0.627349,0.021493


Finally, I decided to not apply PCA, although ata has 15 features, which might seem like a lot but after doing (in next steps) some feature selection it will get reduced. In addition, if I applied PCA, I wouldnt be able to (later) observe and easily understand which features are more representatives.

# Step 2: Modeling [15] – For this step you can use tools and/or libraries

> Apply the Feature Selection and Feature Ranking Techniques we covered in class and/or a combination of both approaches.

In [13]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

In [14]:
def entropy(x):
    counts = Counter(x)
    
    probabilities = [count / len(x) for count in counts.values()]
    return -sum(p * np.log2(p) for p in probabilities)

In [15]:
def feature_entropies(X):
    entropies = {}
    for col in X.columns:
        entropies[col] = entropy(X[col])
    
    # Sort features by entropy
    entropies = pd.Series(entropies).sort_values(ascending=False)
    return entropies

In [16]:
X = breastcancer_df.drop(columns=["Status"])
y = breastcancer_df["Status"]
entropies = feature_entropies(X)

In [17]:
X = breastcancer_df.drop(columns=["Status"])
y = breastcancer_df["Status"]

# Rank features using entropy and reorder dataframe
feature_scores = feature_entropies(X)
sorted_attr = feature_scores.index.tolist()
X_ranked = X[sorted_attr]

# Sequential Feature Selector with LogisticRegression model
n = breastcancer_df.shape[1] - 1 # number of attributes excluding Status label
n_keep = round(0.7*n) # Keep 70% of the features

sfs = SequentialFeatureSelector(
    LogisticRegression(),
          n_features_to_select=n_keep,
          scoring='accuracy')

sfs.fit(X_ranked, y)

# Display selected features
selected_features = sfs.get_support()
keep_features_cols = list(X.columns[selected_features])

# Visualize what features we are keeping/selecting
keep_features_cols

['Age',
 'Race',
 'Marital Status',
 'N Stage',
 '6th Stage',
 'differentiate',
 'Grade',
 'Estrogen Status',
 'Regional Node Examined',
 'Survival Months']

In [18]:
keep_features_cols.insert(0, "Status") # Keep our Status label
breastcancer_df = breastcancer_df[keep_features_cols]

# Visualize our data after feature selection and ranking
breastcancer_df

Unnamed: 0,Status,Age,Race,Marital Status,N Stage,6th Stage,differentiate,Grade,Estrogen Status,Regional Node Examined,Survival Months
1,0.0,-0.475269,2.0,2.0,1.0,2.0,0.0,2.0,1.0,0.042996,-0.418027
2,0.0,0.457047,2.0,0.0,2.0,4.0,0.0,2.0,1.0,0.042996,0.153349
3,0.0,0.457047,2.0,1.0,0.0,0.0,1.0,3.0,1.0,-1.724633,0.548917
4,0.0,-0.824888,2.0,1.0,0.0,1.0,1.0,3.0,1.0,-1.577331,-0.945450
5,0.0,-0.358730,2.0,3.0,0.0,0.0,0.0,2.0,1.0,0.632206,0.768676
...,...,...,...,...,...,...,...,...,...,...,...
4019,0.0,0.923206,2.0,1.0,0.0,0.0,0.0,2.0,1.0,-1.871935,-0.989402
4020,0.0,0.223968,2.0,1.0,1.0,2.0,0.0,2.0,1.0,0.042996,-0.110363
4021,0.0,1.622443,2.0,1.0,0.0,1.0,0.0,2.0,1.0,-0.398911,-0.110363
4022,0.0,0.457047,0.0,0.0,0.0,1.0,0.0,2.0,1.0,-0.398911,0.021493


> Train the following algorithms on your dataset.

> You can experiment with neural networks too and see if you achieve better performance. NOTE: For each model used, be sure to include a 1-2 line summary as well as the pros and cons of each algorithm and list out its main hyperparameters.

In [19]:
X = breastcancer_df.drop(columns=["Status"])
y = breastcancer_df["Status"]

# For comparing later
old_accuracy = [None, None]
old_f1 = [None, None]
old_precision = [None, None]
old_recall = [None, None]
old_r2 = [None, None]
old_conf_matrix = [None, None]

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, r2_score, confusion_matrix

SEED = 17346592 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

## 2.1: KNN ( this should be implemented from scratch, do NOT use in-built libraries)

KNN is an algorithm that classifies new datapoints based on the mode of the k most similar training datapoints. 

- Pros: Intuitive and easy to implement; good for low-dimensional data; no assumption on data distribution.

- Cons: Computationally expensive on large datasets; sensitive to outliers; performance depends on the value of k .

Hyperparameters: 

- k: Number of nearest neighbors to which to compare

- distance metric: e.g., Euclidean, Manhattan, etc.

In [21]:
class KNN:
    def __init__(self, k=3):
        self.k = k

    def euclidean_distance(self, x1, x2):
        distance = 0.0
        for i in range(len(x1)):
            distance += (x1[i] - x2[i])**2
        return distance**0.5
    
    def get_mode(self, labels):
        d = {}
        for label in labels:
            if label in d.keys():
                d[label]+=1
            else:
                d[label] = 1

        mode_val = None
        mode_label = None
        for label in d:
            if mode_label == None:
                mode_val = d[label]
                mode_label = label
            elif d[label] > mode_val:
                mode_val = d[label]
                mode_label = label
        return mode_label
    
    def predict(self, X_train, y_train, X_test):
        X_train, y_train, X_test =  X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy()
        y_pred = []

        for x_test in X_test:
            distances = [self.euclidean_distance(x_test, x_train) for x_train in X_train]

            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = [y_train[i] for i in k_indices]

            # Set the mode of the label of the k neares neighbors as prediction
            pred = self.get_mode(k_nearest_labels)
            y_pred.append(pred)

        return np.array(y_pred)

In [22]:
model = KNN(k=5)
y_pred_knn = model.predict(X_train, y_train, X_test)
accuracy_score(y_pred_knn, y_test)

0.8862745098039215

## 2.2: Naïve Bayes

Naïve Bayes is a classifier based on Bayes' theorem, with the “naive” assumption of conditional independence between features.

- Pros: Fast and simple; performs well on large datasets and text classification. 

- Cons: Assumption of feature independence; sensitive to the zero class-probability issue. 

Hyperparameters: 

- Smoothing parameter (e.g., Laplace smoothing where the idea is to to add a small positive value to each conditional probability values to avoid zero values).

In [23]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
y_pred_bayes = model.fit(X_train, y_train).predict(X_test)
accuracy_score(y_pred_bayes, y_test)

0.8444444444444444

## 2.3: C4.5 Decision Tree

C4.5 is a type of decision tree that is built by splitting data based on the concept of information entropy to maximize class separation. 

- Pros: Easy to interpret; works with both numerical and categorical data; can handle non-linear relationships.

- Cons: Not good with large datasets. 

Hyperparameters: 

- Minimum samples per leaf
- Maximum tree depth

In [24]:
from C45 import C45Classifier

model = C45Classifier()
model.fit(X_train, y_train)
y_pred_c45 = model.predict(X_test)
accuracy_score(y_pred_c45, y_test)

0.7372549019607844

## 2.4: Random Forest

Random Forest is an ensemble method that consists on multiple decision trees and then averages their predictions. 
 
- Pros: Works well with large datasets and high-dimensional data. Less likely to overfit than a single tree.
 
- Cons: Can be less interpretable than a single tree; requires more computational resources. 

Hyperparameters: 
- Number of trees
- Maximum depth
- Minimum samples per split
- The criterion: eg. gini, entropy, log_loss, etc

In [25]:
model = RandomForestClassifier(random_state=SEED)
y_pred_forest = model.fit(X_train, y_train).predict(X_test)
accuracy_score(y_pred_forest, y_test)

0.9111111111111111

## 2.5: Gradient Boosting

Gradient Boosting is an ensemble method that builds trees sequentially, so that each tree corrects the errors of its predecessors. 

- Pros: Usually achives high accuracy; relatively robust to outliers.

- Cons: Computationally intensive; prone to overfitting; less interpretable.

Hyperparameters: 
- Number of trees
- Learning rate
- Maximum depth
- Minimum samples per leaf
- The criterion: eg. gini, entropy, log_loss, etc

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=SEED)
y_pred_gradboost = model.fit(X_train, y_train).predict(X_test)
accuracy_score(y_pred_gradboost, y_test)

0.9019607843137255

In [27]:
old_accuracy[0] = accuracy_score(y_test, y_pred_gradboost)
old_f1[0] = f1_score(y_test, y_pred_gradboost, average='binary', pos_label=0)
old_precision[0] = precision_score(y_test, y_pred_gradboost, average='binary', pos_label=0)
old_recall[0] = recall_score(y_test, y_pred_gradboost, average='binary', pos_label=0)
old_r2[0] = r2_score(y_test, y_pred_gradboost)
old_conf_matrix[0] = confusion_matrix(y_test, y_pred_gradboost)

## 2.6: Neural Networks

The MLP classifier is a feedforward neural network, that consists of multiple layers of neurons, that maps input data to an output. It learns the data relationship through it's hidden layers.

- Pros: Can model complex, non-linear relationships; Works well with both structured and unstructured data; It's versatile: effective across various domains (eg.: image recognition, speech recognition, time series prediction, etc)

- Cons: Requires large amounts of data and computational power; Sensitive to hyperparameter tuning and prone to overfitting; Less interpretable compared to simpler models.

Hyperparameters:
- Number of hidden layers and neurons per layer
- Activation function (e.g., ReLU, tanh, sigmoid)
- Learning rate
- Batch size
- Number of epochs
- Regularization parameters (e.g., L2 regularization, etc)

In [28]:
from sklearn.neural_network import MLPClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress ConvergenceWarnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

model = MLPClassifier(max_iter=1000, random_state=SEED)

model.fit(X_train, y_train)

y_pred_nn = model.predict(X_test)
accuracy_score(y_test, y_pred_nn)

0.9006535947712418

In [29]:
old_accuracy[1] = accuracy_score(y_test, y_pred_nn)
old_f1[1] = f1_score(y_test, y_pred_nn, average='binary', pos_label=0)
old_precision[1] = precision_score(y_test, y_pred_nn, average='binary', pos_label=0)
old_recall[1] = recall_score(y_test, y_pred_nn, average='binary', pos_label=0)
old_r2[1] = r2_score(y_test, y_pred_nn)
old_conf_matrix[1] = confusion_matrix(y_test, y_pred_nn)

# Step 3: Hyperparameter Tuning [15]

> Pick any 2 of the above algorithms that contain at least 2 hyperparameters and perform a hyperparameter search using either Grid or Random search. Display the performance metrics and conclude which set of hyperparameters worked the best.

In [30]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


accuracy = [None, None]
f1 = [None, None]
precision = [None, None]
recall = [None, None]
r2 = [None, None]
conf_matrix = [None, None]

In [31]:
# Suppress Warnings
warnings.filterwarnings("ignore")

grid_search = RandomizedSearchCV(GradientBoostingClassifier(random_state=SEED),
                           {
                                'n_estimators': np.arange(5, 150, 5),
                                'max_features': np.arange(0.1 ,1.0, 0.01),
                                'learning_rate': np.arange(0.01, 1.0, 0.05)
                            },cv=5, scoring="r2", verbose=1, n_jobs=-1, n_iter=1000,
                           )
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


In [32]:
# Best parameters and best score on the validation set
print('GradientBoostingClassifier')
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated R2 Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy[0] = accuracy_score(y_test, y_pred)
f1[0] = f1_score(y_test, y_pred, average='binary', pos_label=0)
precision[0] = precision_score(y_test, y_pred, average='binary', pos_label=0)
recall[0] = recall_score(y_test, y_pred, average='binary', pos_label=0)
r2[0] = r2_score(y_test, y_pred)
conf_matrix[0] = confusion_matrix(y_test, y_pred)

print("\nPerformance Metrics on Test Set:")
print(f"Accuracy: {accuracy[0]:.4f}")
print(f"F1 Score: {f1[0]:.4f}")
print(f"Precision: {precision[0]:.4f}")
print(f"Recall: {recall[0]:.4f}")
print(f"R2 Score: {r2[0]:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix[0])

GradientBoostingClassifier
Best Parameters: {'n_estimators': np.int64(15), 'max_features': np.float64(0.8399999999999996), 'learning_rate': np.float64(0.26)}
Best Cross-Validated R2 Score: 0.2703722764759232

Performance Metrics on Test Set:
Accuracy: 0.9085
F1 Score: 0.9478
Precision: 0.9258
Recall: 0.9710
R2 Score: 0.2568

Confusion Matrix:
[[636  19]
 [ 51  59]]


In [33]:
gradboost_feature_importances = best_model.feature_importances_
feature_names = X_train.columns
gradboost_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': gradboost_feature_importances
}).sort_values(by='Importance', ascending=False)

# Add a Rank column
gradboost_importance_df.insert(0, 'Rank', range(1, len(gradboost_importance_df) + 1))

In [34]:
# Suppress Warnings
warnings.filterwarnings("ignore")

grid_search = RandomizedSearchCV(MLPClassifier(random_state=SEED, max_iter=1000), 
                           {
                              'learning_rate_init': np.arange(0.001, 1.0, 0.05),
                              'alpha': np.arange(0.0001, 0.1, 0.005),
                            },cv=5, scoring="r2", verbose=1, n_jobs=-1, n_iter=1000,
                           )
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


In [35]:
# Best parameters and best score on the validation set
print('MLPClassifier')
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated R2 Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy[1] = accuracy_score(y_test, y_pred)
f1[1] = f1_score(y_test, y_pred, average='binary', pos_label=0)
precision[1] = precision_score(y_test, y_pred, average='binary', pos_label=0)
recall[1] = recall_score(y_test, y_pred, average='binary', pos_label=0)
r2[1] = r2_score(y_test, y_pred)
conf_matrix[1] = confusion_matrix(y_test, y_pred)

print("\nPerformance Metrics on Test Set:")
print(f"Accuracy: {accuracy[1]:.4f}")
print(f"F1 Score: {f1[1]:.4f}")
print(f"Precision: {precision[1]:.4f}")
print(f"Recall: {recall[1]:.4f}")
print(f"R2 Score: {r2[1]:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

MLPClassifier
Best Parameters: {'learning_rate_init': np.float64(0.051000000000000004), 'alpha': np.float64(0.0251)}
Best Cross-Validated R2 Score: 0.2029555728979912

Performance Metrics on Test Set:
Accuracy: 0.9098
F1 Score: 0.9492
Precision: 0.9174
Recall: 0.9832
R2 Score: 0.2674

Confusion Matrix:
[array([[636,  19],
       [ 51,  59]]), array([[644,  11],
       [ 58,  52]])]


In this step, I've found hyperparameters that achieves better accuracy than the one initially tested with, in the next step I will compare them side-by-side in a table.

# Step 4: Results [5]

> Display your results using a table and explain whether you were able to answer your initial question or not. (Note: Points will not be deducted for poor results as long as the processes followed were sound).
If the models you have used allow it, present what were the most important features used in the classification.

In [36]:
from prettytable import PrettyTable

# GradientBoostingClassifier results
table = PrettyTable()
table.field_names = ["Metric", "Score (un-optimized)", "Score (optimized)"]
table.add_rows([
    ["Accuracy", f"{old_accuracy[0]:.4f}", f"{accuracy[0]:.4f}"], ["Precision", f"{old_precision[0]:.4f}", f"{precision[0]:.4f}"], 
    ["Recall", f"{old_recall[0]:.4f}", f"{recall[0]:.4f}"], ["F1 Score", f"{old_f1[0]:.4f}", f"{f1[0]:.4f}"]
])

print("GradientBoostingClassifier")
print(table)

# MLPClassifier results
table = PrettyTable()
table.field_names = ["Metric", "Score (un-optimized)", "Score (optimized)"]
table.add_rows([
    ["Accuracy", f"{old_accuracy[1]:.4f}", f"{accuracy[1]:.4f}"], ["Precision", f"{old_precision[1]:.4f}", f"{precision[1]:.4f}"], 
    ["Recall", f"{old_recall[1]:.4f}", f"{recall[1]:.4f}"], ["F1 Score", f"{old_f1[1]:.4f}", f"{f1[1]:.4f}"]
])

print("\n\nMLPClassifier")
print(table)


GradientBoostingClassifier
+-----------+----------------------+-------------------+
|   Metric  | Score (un-optimized) | Score (optimized) |
+-----------+----------------------+-------------------+
|  Accuracy |        0.9020        |       0.9085      |
| Precision |        0.9265        |       0.9258      |
|   Recall  |        0.9618        |       0.9710      |
|  F1 Score |        0.9438        |       0.9478      |
+-----------+----------------------+-------------------+


MLPClassifier
+-----------+----------------------+-------------------+
|   Metric  | Score (un-optimized) | Score (optimized) |
+-----------+----------------------+-------------------+
|  Accuracy |        0.9007        |       0.9098      |
| Precision |        0.9226        |       0.9174      |
|   Recall  |        0.9649        |       0.9832      |
|  F1 Score |        0.9433        |       0.9492      |
+-----------+----------------------+-------------------+


Neural network does not have a way for features important, but Gradient Boosting does:

In [37]:
print("Most important features for GradientBoostingClassifier:")
print(gradboost_importance_df.to_string(index=False))

Most important features for GradientBoostingClassifier:
 Rank                Feature  Importance
    1        Survival Months    0.813418
    2                    Age    0.051888
    3              6th Stage    0.041943
    4                N Stage    0.032445
    5        Estrogen Status    0.023732
    6          differentiate    0.011798
    7 Regional Node Examined    0.010068
    8                  Grade    0.008025
    9                   Race    0.006682
   10         Marital Status    0.000000
