# Homework 5 RF Accuracy Improvement

This assignment is inspired by examples from Shan-Hung Wu of National Tsing Hua University.

Requirement: improve the accuracy per feature of the following code from 0.03 to at least 0.45, while keeping the overall accuracy above 0.92.

Here are three hints:

1. You can improve the ratio by picking out or "creating" several features.
2. Tune the hyperparameters.
3. The ratio can be improved from 0.03 up to 0.47.

In [None]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Load the breast cancer dataset once; the same Bunch supplies the feature
# matrix, the target vector and the feature names.
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target

# Build a dataframe so every column is labelled with its feature name
df = pd.DataFrame(X, columns=init_data.feature_names)

# Absolute correlation of each feature with the target, used to rank features.
# NOTE(review): the ranking uses the labels of every sample, including those
# that later end up in validation folds, so downstream scores may be optimistic.
corr_matrix = df.corrwith(pd.Series(y)).abs()

# Keep the 2 features most strongly correlated with the target
features = corr_matrix.sort_values(ascending=False)[:2]

print(f"Features used for model: {list(features.index)}")

# Map the selected feature names back to their column positions in X.
# The name list is built once instead of on every lookup.
feature_names = init_data.feature_names.tolist()
indices = [feature_names.index(feat) for feat in features.index]
sel_features = X[:, indices]

# Base random forest with a fixed seed; used as the estimator for the search
model = RandomForestClassifier(random_state=0)

# Candidate hyperparameter values the search samples from
param_dist = dict(
    n_estimators=list(range(50, 501, 50)),
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10, 20, 50],
    min_samples_leaf=[1, 2, 4, 6, 8],
)

# Randomized search: 10 sampled configurations, each scored by 5-fold accuracy
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=0,
)
search.fit(sel_features, y)

best_params = search.best_params_
print(f"Parameters used for RandomForestClassifier: {best_params}")

# Rebuild the forest with the winning parameters and train it on the
# selected features and targets
model = RandomForestClassifier(random_state=0, **best_params)
model.fit(sel_features, y)

# Average accuracy of the tuned model over 5 cross-validation folds
cv_scores = cross_val_score(model, sel_features, y, scoring='accuracy', cv=5)
average_accuracy = cv_scores.mean()

# Average accuracy divided by the number of feature columns the model uses
num_features = sel_features.shape[-1]
accuracy_per_feature = average_accuracy / num_features

# Report both metrics to four decimal places
print(f"Average Accuracy: {average_accuracy:.4f}")
print(f"Accuracy per Feature: {accuracy_per_feature:.4f}")


(569, 30)
Accuracy: 0.95
Accuracy per feature: 0.47
