# ***UCI Breast Cancer Pipeline Project***
###
### Some noteworthy information from UCI:
1) ID number
2) Diagnosis (M = malignant, B = benign)
3-32)

**Ten real-valued features are computed for each cell nucleus:**

1) radius (mean of distances from center to points on the perimeter)
2) texture (standard deviation of gray-scale values)
3) perimeter
4) area
5) smoothness (local variation in radius lengths)
6) compactness (perimeter^2 / area - 1.0)
7) concavity (severity of concave portions of the contour)
8) concave points (number of concave portions of the contour)
9) symmetry
10) fractal dimension ("coastline approximation" - 1)
###


## 0. Import Modules:

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression, Lasso, Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

## 1. Import UCI Dataset &#8594; Write dataset to local csv &#8594; Search for missing values and verify shape

In [2]:
## Import UCI Dataset and write to local csv
# from ucimlrepo import fetch_ucirepo
# breast_ca = fetch_ucirepo(id=17)

# breast_ca_df = breast_ca.data.original
# breast_ca_df.to_csv('UCI_BreastCancer.csv', index=False)
# print('Successfully wrote dataset to csv file!')

# Read csv and store as df
df = pd.read_csv('UCI_BreastCancer.csv')

# Search Dataset for missing / null values
try:
    if df.isnull().sum().any()>0:
        print('NaN values found: ', df.isnull().sum())
    else:
        print('No NaN or null values found')
except Exception as e:
    print(e)

# Verify features and shape
print(df.columns)
print(df.shape)

No NaN or null values found
Index(['ID', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1',
       'compactness1', 'concavity1', 'concave_points1', 'symmetry1',
       'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2',
       'smoothness2', 'compactness2', 'concavity2', 'concave_points2',
       'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3',
       'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3',
       'symmetry3', 'fractal_dimension3', 'Diagnosis'],
      dtype='object')
(569, 32)


## 2. Define Target (y) and Features (X) &#8594; Convert Target to Binary &#8594; Train_Test_Split()

In [3]:
# Define features and target
y = df.Diagnosis
X = df.drop(columns=['Diagnosis'])
print(X.shape)
print(y.shape)

# Convert target data to binary and verify value_counts.
print('\nPrior to binary conversion: \n',y.value_counts())
try:
    y = pd.DataFrame(np.where(y == 'M',1,0), columns=['Diagnosis'])
    y = y.Diagnosis
    print('\nPost binary conversion: \n',y.value_counts(),'\n')

except Exception as e:
    print(e)

print(X.shape)
print(y.shape)

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

(569, 31)
(569,)

Prior to binary conversion: 
 Diagnosis
B    357
M    212
Name: count, dtype: int64

Post binary conversion: 
 Diagnosis
0    357
1    212
Name: count, dtype: int64 

(569, 31)
(569,)


## 2.5. Prepare Classifier Switching Class:

In [4]:
from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier

class ClfSwitch(BaseEstimator):
    def __init__(self, estimator=SGDClassifier()):
        self.estimator = estimator
    def fit(self,xx,yy=None,**kwargs):
        self.estimator.fit(xx,yy)
        return self
    def predict(self,xx,yy=None):
        return self.estimator.predict(xx)
    def predict_proba(self,xx,yy=None):
        return self.estimator.predict_proba(xx)
    def score(self,xx,yy):
        return self.estimator.score(xx,yy)

## 3. Preprocessing / Scaling / Exploratory Data Analysis:
**Just under 30 sec runtime**

In [5]:
scaler = StandardScaler()

## All features are numeric
# print(X_train.nunique())

preprocessor = ColumnTransformer([
    ('scaler', scaler, X_train.columns)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', ClfSwitch())
])

search_space = [
    {'clf__estimator': [RandomForestClassifier(random_state=13)],
     'clf__estimator__max_depth':[10,15,25],
     'clf__estimator__n_estimators':[75,100,125,150],
     },
    {'clf__estimator': [GradientBoostingClassifier(random_state=13)],
     'clf__estimator__learning_rate':np.logspace(-4,-1,4),
     'clf__estimator__n_estimators':[75,100,125,150],
     }
#    {'clf__estimator': [SGDClassifier()],
#     'clf__estimator__loss': ['hinge','squared_loss','log'],
#     'clf__estimator__alpha': np.logspace(-4,-1,9),
#     'clf__estimator__penalty': ['l1', 'l2']
#     }
]

gs = GridSearchCV(estimator=pipeline, param_grid=search_space, cv=5, error_score='raise')
gs.fit(X_train, y_train)


## 3.5. Analyze results and prepare to tune hyperparameters:

In [18]:
from sklearn.metrics import accuracy_score
gs_best = gs.best_estimator_
gs_best_clf = gs_best.named_steps['clf']
print(gs_best_clf.get_params()['estimator'])
print(gs_best_clf.get_params()['estimator__max_depth'])
print(gs_best_clf.get_params()['estimator__n_estimators'])
print(gs_best.score(X_test, y_test))
y_pred = gs_best.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(gs.best_score_)

""" WORKING ON USING .CV_RESULTS_ AS A PD.DF FOR HYPERPARAMETER TUNING AND ANALYSIS """
# cv_results_column_names = ['mean_fit_time','std_fit_time','mean_score_time','std_score_time','estimator','max_depth','learning_rate', 'split0_score','split1_score','split2_score','split3_score','split4_score','mean_score','std_score','rank_score']
cv_df = pd.DataFrame(gs.cv_results_)
columns_interest = ['param_clf__estimator','param_clf__estimator__max_depth','param_clf__estimator__n_estimators','param_clf__estimator__learning_rate','mean_test_score','std_test_score','rank_test_score']
cv_df_results = cv_df[columns_interest].round(3)
cv_df_results.style.background_gradient(axis=0,cmap='coolwarm')

RandomForestClassifier(max_depth=10, n_estimators=150, random_state=13)
10
150
0.9298245614035088
0.9298245614035088
0.9604395604395604


Unnamed: 0,param_clf__estimator,param_clf__estimator__max_depth,param_clf__estimator__n_estimators,param_clf__estimator__learning_rate,mean_test_score,std_test_score,rank_test_score
0,RandomForestClassifier(random_state=13),10.0,75,,0.958,0.021,4
1,RandomForestClassifier(random_state=13),10.0,100,,0.958,0.021,4
2,RandomForestClassifier(random_state=13),10.0,125,,0.958,0.021,4
3,RandomForestClassifier(random_state=13),10.0,150,,0.96,0.02,1
4,RandomForestClassifier(random_state=13),15.0,75,,0.958,0.021,4
5,RandomForestClassifier(random_state=13),15.0,100,,0.958,0.021,4
6,RandomForestClassifier(random_state=13),15.0,125,,0.958,0.021,4
7,RandomForestClassifier(random_state=13),15.0,150,,0.96,0.02,1
8,RandomForestClassifier(random_state=13),25.0,75,,0.958,0.021,4
9,RandomForestClassifier(random_state=13),25.0,100,,0.958,0.021,4


## 4. Hyperparameter Tuning and Feature Selection: