# Imports

In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# Import the classifiers we will be using
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import train/test split function
from sklearn.model_selection import train_test_split

# Import cross validation scorer
from sklearn.model_selection import cross_val_score

# Import ROC AUC scoring function
from sklearn.metrics import roc_auc_score

from sklearn import preprocessing

# Read the data
#### This is a breast cancer diagnostic dataset: these features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.
#### "diagnosis" is our target: 0 for benign, 1 for malignant.

In [4]:
# Load the dataset from CSV; 'index_col' makes the 'id' column the row index
df = pd.read_csv(
    '../data/breast_cancer.csv',
    index_col='id',
)

In [13]:
# Peek at the first three rows, then list the column labels.
# NOTE: in a notebook cell only the value of the last expression is displayed,
# so the output below is the column index, not the head.
df.head(3)
df.columns


Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [8]:
# And the shape, as a (number of rows, number of columns) tuple
df.shape

(569, 31)

In [11]:
# Assign the features and the target:
# 'features' holds every column except the label, 'target' holds the label
# column ('diagnosis': 0 for benign, 1 for malignant).
features = df.drop('diagnosis', axis=1)
target = df['diagnosis']


# Train/test split

In [12]:
# Create the train/test split: 30% of the rows are held out for testing,
# and the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

# Modelling with standard train/test split


### DecisionTreeClassifier

In [None]:
# Choose the model (fixed random_state so the fitted tree is reproducible)
tree_model = DecisionTreeClassifier(random_state=0)

# Fit the model
tree_model.fit(X_train, y_train)

# Make the predictions: predict_proba returns one probability column per class;
# with 0/1 labels, column 1 is the probability of the positive class (1 = malignant)
y_pred = tree_model.predict_proba(X_test)

# Score the predictions (ROC AUC is computed from the positive-class probabilities)
score = roc_auc_score(y_test, y_pred[:, 1])

print("ROC AUC: " + str(score))

# Round the positive-class probabilities to hard 0/1 labels and count disagreements.
# np.round is used because the np.round_ alias was removed in NumPy 2.0.
# Note that np.round rounds halves to even, so a probability of exactly 0.5 becomes 0.
n_mislabeled = (y_test != np.round(y_pred[:, 1])).sum()
print("Number of mislabeled points out of a total %d points: %d" % (y_test.shape[0], n_mislabeled))

In [None]:
# Choose the K-Nearest Neighbors model

# Fit the model

# Make the predictions

# Score the predictions

# Print the score

In [None]:
# Choose the Naive Bayes model

# Fit the model

# Make the predictions

# Score the predictions

# Print the score

In [None]:
# Choose the Random Forest model

# Fit the model

# Make the predictions

# Score the predictions

# Print the score

# Modelling with k-fold cross validation

In [None]:
# Choose the Decision Tree model

# Fit, predict and score in one step, using cross_val_score()

# Print the scores

# Print the mean score

In [None]:
# Choose the K-Nearest Neighbors model

# Fit, predict and score in one step, using cross_val_score()

# Print the scores

# Print the mean score

In [None]:
# Choose the Naive Bayes model

# Fit, predict and score in one step, using cross_val_score()

# Print the scores

# Print the mean score

In [None]:
# Choose the Random Forest model

# Fit, predict and score in one step, using cross_val_score()

# Print the scores

# Print the mean score

# Bonus exercise:
#### Check the documentation for each of the models you used.
#### Try different hyperparameters, and see if you can improve the score!
#### Some ideas:
#### - Tune the number of neighbors in the K-Nearest Neighbors model.
#### - Try balancing the class weight, and the maximum depth on the Decision tree and Random forest models.