In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.tree import DecisionTreeClassifier

In [3]:
BreastCancer = pd.read_csv("BreastCancer.csv")

In [41]:
print(BreastCancer.shape)
print(BreastCancer.head())

(569, 33)
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

      ...       texture_worst  perimeter_worst  area

In [56]:
data = BreastCancer.iloc[:,:-1].dropna()
print(data.shape)
X = data.iloc[:,2:]
print(X.head())
y = pd.get_dummies(data["diagnosis"]).M
print(y.head())

(569, 32)
   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   fractal_dimension_mean           ...             radius_worst  \
0             

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=1)
logit1 = LogisticRegression()

logit1.fit(X_train, y_train)
pred = logit1.predict(X_test)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

cv_auc_logit = cross_val_score(logit1, X, y, cv=5, scoring="roc_auc")
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc_logit)+"\n")

LogitaucMean = cv_auc_logit.mean()
print("Mean AUC scores computed using 5-fold cross-validation: {}".format(LogitaucMean)+"\n")

cv_prec_logit = cross_val_score(logit1, X, y, cv=5, scoring="precision")
print("Precision computed using 5-fold cross-validation: {}".format(cv_prec_logit)+"\n")

LogitprecMean = cv_prec_logit.mean()
print("Mean Precision computed using 5-fold cross-validation: {}".format(LogitprecMean))

[[142   6]
 [  7  73]]
             precision    recall  f1-score   support

          0       0.95      0.96      0.96       148
          1       0.92      0.91      0.92        80

avg / total       0.94      0.94      0.94       228

AUC scores computed using 5-fold cross-validation: [0.99386305 0.99192506 0.99664655 0.98289738 0.99664655]

Mean AUC scores computed using 5-fold cross-validation: 0.9923957179771132

Precision computed using 5-fold cross-validation: [0.97297297 0.95       0.97560976 0.92857143 0.93181818]

Mean Precision computed using 5-fold cross-validation: 0.9517944678920289


In [89]:
np.random.seed(1)
param_dist = {"max_depth": np.random.randint(1, 33, size=32), "max_features": np.random.randint(1, 33, size=32), "min_samples_leaf": np.random.randint(1, 33, size=32), "criterion": ["gini", "entropy"]}
tree = DecisionTreeClassifier()
treecv = RandomizedSearchCV(tree, param_dist, cv=5, random_state=42)

treecv.fit(X, y)

print("Tuned Decision Tree Parameters: {}".format(treecv.best_params_)+"\n")
print("Best score is {}".format(treecv.best_score_))

Tuned Decision Tree Parameters: {'min_samples_leaf': 10, 'max_features': 14, 'max_depth': 10, 'criterion': 'entropy'}

Best score is 0.9472759226713533


In [92]:
tree1 = DecisionTreeClassifier(min_samples_leaf=10, max_features=14, max_depth=10, criterion="entropy")

tree1.fit(X_train, y_train)
pred = tree1.predict(X_test)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

cv_auc_tree = cross_val_score(tree1, X, y, cv=5, scoring="roc_auc")
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc_tree)+"\n")

TreeaucMean = cv_auc_tree.mean()
print("Mean AUC scores computed using 5-fold cross-validation: {}".format(LogitaucMean)+"\n")

cv_prec_logit = cross_val_score(logit1, X, y, cv=5, scoring="precision")
print("Precision computed using 5-fold cross-validation: {}".format(cv_prec_logit)+"\n")

LogitprecMean = cv_prec_logit.mean()
print("Mean Precision computed using 5-fold cross-validation: {}".format(LogitprecMean))

[[139   9]
 [ 12  68]]
             precision    recall  f1-score   support

          0       0.92      0.94      0.93       148
          1       0.88      0.85      0.87        80

avg / total       0.91      0.91      0.91       228

AUC scores computed using 5-fold cross-validation: [0.9622093  0.937177   0.9619383  0.97702884 0.96026157]

