# Problem Statement and Brief Summary

# Data Ingestion and EDA

In [278]:
import pandas as pd
import numpy as np

In [279]:
# Loading the dataset
df = pd.read_csv('d:/git/Data-Challenges/Breast Cancer Detection/Copy of breast-cancer-wisconsin.txt', index_col='Index')
df.tail()

Unnamed: 0_level_0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15850,1169049,7,3,4,4,3,3,3,2,7,4
15851,1076352,3,6,4,10,3,3,3,4,1,4
15852,1107684,6,10,5,5,4,10,6,10,1,4
15853,1111249,10,6,6,3,4,5,3,6,1,4
15854,1106829,7,8,7,2,4,8,3,8,2,4


In [280]:
# Checking the data type of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15855 entries, 0 to 15854
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   ID                           15855 non-null  int64 
 1   Clump Thickness              15855 non-null  int64 
 2   Uniformity of Cell Size      15827 non-null  object
 3   Uniformity of Cell Shape     15827 non-null  object
 4   Marginal Adhesion            15827 non-null  object
 5   Single Epithelial Cell Size  15827 non-null  object
 6   Bare Nuclei                  15827 non-null  object
 7   Bland Chromatin              15827 non-null  object
 8   Normal Nucleoli              15827 non-null  object
 9   Mitoses                      15827 non-null  object
 10  Class                        15827 non-null  object
dtypes: int64(2), object(9)
memory usage: 1.5+ MB


In [281]:
# Coerce non-numeric to NA
df = df.apply(lambda x: pd.to_numeric(x,errors='coerce') if x.dtype=='object' else x)

In [282]:
# Checking the distribution of Class
df.Class.value_counts(dropna=False)

4.0     15164
2.0       456
40.0      151
NaN        79
20.0        5
Name: Class, dtype: int64

There are two types of class which are '4' and '2'. It seems like many values encoded wrongly as 40 and 20. Let correct these classes

In [283]:
# Replace 4 and 2 where class is 40 or 20
df.loc[df['Class']==40, 'Class'] = 4
df.loc[df['Class']==20, 'Class'] = 2

In [284]:
# And replace class where 4=0 and 2=1 to indicate a detection of breast cancer
df.loc[df['Class']==4, 'Class'] = 0
df.loc[df['Class']==2, 'Class'] = 1

In [285]:
# Regarding 79 cases of NA in classification which we don't know the true class, it is not reasonable to impute these missing
# Let drop missing classes
df.dropna(subset=['Class'], inplace=True)
df.shape

(15776, 11)

In [286]:
df.describe()

Unnamed: 0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,15776.0,15776.0,15776.0,15776.0,15776.0,15776.0,15760.0,15776.0,15776.0,15776.0,15776.0
mean,1125772.0,8.257987,7.475849,6.091405,5.489351,4.597807,7.004822,5.019397,5.40644,1.829361,0.029222
std,994953.2,7.374578,6.80088,5.523121,6.109255,4.832097,7.058168,4.856222,5.874417,2.857843,0.168432
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,1076352.0,7.0,5.0,4.0,3.0,3.0,3.0,3.0,3.0,1.0,0.0
50%,1111249.0,8.0,6.0,5.0,4.0,4.0,8.0,4.0,5.0,1.0,0.0
75%,1198641.0,10.0,10.0,7.0,7.0,4.0,10.0,7.0,8.0,2.0,0.0
max,13454350.0,100.0,100.0,100.0,100.0,100.0,100.0,70.0,100.0,70.0,1.0


### outliers , correlation , remove duplications steps before fillna with mean()

In [287]:
df.corr()

Unnamed: 0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
ID,1.0,0.798615,0.798907,0.790098,0.659632,0.675996,0.779951,0.800855,0.669345,0.610312,0.018211
Clump Thickness,0.798615,1.0,0.79825,0.807202,0.596727,0.749961,0.736488,0.77732,0.658163,0.497002,-0.116305
Uniformity of Cell Size,0.798907,0.79825,1.0,0.901479,0.648359,0.702182,0.764694,0.814071,0.874015,0.3659,-0.152814
Uniformity of Cell Shape,0.790098,0.807202,0.901479,1.0,0.572345,0.67748,0.76199,0.74987,0.752936,0.431557,-0.140997
Marginal Adhesion,0.659632,0.596727,0.648359,0.572345,1.0,0.818135,0.519946,0.65194,0.520169,0.392536,-0.111973
Single Epithelial Cell Size,0.675996,0.749961,0.702182,0.67748,0.818135,1.0,0.498035,0.706558,0.589639,0.371526,-0.081984
Bare Nuclei,0.779951,0.736488,0.764694,0.76199,0.519946,0.498035,1.0,0.736532,0.559522,0.327993,-0.134042
Bland Chromatin,0.800855,0.77732,0.814071,0.74987,0.65194,0.706558,0.736532,1.0,0.671661,0.396118,-0.096562
Normal Nucleoli,0.669345,0.658163,0.874015,0.752936,0.520169,0.589639,0.559522,0.671661,1.0,0.2959,-0.118101
Mitoses,0.610312,0.497002,0.3659,0.431557,0.392536,0.371526,0.327993,0.396118,0.2959,1.0,-0.040606


In [288]:
# Replace missing values with the column's mean
df = df.apply(lambda x: x.fillna(x.mean()), axis=0)
df.isnull().sum()

ID                             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [289]:
# The dataset contain many duplate IDs with the same row values, let drop these
# data.drop_duplicates(inplace=True)

# Feature Engineering

In [307]:
df.head()

Unnamed: 0_level_0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1241035,7,8.0,3.0,7.0,4.0,5.0,7.0,8.0,2.0,0.0
1,1107684,6,10.0,5.0,5.0,4.0,10.0,6.0,10.0,1.0,0.0
2,691628,8,6.0,4.0,10.0,10.0,1.0,3.0,5.0,1.0,0.0
3,1226612,7,5.0,6.0,3.0,3.0,8.0,7.0,4.0,1.0,0.0
4,1142706,5,10.0,10.0,10.0,6.0,10.0,6.0,5.0,2.0,0.0


# Model Selection and Optimization

In [291]:
# Splitting the dataset into features and target
X = df.iloc[:, 1:10].values
Y = df.iloc[:, -1].values

In [292]:
# Splitting the data into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, stratify=Y)

In [293]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [294]:
def models(X_train,Y_train):
  
  #Using Logistic Regression 
  from sklearn.linear_model import LogisticRegression
  log = LogisticRegression(random_state = 0)
  log.fit(X_train, Y_train)
  
  #Using KNeighborsClassifier 
  from sklearn.neighbors import KNeighborsClassifier
  knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
  knn.fit(X_train, Y_train)

  #Using SVC linear
  from sklearn.svm import SVC
  svc_lin = SVC(kernel = 'linear', random_state = 0)
  svc_lin.fit(X_train, Y_train)

  #Using SVC rbf
  from sklearn.svm import SVC
  svc_rbf = SVC(kernel = 'rbf', random_state = 0)
  svc_rbf.fit(X_train, Y_train)

  #Using GaussianNB 
  from sklearn.naive_bayes import GaussianNB
  gauss = GaussianNB()
  gauss.fit(X_train, Y_train)

  #Using DecisionTreeClassifier 
  from sklearn.tree import DecisionTreeClassifier
  tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
  tree.fit(X_train, Y_train)

  #Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm
  from sklearn.ensemble import RandomForestClassifier
  forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
  forest.fit(X_train, Y_train)
  
  #print model accuracy on the training data.
  print('[0]Logistic Regression Training Accuracy:', log.score(X_train, Y_train))
  print('[1]K Nearest Neighbor Training Accuracy:', knn.score(X_train, Y_train))
  print('[2]Support Vector Machine (Linear Classifier) Training Accuracy:', svc_lin.score(X_train, Y_train))
  print('[3]Support Vector Machine (RBF Classifier) Training Accuracy:', svc_rbf.score(X_train, Y_train))
  print('[4]Gaussian Naive Bayes Training Accuracy:', gauss.score(X_train, Y_train))
  print('[5]Decision Tree Classifier Training Accuracy:', tree.score(X_train, Y_train))
  print('[6]Random Forest Classifier Training Accuracy:', forest.score(X_train, Y_train))
  
  return log, knn, svc_lin, svc_rbf, gauss, tree, forest

In [295]:
model = models(X_train,Y_train)

[0]Logistic Regression Training Accuracy: 0.998394185260311
[1]K Nearest Neighbor Training Accuracy: 0.9986477349560514
[2]Support Vector Machine (Linear Classifier) Training Accuracy: 0.9979716024340771
[3]Support Vector Machine (RBF Classifier) Training Accuracy: 0.9984787018255578
[4]Gaussian Naive Bayes Training Accuracy: 0.9978870858688302
[5]Decision Tree Classifier Training Accuracy: 1.0
[6]Random Forest Classifier Training Accuracy: 0.9999154834347532


In [296]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

for i in range(len(model)):
  print('Model ',i)
  #Check precision, recall, f1-score
  print( classification_report(Y_test, model[i].predict(X_test)) )
  #Another way to get the models accuracy on the test data
  print( accuracy_score(Y_test, model[i].predict(X_test)))
  print()#Print a new line

Model  0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3829
         1.0       1.00      0.93      0.96       115

    accuracy                           1.00      3944
   macro avg       1.00      0.97      0.98      3944
weighted avg       1.00      1.00      1.00      3944

0.9979716024340771

Model  1
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3829
         1.0       0.99      0.93      0.96       115

    accuracy                           1.00      3944
   macro avg       0.99      0.97      0.98      3944
weighted avg       1.00      1.00      1.00      3944

0.9977180527383367

Model  2
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3829
         1.0       0.99      0.96      0.97       115

    accuracy                           1.00      3944
   macro avg       0.99      0.98      0.99      3944
weighte

## Logistic regression and feature selection

In [297]:
from sklearn.linear_model import LogisticRegression

# Specify L1 regularization
lr = LogisticRegression(penalty='l1', solver='liblinear')

In [298]:
from sklearn.model_selection import GridSearchCV

# Instantiate the GridSearchCV object and run the search
searcher = GridSearchCV(lr, {'C': [0.001, 0.01, 0.1, 1, 10]})
searcher.fit(X_train, Y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l1',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [299]:
# Report the best parameters
print("Best CV params", searcher.best_params_)

Best CV params {'C': 1}


In [300]:
# Find the number of nonzero coefficients (selected features)
best_lr = searcher.best_estimator_
coefs = best_lr.coef_
print("Total number of features:", coefs.size)
print("Number of selected features: ", np.count_nonzero(coefs))

Total number of features: 9
Number of selected features:  9


In [301]:
temp = pd.DataFrame( df.columns[1:10].to_list(), coefs.tolist())

In [302]:
temp.reset_index().sort_values(by='level_0')

Unnamed: 0,level_0,0
1,-4.314212,Uniformity of Cell Size
3,-3.259217,Marginal Adhesion
5,-2.800212,Bare Nuclei
2,-2.015684,Uniformity of Cell Shape
8,-1.469105,Mitoses
0,-1.13596,Clump Thickness
7,-0.73828,Normal Nucleoli
6,0.280426,Bland Chromatin
4,0.86609,Single Epithelial Cell Size


In [303]:
t2 = pd.DataFrame( df.columns[1:10].to_list(), coefs[0])

In [304]:
t2

Unnamed: 0,0
-1.13596,Clump Thickness
-4.314212,Uniformity of Cell Size
-2.015684,Uniformity of Cell Shape
-3.259217,Marginal Adhesion
0.86609,Single Epithelial Cell Size
-2.800212,Bare Nuclei
0.280426,Bland Chromatin
-0.73828,Normal Nucleoli
-1.469105,Mitoses


In [305]:
coefs[0]

array([-1.13595986, -4.31421161, -2.01568422, -3.25921702,  0.86609011,
       -2.80021169,  0.28042631, -0.73828027, -1.46910523])