In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Machine learning packages
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix

# About the Data

Our dataset for this exercise was obtained from the UCI Machine Learning Repository, and we will be evaluating a dataset with information about breast cancer in Madison, Wisconsin. Our dataset contains a list of 699 patients and a somewhat condensed dataset of columns detailing the patients' 
1. Sample code number: id number 
2. Clump Thickness: 1 - 10 
3. Uniformity of Cell Size: 1 - 10 
4. Uniformity of Cell Shape: 1 - 10 
5. Marginal Adhesion: 1 - 10 
6. Single Epithelial Cell Size: 1 - 10 
7. Bare Nuclei: 1 - 10 
8. Bland Chromatin: 1 - 10 
9. Normal Nucleoli: 1 - 10 
10. Mitoses: 1 - 10 
11. Class: (2 for benign, 4 for malignant)

In [2]:
# Importing our data
data = pd.read_csv('Data\\breast-cancer-wisconsin.data', delimiter=',', header =None, encoding ='latin8')
data.columns = ['id_number', 'clump_thickness', 'uniformity_cell_size', 'uniformity_cell_shape', 'marginal_adhesion',
             'single_epithetial_cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']

# Removes none-numeric entries from bare_nuclei
data = data[data.bare_nuclei.apply(lambda x: x.isnumeric())]
data['bare_nuclei'] = data['bare_nuclei'].astype(np.int64)

In [3]:
# Prints the size of our dataset
data.shape

(683, 11)

In [4]:
data = data.dropna()

In [22]:
data.isnull().sum()

id_number                      0
clump_thickness                0
uniformity_cell_size           0
uniformity_cell_shape          0
marginal_adhesion              0
single_epithetial_cell_size    0
bare_nuclei                    0
bland_chromatin                0
normal_nucleoli                0
mitoses                        0
class                          0
dtype: int64

In [23]:
Y = np.where(data['class'] > 30, 1, 0)
X = data.drop(data['class'])

In [24]:
X.shape

(681, 11)

In [25]:
Y.shape

(683,)

In [7]:
data.describe()

Unnamed: 0,id_number,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithetial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,1076720.0,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,620644.0,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,63375.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,877617.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,1171795.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,1238705.0,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [8]:
# Split data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [9]:
# Constructing our logistic regression model
lr = LogisticRegression(C = 1e9)

# Checking the accuracy of our model using cross validation
cross_val_score(lr, X_train, y_train, cv=5)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [None]:
# Print our data types
data.dtypes

In [None]:
# Print our data types
data.dtypes