In [1]:
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 


%matplotlib inline

##### Source: https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)

##### Creator: 
- Dr. William H. Wolberg, General Surgery Dept. 
 University of Wisconsin Hospitals 
 Madison, Wisconsin, USA. 
 * wolberg '@' eagle.surgery.wisc.edu 

##### Donor: 
- Olvi Mangasarian (mangasarian '@' cs.wisc.edu) 
- Received by David W. Aha (aha '@' cs.jhu.edu)

###### Attribute Information:
1. Sample code number: id number 
2. Clump Thickness: 1 - 10 
3. Uniformity of Cell Size: 1 - 10 
4. Uniformity of Cell Shape: 1 - 10 
5. Marginal Adhesion: 1 - 10 
6. Single Epithelial Cell Size: 1 - 10 
7. Bare Nuclei: 1 - 10 
8. Bland Chromatin: 1 - 10 
9. Normal Nucleoli: 1 - 10 
10. Mitoses: 1 - 10 
11. Class: (2 for benign, 4 for malignant)

Date Donated
1992-07-15

In [2]:
cancer=pd.read_csv('Cleaning-breast-cancer-wisconsin - breast-cancer-wisconsin.csv', delimiter=',')
cancer.head(6)

Unnamed: 0,id,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4


In [3]:
cancer.drop(['id'], axis=1, inplace=True)
cancer.head(5)

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [4]:
# Review number of columns of each data type in a DataFrame:
cancer.get_dtype_counts()

int64    10
dtype: int64

In [5]:
# Find missing values
print('Missing values:{}'.format(cancer['bare_nucleoli'].isnull().any().sum()))
# Find duplicated records
print('\nNumber of duplicated records: {}'.format(cancer.duplicated().sum()))

# Find the unique values of 'diagnosis'.
print('\nUnique values of "diagnosis": {}'.format(cancer['class'].unique()))

Missing values:0

Number of duplicated records: 234

Unique values of "diagnosis": [2 4]


In [6]:
cancer.shape

(683, 10)

In [7]:
# Review number of columns of each data type in a DataFrame:
cancer.get_dtype_counts()

int64    10
dtype: int64

In [8]:
cancer.to_csv('Cleanning Samples wisconsin dataset.csv', encoding='utf-8',sep=',',index=False)

In [9]:
cancer=pd.read_csv('Cleanning Samples wisconsin dataset.csv', delimiter=',')

In [10]:
cancer.head()

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [11]:
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
svm = SVC()
#X=cancer[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bland_chromatin','normal_nucleoli','mitoses']]
X=cancer[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bare_nucleoli','bland_chromatin','normal_nucleoli','mitoses']]

y=cancer['class']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)
print("* The number of the rows in the data features:",y.count())
print("* After splitting the data into 75% for tranning and 25% for testing ")
print("* The number of the rows in the train data features:",y_train.count(),"Which are about ",(y_train.count()/y.count())*100)
print("* The number of the rows in the test data features:",y_test.count(),"Which are about ",(y_test.count()/y.count())*100)

* The number of the rows in the data features: 683
* After splitting the data into 75% for tranning and 25% for testing 
* The number of the rows in the train data features: 512 Which are about  74.96339677891655
* The number of the rows in the test data features: 171 Which are about  25.036603221083453


In [12]:
svm.fit(X_train, y_train)
print('The accuracy on the training subset: {:.3f}'.format(svm.score(X_train, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(svm.score(X_test, y_test)))

The accuracy on the training subset: 0.998
The accuracy on the test subset: 0.953


In [13]:
min_train = X_train.min(axis=0)
range_train = (X_train - min_train).max(axis=0)

X_train_scaled = (X_train - min_train)/range_train

In [14]:
X_test_scaled = (X_test - min_train)/range_train

svm = SVC()
svm.fit(X_train_scaled, y_train)

print('The accuracy on the training subset: {:.3f}'.format(svm.score(X_train_scaled, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(svm.score(X_test_scaled, y_test)))

The accuracy on the training subset: 0.979
The accuracy on the test subset: 0.942


In [15]:
svm = SVC(C=130)
svm.fit(X_train_scaled, y_train)

print('The accuracy on the training subset: {:.3f}'.format(svm.score(X_train_scaled, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(svm.score(X_test_scaled, y_test)))

The accuracy on the training subset: 0.982
The accuracy on the test subset: 0.959


In [16]:
cancer=pd.read_csv('Cleanning Samples wisconsin dataset.csv', delimiter=',')

X=cancer[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bare_nucleoli','bland_chromatin','normal_nucleoli','mitoses']]

y=cancer['class']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)
min_train = X_train.min(axis=0)
range_train = (X_train - min_train).max(axis=0)

X_train_scaled = (X_train - min_train)/range_train

X_test_scaled = (X_test - min_train)/range_train

svm = SVC(C=130)
svm.fit(X_train_scaled, y_train)

print('The accuracy on the training subset: {:.3f}'.format(svm.score(X_train_scaled, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(svm.score(X_test_scaled, y_test)))


The accuracy on the training subset: 0.982
The accuracy on the test subset: 0.959


## Uncertanity estimation

In [17]:
# The decision Function

print('the Decision function is:\n\n{}'.format(svm.decision_function(X_test_scaled[:30])))

the Decision function is:

[-2.00233884 -2.75603302  1.25620553  1.39897719 -3.24107141 -2.64802067
 -2.05952383  2.32573949 -2.50220417 -3.19793048  1.84886531 -2.24216814
  1.65810704 -2.75603302 -2.21319816 -0.20284239  1.68030804  1.19115649
  2.3634972  -1.70033578 -2.75603302 -2.50795642  0.63660602 -2.79190341
  2.27675734  1.53928723 -2.10607782 -3.03953642 -2.22523521  1.87138718]


In [19]:
x={'clump_thickness':1,'size_uniformity':2,'shape_uniformity':3,'marginal_adhesion':4 ,'epithelial_size':5}
import numpy as np
prediction=svm.predict(np.array([[1,2,3,4,5,6,7,8,9]]))
for i in range(1):
    if(prediction[i]==2):
        print("The tumor is begnin")     

    elif(prediction[i]==4):
        print("The tumor is malignant") 
print('the Decision function estimation is:{}'.format(svm.decision_function(X_test_scaled[:1])))

The tumor is malignant
the Decision function estimation is:[-2.00233884]


In [20]:
x=np.array([[1,2,3,4,5,6,7,8]])
x

array([[1, 2, 3, 4, 5, 6, 7, 8]])

In [21]:
cancer=pd.read_csv('Cleaning-breast-cancer-wisconsin - breast-cancer-wisconsin.csv', delimiter=',')
cancer.head(6)

Unnamed: 0,id,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4


In [22]:
cancer.drop(['id'], axis=1, inplace=True)
cancer.head(5)

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [23]:
# Review number of columns of each data type in a DataFrame:
cancer.get_dtype_counts()

int64    10
dtype: int64

In [24]:
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
svm = SVC()
#X=cancer[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bland_chromatin','normal_nucleoli','mitoses']]
X=cancer[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bare_nucleoli','bland_chromatin','normal_nucleoli','mitoses']]

y=cancer['class']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)
print("* The number of the rows in the data features:",y.count())
print("* After splitting the data into 75% for tranning and 25% for testing ")
print("* The number of the rows in the train data features:",y_train.count(),"Which are about ",(y_train.count()/y.count())*100)
print("* The number of the rows in the test data features:",y_test.count(),"Which are about ",(y_test.count()/y.count())*100)

* The number of the rows in the data features: 683
* After splitting the data into 75% for tranning and 25% for testing 
* The number of the rows in the train data features: 512 Which are about  74.96339677891655
* The number of the rows in the test data features: 171 Which are about  25.036603221083453


In [25]:
svm.fit(X_train, y_train)
print('The accuracy on the training subset: {:.3f}'.format(svm.score(X_train, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(svm.score(X_test, y_test)))

The accuracy on the training subset: 0.998
The accuracy on the test subset: 0.953


In [26]:
min_train = X_train.min(axis=0)
range_train = (X_train - min_train).max(axis=0)

X_train_scaled = (X_train - min_train)/range_train

In [27]:
X_test_scaled = (X_test - min_train)/range_train

svm = SVC()
svm.fit(X_train_scaled, y_train)

print('The accuracy on the training subset: {:.3f}'.format(svm.score(X_train_scaled, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(svm.score(X_test_scaled, y_test)))

The accuracy on the training subset: 0.979
The accuracy on the test subset: 0.942


In [28]:
svm = SVC(C=100)
svm.fit(X_train_scaled, y_train)

print('The accuracy on the training subset: {:.3f}'.format(svm.score(X_train_scaled, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(svm.score(X_test_scaled, y_test)))

The accuracy on the training subset: 0.982
The accuracy on the test subset: 0.959


In [29]:
cancer=pd.read_csv('Cleaning-breast-cancer-wisconsin - breast-cancer-wisconsin.csv', delimiter=',')
cancer.drop(['id'], axis=1, inplace=True)


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)
X=cancer[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bare_nucleoli','bland_chromatin','normal_nucleoli','mitoses']]

y=cancer['class']

In [31]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [32]:
print('The accuracy on the training subset: {:.3f}'.format(clf.score(X_train, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(clf.score(X_test,y_test)))

The accuracy on the training subset: 1.000
The accuracy on the test subset: 0.953


In [43]:
cancer=pd.read_csv('Cleaning-breast-cancer-wisconsin - breast-cancer-wisconsin.csv', delimiter=',')
cancer.drop(['id'], axis=1, inplace=True)
cancer.head()

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [44]:
cancer.to_csv('prepared_sample_dataest.csv', encoding='utf-8',sep=',',index=False)

In [45]:
sample_dataset=pd.read_csv('prepared_sample_dataest.csv', delimiter=',')


In [46]:
sample_dataset.head()

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [47]:
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
svm = SVC()
X=cancer[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bare_nucleoli','bland_chromatin','normal_nucleoli','mitoses']]

y=cancer['class']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

In [48]:
min_train = X_train.min(axis=0)
range_train = (X_train - min_train).max(axis=0)

X_train_scaled = (X_train - min_train)/range_train

In [49]:
svm = SVC(C=100)
svm.fit(X_train_scaled, y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [50]:
print('The accuracy on the training subset: {:.3f}'.format(svm.score(X_train_scaled, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(svm.score(X_test_scaled, y_test)))

The accuracy on the training subset: 0.982
The accuracy on the test subset: 0.959


In [91]:
'''KNN'''
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
sample_svm=pd.read_csv('prepared_sample_dataest.csv', delimiter=',')
X=sample_svm[['clump_thickness','size_uniformity','shape_uniformity','marginal_adhesion','epithelial_size','bare_nucleoli','bland_chromatin','normal_nucleoli','mitoses']]
y=sample_svm['class']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0) 
knn=knn.fit(X_train,y_train)
KnnTrainPrediction=knn.score(X_train,y_train)
KnnTestPrediction=knn.score(X_test,y_test)
print('Accuracy on the training subset: {:.3f}'.format(KnnTrainPrediction))
print('Accuracy on the test subset: {:.3f}'.format(KnnTestPrediction))
'''KNN'''

Accuracy on the training subset: 0.979
Accuracy on the test subset: 0.953


'KNN'

In [62]:
sample_dataset=pd.read_csv('prepared_sample_dataest.csv', delimiter=',')
sample_dataset.head()

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [63]:
min_X = X.min(axis=0)
X_range = (X - min_X).max(axis=0)

X = (X - min_X)/X_range

In [64]:
svm = SVC(C=100)
clf=svm.fit(X,y)

In [65]:
from sklearn.externals import joblib

In [66]:
joblib.dump(svm,'samplemodel')

['samplemodel']

In [67]:
clf=joblib.load('samplemodel')

In [69]:
data=clf.predict([[1,2,3,4,5,6,7,8,9]])
for i in range(1):
    if(prediction[i]==2):
        print("The tumor is begnin")     

    elif(prediction[i]==4):
        print("The tumor is malignant") 

The tumor is malignant
