In [90]:
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 


%matplotlib inline

In [91]:
# Read the raw comma-separated recurrence data and preview its first rows.
cancer_recurrence = pd.read_csv('cancer_recurrence.csv', sep=',')
cancer_recurrence.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,premefalse,15-19,0-2,True,3,right,left_up,False,recurrence-events
1,50-59,ge40,15-19,0-2,False,1,right,central,False,false-recurrence-events
2,50-59,ge40,35-39,0-2,False,2,left,left_low,False,recurrence-events
3,40-49,premefalse,35-39,0-2,True,3,right,left_low,True,false-recurrence-events
4,40-49,premefalse,30-34,3-5,True,2,left,right_up,False,recurrence-events


In [92]:
# Count every missing cell in the frame. `.isnull().any().sum()` would only
# count how many columns contain at least one missing value.
print('Missing values:{}'.format(cancer_recurrence.isnull().sum().sum()))
# Count rows that are exact duplicates of an earlier row.
print('\nNumber of duplicated records: {}'.format(cancer_recurrence.duplicated().sum()))

Missing values:0

Number of duplicated records: 0


In [93]:
# Review number of columns of each data type in the DataFrame.
# `DataFrame.get_dtype_counts()` was removed from pandas; counting the
# per-column dtypes gives the same summary.
cancer_recurrence.dtypes.value_counts()

int64     1
bool      2
object    7
dtype: int64

In [94]:
# (number of rows, number of columns) of the loaded frame.
cancer_recurrence.shape

(263, 10)

In [95]:
# Preview the first five rows before any column is re-encoded.
cancer_recurrence.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,premefalse,15-19,0-2,True,3,right,left_up,False,recurrence-events
1,50-59,ge40,15-19,0-2,False,1,right,central,False,false-recurrence-events
2,50-59,ge40,35-39,0-2,False,2,left,left_low,False,recurrence-events
3,40-49,premefalse,35-39,0-2,True,3,right,left_low,True,false-recurrence-events
4,40-49,premefalse,30-34,3-5,True,2,left,right_up,False,recurrence-events


In [96]:
# Replace the 'mefalsepause' category strings with integer codes.
# Values that are not keys of the mapping become NaN.
menopause_codes = {'premefalse': 1, 'ge40': 0, 'lt40': 3}
cancer_recurrence['mefalsepause'] = cancer_recurrence['mefalsepause'].map(menopause_codes)

In [97]:
# Show up to the first 100 rows to check the 'mefalsepause' encoding.
cancer_recurrence.head(100)

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,1,15-19,0-2,True,3,right,left_up,False,recurrence-events
1,50-59,0,15-19,0-2,False,1,right,central,False,false-recurrence-events
2,50-59,0,35-39,0-2,False,2,left,left_low,False,recurrence-events
3,40-49,1,35-39,0-2,True,3,right,left_low,True,false-recurrence-events
4,40-49,1,30-34,3-5,True,2,left,right_up,False,recurrence-events
5,50-59,1,25-29,3-5,False,2,right,left_up,True,false-recurrence-events
6,50-59,0,40-44,0-2,False,3,left,left_up,False,false-recurrence-events
7,40-49,1,10-14,0-2,False,2,left,left_up,False,false-recurrence-events
8,40-49,1,0-4,0-2,False,2,right,right_low,False,false-recurrence-events
9,40-49,0,40-44,15-17,True,2,right,left_up,True,false-recurrence-events


In [98]:
# Recode the boolean 'falsede-caps' flag as integers: True -> 2, False -> 3.
caps_codes = {True: 2, False: 3}
cancer_recurrence['falsede-caps'] = cancer_recurrence['falsede-caps'].map(caps_codes)
cancer_recurrence.head(10)

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,1,15-19,0-2,2,3,right,left_up,False,recurrence-events
1,50-59,0,15-19,0-2,3,1,right,central,False,false-recurrence-events
2,50-59,0,35-39,0-2,3,2,left,left_low,False,recurrence-events
3,40-49,1,35-39,0-2,2,3,right,left_low,True,false-recurrence-events
4,40-49,1,30-34,3-5,2,2,left,right_up,False,recurrence-events
5,50-59,1,25-29,3-5,3,2,right,left_up,True,false-recurrence-events
6,50-59,0,40-44,0-2,3,3,left,left_up,False,false-recurrence-events
7,40-49,1,10-14,0-2,3,2,left,left_up,False,false-recurrence-events
8,40-49,1,0-4,0-2,3,2,right,right_low,False,false-recurrence-events
9,40-49,0,40-44,15-17,2,2,right,left_up,True,false-recurrence-events


In [99]:
# Recode the 'breast' side strings as integers: 'right' -> 4, 'left' -> 6.
side_codes = {'right': 4, 'left': 6}
cancer_recurrence['breast'] = cancer_recurrence['breast'].map(side_codes)
cancer_recurrence.head(10)

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,1,15-19,0-2,2,3,4,left_up,False,recurrence-events
1,50-59,0,15-19,0-2,3,1,4,central,False,false-recurrence-events
2,50-59,0,35-39,0-2,3,2,6,left_low,False,recurrence-events
3,40-49,1,35-39,0-2,2,3,4,left_low,True,false-recurrence-events
4,40-49,1,30-34,3-5,2,2,6,right_up,False,recurrence-events
5,50-59,1,25-29,3-5,3,2,4,left_up,True,false-recurrence-events
6,50-59,0,40-44,0-2,3,3,6,left_up,False,false-recurrence-events
7,40-49,1,10-14,0-2,3,2,6,left_up,False,false-recurrence-events
8,40-49,1,0-4,0-2,3,2,4,right_low,False,false-recurrence-events
9,40-49,0,40-44,15-17,2,2,4,left_up,True,false-recurrence-events


In [100]:
# Recode the 'breast-quad' location strings as integer codes.
# Any value not listed below is mapped to NaN.
quadrant_codes = {
    'left_up': 7,
    'left_low': 1,
    'right_up': 9,
    'right_low': 3,
    'central': 5,
}
cancer_recurrence['breast-quad'] = cancer_recurrence['breast-quad'].map(quadrant_codes)
cancer_recurrence.head(10)

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,1,15-19,0-2,2,3,4,7,False,recurrence-events
1,50-59,0,15-19,0-2,3,1,4,5,False,false-recurrence-events
2,50-59,0,35-39,0-2,3,2,6,1,False,recurrence-events
3,40-49,1,35-39,0-2,2,3,4,1,True,false-recurrence-events
4,40-49,1,30-34,3-5,2,2,6,9,False,recurrence-events
5,50-59,1,25-29,3-5,3,2,4,7,True,false-recurrence-events
6,50-59,0,40-44,0-2,3,3,6,7,False,false-recurrence-events
7,40-49,1,10-14,0-2,3,2,6,7,False,false-recurrence-events
8,40-49,1,0-4,0-2,3,2,4,3,False,false-recurrence-events
9,40-49,0,40-44,15-17,2,2,4,7,True,false-recurrence-events


In [101]:
# Recode the boolean 'irradiat' flag as integers: True -> 2, False -> 3.
irradiat_codes = {True: 2, False: 3}
cancer_recurrence['irradiat'] = cancer_recurrence['irradiat'].map(irradiat_codes)
cancer_recurrence.head(10)

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,1,15-19,0-2,2,3,4,7,3,recurrence-events
1,50-59,0,15-19,0-2,3,1,4,5,3,false-recurrence-events
2,50-59,0,35-39,0-2,3,2,6,1,3,recurrence-events
3,40-49,1,35-39,0-2,2,3,4,1,2,false-recurrence-events
4,40-49,1,30-34,3-5,2,2,6,9,3,recurrence-events
5,50-59,1,25-29,3-5,3,2,4,7,2,false-recurrence-events
6,50-59,0,40-44,0-2,3,3,6,7,3,false-recurrence-events
7,40-49,1,10-14,0-2,3,2,6,7,3,false-recurrence-events
8,40-49,1,0-4,0-2,3,2,4,3,3,false-recurrence-events
9,40-49,0,40-44,15-17,2,2,4,7,2,false-recurrence-events


In [102]:
# Recode the target label strings as integers:
# 'recurrence-events' -> 4, 'false-recurrence-events' -> 6.
class_codes = {
    'recurrence-events': 4,
    'false-recurrence-events': 6,
}
cancer_recurrence['class'] = cancer_recurrence['class'].map(class_codes)
cancer_recurrence.head(10)

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,1,15-19,0-2,2,3,4,7,3,4
1,50-59,0,15-19,0-2,3,1,4,5,3,6
2,50-59,0,35-39,0-2,3,2,6,1,3,4
3,40-49,1,35-39,0-2,2,3,4,1,2,6
4,40-49,1,30-34,3-5,2,2,6,9,3,4
5,50-59,1,25-29,3-5,3,2,4,7,2,6
6,50-59,0,40-44,0-2,3,3,6,7,3,6
7,40-49,1,10-14,0-2,3,2,6,7,3,6
8,40-49,1,0-4,0-2,3,2,4,3,3,6
9,40-49,0,40-44,15-17,2,2,4,7,2,6


In [103]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# All nine predictor columns; 'class' is the target.
# NOTE(review): at this point 'age', 'tumor-size' and 'inv-falsedes' have not
# been converted to numbers yet, and this cell only splits — it fits nothing.
all_feature_columns = [
    'age', 'mefalsepause', 'tumor-size', 'inv-falsedes', 'falsede-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiat',
]

svm = SVC()
X = cancer_recurrence[all_feature_columns]
y = cancer_recurrence['class']

# Default train/test proportions with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [104]:
# Report how many labelled rows ended up in each part of the split and the
# percentage of the whole that each part represents.
total_rows = y.count()
train_rows = y_train.count()
test_rows = y_test.count()

print("* The number of the rows in the data features:", total_rows)
print("* After splitting the data into 75% for tranning and 25% for testing ")
print("* The number of the rows in the train data features:", train_rows, "Which are about ", (train_rows / total_rows) * 100)
print("* The number of the rows in the test data features:", test_rows, "Which are about ", (test_rows / total_rows) * 100)

* The number of the rows in the data features: 263
* After splitting the data into 75% for tranning and 25% for testing 
* The number of the rows in the train data features: 197 Which are about  74.90494296577947
* The number of the rows in the test data features: 66 Which are about  25.09505703422053


In [105]:
# Review number of columns of each data type after the encodings.
# `DataFrame.get_dtype_counts()` was removed from pandas, so count the
# per-column dtypes instead. It is printed explicitly because only the last
# expression of a cell is displayed automatically.
print(cancer_recurrence.dtypes.value_counts())
# Preview the first five rows of the encoded frame.
cancer_recurrence.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,1,15-19,0-2,2,3,4,7,3,4
1,50-59,0,15-19,0-2,3,1,4,5,3,6
2,50-59,0,35-39,0-2,3,2,6,1,3,4
3,40-49,1,35-39,0-2,2,3,4,1,2,6
4,40-49,1,30-34,3-5,2,2,6,9,3,4


In [106]:
# Map each ten-year age bracket label ('20-29' ... '80-89') to the midpoint
# of its two bounds, e.g. '40-49' -> 44.5.
agecodes = {
    '{}-{}'.format(low, low + 9): (low + (low + 9)) / 2
    for low in range(20, 90, 10)
}
cancer_recurrence['age'] = cancer_recurrence['age'].map(agecodes)

In [107]:
# Check that 'age' now holds bracket midpoints.
cancer_recurrence.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,44.5,1,15-19,0-2,2,3,4,7,3,4
1,54.5,0,15-19,0-2,3,1,4,5,3,6
2,54.5,0,35-39,0-2,3,2,6,1,3,4
3,44.5,1,35-39,0-2,2,3,4,1,2,6
4,44.5,1,30-34,3-5,2,2,6,9,3,4


In [108]:
# Map each three-wide 'inv-falsedes' range label to the midpoint of its bounds,
# e.g. '3-5' -> 4.0. Labels that are not generated here are mapped to NaN.
invfalsedes = {
    '{}-{}'.format(low, low + 2): (low + (low + 2)) / 2
    for low in (0, 3, 24, 9, 12, 15, 6)
}
cancer_recurrence['inv-falsedes'] = cancer_recurrence['inv-falsedes'].map(invfalsedes)

In [109]:
# Check that 'inv-falsedes' now holds range midpoints.
cancer_recurrence.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,44.5,1,15-19,1.0,2,3,4,7,3,4
1,54.5,0,15-19,1.0,3,1,4,5,3,6
2,54.5,0,35-39,1.0,3,2,6,1,3,4
3,44.5,1,35-39,1.0,2,3,4,1,2,6
4,44.5,1,30-34,4.0,2,2,6,9,3,4


In [110]:
# Map each 'tumor-size' range label to the midpoint of its bounds.
# The reversed and date-like spellings are all treated as the 10-14 range.
# NOTE(review): labels not listed here become NaN — confirm every range that
# occurs in the data is covered.
tumorsize = {
    '15-19': 17.0,
    '35-39': 37.0,
    '30-34': 32.0,
    '25-29': 27.0,
    '20-24': 22.0,
    '0-4': 2.0,
    '40-44': 42.0,
    '10-14': 12.0,
    '5-9': 7.0,
    '14-10': 12.0,
    '10/14/2018': 12.0,
    '14/10/2018': 12.0,
    '2018/10/14': 12.0,
    '2018/14/10': 12.0,
}
cancer_recurrence['tumor-size'] = cancer_recurrence['tumor-size'].map(tumorsize)

In [111]:
# Check that 'tumor-size' now holds range midpoints.
cancer_recurrence.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,44.5,1,17.0,1.0,2,3,4,7,3,4
1,54.5,0,17.0,1.0,3,1,4,5,3,6
2,54.5,0,37.0,1.0,3,2,6,1,3,4
3,44.5,1,37.0,1.0,2,3,4,1,2,6
4,44.5,1,32.0,4.0,2,2,6,9,3,4


In [112]:
# Boolean mask marking rows whose 'tumor-size' is NaN after the mapping;
# display the first 100 flags.
Nan_number = cancer_recurrence['tumor-size'].isnull()
Nan_number.head(100)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
      ...  
70    False
71    False
72    False
73    False
74    False
75    False
76    False
77    False
78    False
79    False
80    False
81    False
82    False
83    False
84    False
85     True
86    False
87    False
88    False
89    False
90    False
91    False
92    False
93    False
94    False
95    False
96    False
97    False
98    False
99    False
Name: tumor-size, Length: 100, dtype: bool

In [113]:
# Discard every row that contains at least one NaN in any column.
cancer_recurrence = cancer_recurrence.dropna(axis=0, how='any')

In [114]:
# Preview the frame after rows with missing values were dropped.
cancer_recurrence.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,44.5,1,17.0,1.0,2,3,4,7,3,4
1,54.5,0,17.0,1.0,3,1,4,5,3,6
2,54.5,0,37.0,1.0,3,2,6,1,3,4
3,44.5,1,37.0,1.0,2,3,4,1,2,6
4,44.5,1,32.0,4.0,2,2,6,9,3,4


In [115]:
# Persist the cleaned frame as UTF-8 CSV without writing the row index.
cancer_recurrence.to_csv('Cleanning recurrence.csv', sep=',', index=False, encoding='utf-8')

In [116]:
# Reload the cleaned data from the CSV written by `to_csv`.
Cleanning_cancer_recurrence = pd.read_csv('Cleanning recurrence.csv', sep=',')

In [117]:
# Preview the first five rows of the reloaded, cleaned frame.
Cleanning_cancer_recurrence.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,44.5,1,17.0,1.0,2,3,4,7,3,4
1,54.5,0,17.0,1.0,3,1,4,5,3,6
2,54.5,0,37.0,1.0,3,2,6,1,3,4
3,44.5,1,37.0,1.0,2,3,4,1,2,6
4,44.5,1,32.0,4.0,2,2,6,9,3,4


### Logistic Regression 

In [118]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot  as plt

%matplotlib inline


X=Cleanning_cancer_recurrence[['age','tumor-size','inv-falsedes','falsede-caps','deg-malig','breast','breast-quad','irradiat']]

y=Cleanning_cancer_recurrence['class']

X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.25, random_state=0)

log_reg=LogisticRegression()
log_reg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [119]:
# Mean accuracy of the default logistic regression on both parts of the split.
logreg_train_accuracy = log_reg.score(X_train, y_train)
logreg_test_accuracy = log_reg.score(X_test, y_test)
print('Accuracy on the training subset: {:.3f}'.format(logreg_train_accuracy))
print('Accuracy on the test subset: {:.3f}'.format(logreg_test_accuracy))

Accuracy on the training subset: 0.767
Accuracy on the test subset: 0.746


In [120]:
# Refit with C=15 (a larger C means weaker regularisation in scikit-learn).
log_reg100 = LogisticRegression(C=15)
log_reg100.fit(X_train, y_train)

# Score the model that was just fitted. Scoring `log_reg` here would only
# repeat the accuracies of the default C=1.0 model.
print('Accuracy on the training subset: {:.3f}'.format(log_reg100.score(X_train, y_train)))
print('Accuracy on the test subset: {:.3f}'.format(log_reg100.score(X_test, y_test)))

Accuracy on the training subset: 0.767
Accuracy on the test subset: 0.746


In [121]:
# Refit with C=0.02 (a smaller C means stronger regularisation in scikit-learn).
log_reg001 = LogisticRegression(C=0.02)
log_reg001.fit(X_train, y_train)

# The second line reports the held-out score, so it is labelled as the test
# set rather than the training set.
print('Accuracy of LogisticRegression , on the trainning set:{:.3f}'.format(log_reg001.score(X_train, y_train)))
print('Accuracy of LogisticRegression , on the test set:{:.3f}'.format(log_reg001.score(X_test, y_test)))

Accuracy of LogisticRegression , on the trainning set:0.725
Accuracy of LogisticRegression , on the trainning set:0.746


In [122]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with the library default of 5 neighbours,
# fitted on the current training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [123]:
# Report the 5-neighbour model's mean accuracy on each part of the split.
for template, features, labels in (
    ('Accuracy of KNN n-5, on the trainning set:{:.3f}', X_train, y_train),
    ('Accuracy of KNN n-5, on the testting set:{:.3f}', X_test, y_test),
):
    print(template.format(knn.score(features, labels)))

Accuracy of KNN n-5, on the trainning set:0.762
Accuracy of KNN n-5, on the testting set:0.714


In [124]:
# Candidate neighbour counts: 1 through 10 inclusive.
neighbors_settings = range(1, 11)

# One accuracy per candidate, in the same order as `neighbors_settings`.
training_accuracy, test_accuracy = [], []

# Fit a fresh classifier for each candidate and record how it scores on both
# parts of the split. `clf` is left bound to the last (10-neighbour) model.
for n_neighbors in neighbors_settings:
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_train, y_train)
    training_accuracy += [clf.score(X_train, y_train)]
    test_accuracy += [clf.score(X_test, y_test)]

In [125]:
# NOTE(review): this re-scores the default 5-neighbour `knn` model; the
# per-k values collected in `training_accuracy` / `test_accuracy` are not
# reported here.
knn_train_accuracy = knn.score(X_train, y_train)
knn_test_accuracy = knn.score(X_test, y_test)
print('Accuracy of KNN n-5, on the trainning set:{:.3f}'.format(knn_train_accuracy))
print('Accuracy of KNN n-5, on the testting set:{:.3f}'.format(knn_test_accuracy))

Accuracy of KNN n-5, on the trainning set:0.762
Accuracy of KNN n-5, on the testting set:0.714


In [126]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# The SVM uses all nine predictors, including 'mefalsepause'.
svm_features = [
    'age', 'mefalsepause', 'tumor-size', 'inv-falsedes', 'falsede-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiat',
]

svm = SVC()
X = Cleanning_cancer_recurrence[svm_features]
y = Cleanning_cancer_recurrence['class']

# Default train/test proportions with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [127]:
# Fit the default SVC on the training features exactly as split (no scaling applied yet).
svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [128]:
# Mean accuracy of the SVC fitted on unscaled features.
svm_raw_train_accuracy = svm.score(X_train, y_train)
svm_raw_test_accuracy = svm.score(X_test, y_test)
print('The accuracy on the training subset: {:.3f}'.format(svm_raw_train_accuracy))
print('The accuracy on the test subset: {:.3f}'.format(svm_raw_test_accuracy))

The accuracy on the training subset: 0.878
The accuracy on the test subset: 0.714


In [129]:
# Manual min-max scaling using statistics of the training split only:
# subtract each column's minimum, then divide by the column's resulting range.
min_train = X_train.min(axis=0)
shifted_train = X_train - min_train
range_train = shifted_train.max(axis=0)

X_train_scaled = shifted_train / range_train

In [130]:
# Scale the test split with the training minimum and range.
shifted_test = X_test - min_train
X_test_scaled = shifted_test / range_train

# Refit a default SVC, this time on the scaled training features.
svm = SVC()
svm.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [131]:
# Mean accuracy of the default SVC on the scaled training and test features.
for template, features, labels in (
    ('The accuracy on the training subset: {:.3f}', X_train_scaled, y_train),
    ('The accuracy on the test subset: {:.3f}', X_test_scaled, y_test),
):
    print(template.format(svm.score(features, labels)))

The accuracy on the training subset: 0.735
The accuracy on the test subset: 0.762


In [132]:
# Recompute the training-split minimum and range, then min-max scale both
# splits with those training statistics.
min_train = X_train.min(axis=0)
range_train = (X_train - min_train).max(axis=0)

X_train_scaled, X_test_scaled = (
    (split - min_train) / range_train for split in (X_train, X_test)
)

# Default SVC fitted on the scaled training features.
svm = SVC()
svm.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [133]:
# Refit on the scaled features with regularisation parameter C=14.
# C is passed by keyword: current scikit-learn releases accept estimator
# parameters as keyword arguments only, so `SVC(14)` raises a TypeError there.
svm = SVC(C=14)
svm = svm.fit(X_train_scaled, y_train)

print('The accuracy on the training subset: {:.3f}'.format(svm.score(X_train_scaled, y_train)))
print('The accuracy on the test subset: {:.3f}'.format(svm.score(X_test_scaled, y_test)))

The accuracy on the training subset: 0.794
The accuracy on the test subset: 0.794


In [134]:
# `sklearn.externals.joblib` has been removed from scikit-learn; use the
# standalone `joblib` package (installed as a scikit-learn dependency) and
# fall back to the vendored copy only on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [135]:
# Persist the final SVM. `clf` is only the last classifier left over from the
# KNN neighbour sweep (n_neighbors=10), not the model evaluated last.
# The SVM was fitted on min-max scaled features, so new inputs must be scaled
# with the same `min_train` / `range_train` before calling predict.
joblib.dump(svm, 'recurrencemodel')

['recurrencemodel']