# Cancer Prediction

In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

import matplotlib.pyplot as plt
from matplotlib import figure

%matplotlib inline

This is what the data represents:

1. Sample code number: id number 
2. Clump Thickness: 1 - 10 
3. Uniformity of Cell Size: 1 - 10 
4. Uniformity of Cell Shape: 1 - 10 
5. Marginal Adhesion: 1 - 10 
6. Single Epithelial Cell Size: 1 - 10 
7. Bare Nuclei: 1 - 10 
8. Bland Chromatin: 1 - 10 
9. Normal Nucleoli: 1 - 10 
10. Mitoses: 1 - 10 
11. Class: (2 for benign, 4 for malignant)

In [32]:
df = pd.read_csv('C:\\users\\kim\\downloads\\cancer.csv',delimiter=',')
df.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [33]:
columns=['id','thickness','cell_size','cell_shape','adhesion','eithelial_cell_size',\
         'bare_nuclei','chromatin','normal_nuclei','mitoses','class']
df.columns = columns

In [34]:
df.head()

Unnamed: 0,id,thickness,cell_size,cell_shape,adhesion,eithelial_cell_size,bare_nuclei,chromatin,normal_nuclei,mitoses,class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [35]:
df.columns

Index(['id', 'thickness', 'cell_size', 'cell_shape', 'adhesion',
       'eithelial_cell_size', 'bare_nuclei', 'chromatin', 'normal_nuclei',
       'mitoses', 'class'],
      dtype='object')

In [36]:
df['class'] = np.where(df['class']>3, 1, 0)

In [37]:
df.replace('?',0,inplace=True)

In [38]:
features = df.copy()
del features['id']

In [39]:
label = df['class']

In [40]:
X = features.as_matrix()
Y = label.as_matrix()

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [42]:
log = LogisticRegression()
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
log.score(X_test,y_test)

1.0

In [44]:
log.score(X_train, y_train)

1.0

In [46]:
log.score(X,Y)

1.0

In [47]:
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [48]:
svc.score(X_test, y_test)

0.9642857142857143

In [50]:
kbest = SelectKBest(score_func=chi2)

In [52]:
kbest.fit_transform(features, df['class'])

array([[  5.,   4.,   4., ...,   2.,   1.,   0.],
       [  3.,   1.,   1., ...,   1.,   1.,   0.],
       [  6.,   8.,   8., ...,   7.,   1.,   0.],
       ..., 
       [  5.,  10.,  10., ...,  10.,   2.,   1.],
       [  4.,   8.,   6., ...,   6.,   1.,   1.],
       [  4.,   8.,   8., ...,   4.,   1.,   1.]])

In [53]:
records = list(zip(kbest.scores_.tolist(),features.columns.tolist() ))
feature_scores = pd.DataFrame.from_records(records)
feature_scores.columns = ["Score", "feature"]
feature_scores.sort_values(by="Score", ascending=False).head()

Unnamed: 0,Score,feature
5,1782.316295,bare_nuclei
1,1384.333895,cell_size
2,1286.312405,cell_shape
7,1149.404078,normal_nuclei
3,982.390545,adhesion
