# Support Vector Machine with Cancer data

## Agenda 
            Quick Exploration
        1 - Training Model
            Training and test subsets Short Exemple - just see what is happening 
        2 - Implementing a SVM
        3 - Predictions vs Actual Values

In [61]:
#Machine Learning
import sklearn
from sklearn import datasets
from sklearn import svm
from sklearn import metrics

**This time we'll be using a dataset that sklearn provides us**

In [15]:
cancer = datasets.load_breast_cancer()

print('Features:\n', cancer.feature_names)

Features:
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [12]:
print('Labels: ', cancer.target_names)

array(['malignant', 'benign'], dtype='<U9')

## Quick Exploration

In [29]:
''' It's quite big to print here, but after analyzing I think it's a dictionary or some type similar '''
# cancer

In [26]:
for key in cancer:
    print(key)

data
target
frame
target_names
DESCR
feature_names
filename


## 1 - Training Model

In [35]:
features_X = cancer.data 
labels_y = cancer.target  


'''  Splitting the features and labels into random train and test subsets '''
features_X_train, features_X_test, labels_y_train, labels_y_test = sklearn.model_selection.train_test_split(features_X,
                                                                                                            labels_y,
                                                                                                            test_size=0.2)
# 0.2 (20%) of the data is being allocated as test data while the other 90% is being treated as training data

**[features_X_train and labels_y_train] will be used to train our model**<br>
(and make the machine learn)

**[features_X_test and labels_y_test] will be used to test the accuracy of our model**<br>
(ratio of number of correct predictions to the total number of input samples)

### |----- _Training and test subsets Short Exemple - just see what is happening_ -----|

In [33]:
import numpy as np

''' Here's values for X and y '''
X , y = np.arange(10).reshape((5, 2)), np.arange(5)
print('X:\n',X)
print('y:\n',y)

X:
 [[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
y:
 [0 1 2 3 4]


In [34]:
''' What we are doing '''

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.1)

print('Random X_train values:\n',X_train)
print('\nRandom y_train values:\n',y_train)
print('\nRandom X_test values:\n',X_test)
print('\nRandom y_test values:\n',y_test)

Random X_train values:
 [[0 1]
 [4 5]
 [6 7]
 [8 9]]

Random y_train values:
 [0 2 3 4]

Random X_test values:
 [[2 3]]

Random y_test values:
 [1]


#### |----- _End of exemple [ \o/ ]_ -----|

In [41]:
# Part of our data
print('features_X_train:\n\n',features_X_train[:5],'\n\nlabels_y_train:\n\n', labels_y_train[:5])

features_X_train:

 [[1.611e+01 1.805e+01 1.051e+02 8.130e+02 9.721e-02 1.137e-01 9.447e-02
  5.943e-02 1.861e-01 6.248e-02 7.049e-01 1.332e+00 4.533e+00 7.408e+01
  6.770e-03 1.938e-02 3.067e-02 1.167e-02 1.875e-02 3.434e-03 1.992e+01
  2.527e+01 1.290e+02 1.233e+03 1.314e-01 2.236e-01 2.802e-01 1.216e-01
  2.792e-01 8.158e-02]
 [8.597e+00 1.860e+01 5.409e+01 2.212e+02 1.074e-01 5.847e-02 0.000e+00
  0.000e+00 2.163e-01 7.359e-02 3.368e-01 2.777e+00 2.222e+00 1.781e+01
  2.075e-02 1.403e-02 0.000e+00 0.000e+00 6.146e-02 6.820e-03 8.952e+00
  2.244e+01 5.665e+01 2.401e+02 1.347e-01 7.767e-02 0.000e+00 0.000e+00
  3.142e-01 8.116e-02]
 [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01 1.974e-01
  1.279e-01 2.069e-01 5.999e-02 7.456e-01 7.869e-01 4.585e+00 9.403e+01
  6.150e-03 4.006e-02 3.832e-02 2.058e-02 2.250e-02 4.571e-03 2.357e+01
  2.553e+01 1.525e+02 1.709e+03 1.444e-01 4.245e-01 4.504e-01 2.430e-01
  3.613e-01 8.758e-02]
 [1.571e+01 1.393e+01 1.020e+02 7.617e+02 9.462

In [42]:
# We will use this later to instead of looking at labels like those above: [0 1 0 1 1]
# We'll look at them like this:
classes = ['malignant', 'benign']

## Implementing a SVM

In [47]:
# Creating a SVC() instance, so we can use its methods to classify
svm_classifier = svm.SVC()

# Training the model
svm_classifier.fit(features_X_train, labels_y_train)


# Comparing the predictions with the actual data to find out the accuracy
labels_y_predictions = svm_classifier.predict(features_X_test)
accuracy = metrics.accuracy_score(labels_y_test, labels_y_predictions)

print(accuracy)

0.9122807017543859


### 3 - Predictions vs Actual Values

In [66]:
# As I said, doing this, We'll get not just the number(0 or 1) but its actual meaning
classes = ['malignant', 'benign']

predicted_values = svm_classifier.predict(features_X_test)

break_point = 0 # I don't want to print the whole dataset
for value in range(len(features_X_test)):
    print('Predicted value: ', predicted_values[value], '-->', classes[predicted_values[value]])
    print('\nInput Data:\n', features_X_test[value])
    print('\nActual value', labels_y_test[value], '  -->   ', classes[labels_y_test[value]])
    print('-'*50,'\n\n')
    if break_point == 10: break
    break_point += 1

Predicted value:  1 --> benign

Input Data:
 [1.106e+01 1.483e+01 7.031e+01 3.782e+02 7.741e-02 4.768e-02 2.712e-02
 7.246e-03 1.535e-01 6.214e-02 1.855e-01 6.881e-01 1.263e+00 1.298e+01
 4.259e-03 1.469e-02 1.940e-02 4.168e-03 1.191e-02 3.537e-03 1.268e+01
 2.035e+01 8.079e+01 4.967e+02 1.120e-01 1.879e-01 2.079e-01 5.556e-02
 2.590e-01 9.158e-02]

Actual value 1   -->    benign
-------------------------------------------------- 


Predicted value:  1 --> benign

Input Data:
 [1.205e+01 2.272e+01 7.875e+01 4.478e+02 6.935e-02 1.073e-01 7.943e-02
 2.978e-02 1.203e-01 6.659e-02 1.194e-01 1.434e+00 1.778e+00 9.549e+00
 5.042e-03 4.560e-02 4.305e-02 1.667e-02 2.470e-02 7.358e-03 1.257e+01
 2.871e+01 8.736e+01 4.884e+02 8.799e-02 3.214e-01 2.912e-01 1.092e-01
 2.191e-01 9.349e-02]

Actual value 1   -->    benign
-------------------------------------------------- 


Predicted value:  1 --> benign

Input Data:
 [1.133e+01 1.416e+01 7.179e+01 3.966e+02 9.379e-02 3.872e-02 1.487e-03
 3.333e-03