In [96]:
## Data Preprocessing

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly # make sure you install plotly! you do so by $ pip install plotly on the command line
import plotly.graph_objs as go
import sklearn
from plotly.offline import init_notebook_mode
from sklearn import metrics
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# GridSearchCV is imported from sklearn.model_selection: the legacy
# sklearn.grid_search module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
 
%matplotlib inline

In [97]:

# Load the Breast Cancer Wisconsin dataset that ships with scikit-learn
# instead of reading it from a file.
bunch = load_breast_cancer()

# Wrap the feature matrix in a DataFrame (one named column per feature) so the
# data is easier to work with and to visualize.
dataframe = pd.DataFrame(bunch.data, columns=bunch.feature_names)

# Sanity check: the dataset is expected to hold 569 instances x 30 attributes.
assert dataframe.shape == (569, 30), dataframe.shape

# Preview the first rows only. Asking for all 569 rows bloats the notebook and
# pandas truncates such a large display by default anyway.
dataframe.head(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.990,10.38,122.80,1001.0,0.11840,0.27760,0.300100,0.147100,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.71190,0.26540,0.4601,0.11890
1,20.570,17.77,132.90,1326.0,0.08474,0.07864,0.086900,0.070170,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.24160,0.18600,0.2750,0.08902
2,19.690,21.25,130.00,1203.0,0.10960,0.15990,0.197400,0.127900,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.45040,0.24300,0.3613,0.08758
3,11.420,20.38,77.58,386.1,0.14250,0.28390,0.241400,0.105200,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.68690,0.25750,0.6638,0.17300
4,20.290,14.34,135.10,1297.0,0.10030,0.13280,0.198000,0.104300,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.40000,0.16250,0.2364,0.07678
5,12.450,15.70,82.57,477.1,0.12780,0.17000,0.157800,0.080890,0.2087,0.07613,...,15.470,23.75,103.40,741.6,0.17910,0.52490,0.53550,0.17410,0.3985,0.12440
6,18.250,19.98,119.60,1040.0,0.09463,0.10900,0.112700,0.074000,0.1794,0.05742,...,22.880,27.66,153.20,1606.0,0.14420,0.25760,0.37840,0.19320,0.3063,0.08368
7,13.710,20.83,90.20,577.9,0.11890,0.16450,0.093660,0.059850,0.2196,0.07451,...,17.060,28.14,110.60,897.0,0.16540,0.36820,0.26780,0.15560,0.3196,0.11510
8,13.000,21.82,87.50,519.8,0.12730,0.19320,0.185900,0.093530,0.2350,0.07389,...,15.490,30.73,106.20,739.3,0.17030,0.54010,0.53900,0.20600,0.4378,0.10720
9,12.460,24.04,83.97,475.9,0.11860,0.23960,0.227300,0.085430,0.2030,0.08243,...,15.090,40.68,97.65,711.4,0.18530,1.05800,1.10500,0.22100,0.4366,0.20750


In [98]:
# Preprocess by standardizing every feature column with StandardScaler
# (rather than normalizing): the scaler is fitted on the full frame and the
# same frame is then transformed.
scaler = StandardScaler()
data_standardization = scaler.fit(dataframe).transform(dataframe)

In [99]:
# Feature extraction with PCA: project the standardized data onto its first
# three principal components. Note that each component is a combination of the
# original features (see `pca.components_`), not a selection of three of them,
# and that the class labels are not used in this step.
pca = PCA(n_components=3).fit(data_standardization)
data_decomposed = pca.transform(data_standardization)

# Share of the total variance captured by each component, followed by the
# cumulative share captured by all three together.
print(pca.explained_variance_ratio_, sum(pca.explained_variance_ratio_))



[ 0.44272026  0.18971182  0.09393163] 0.726363709089


In [100]:
# Enable plotly's offline rendering inside the notebook. `init_notebook_mode`
# comes from `plotly.offline` (imported at the top of the notebook); with
# connected=True plotly.js is loaded from a CDN instead of being embedded in
# the notebook file (per the plotly documentation).
init_notebook_mode(connected=True)

# Heat map of the PCA loadings: one row per principal component, one column per
# original feature. Row labels are derived from the fitted model so they stay
# correct if the number of components is changed.
component_labels = ['PC {}'.format(i + 1) for i in range(pca.components_.shape[0])]
heatmap_trace = go.Heatmap(z=pca.components_,
                           x=bunch.feature_names,
                           y=component_labels,
                           colorscale='Viridis')

# Plot heatmap.
plotly.offline.iplot([heatmap_trace], filename='heatmap')

In [101]:
# Put the three PCA coordinates in a frame and attach a `malignant` flag.
# `1 - bunch.target` flips the 0/1 labels so that 1 marks a malignant sample
# (scikit-learn's target uses 0 for malignant; see `bunch.target_names`).
decomposed_df = pd.DataFrame(data_decomposed, columns=['x', 'y', 'z']).assign(
    malignant=1 - bunch.target
)

# Split into one frame per class for plotting.
malignant = decomposed_df[decomposed_df['malignant'] == 1]
benign = decomposed_df[decomposed_df['malignant'] == 0]

In [102]:
# IMPORTANT
# The 3D scatter needs WebGL; check https://get.webgl.org/ if nothing renders.
# The original troubleshooting note suggests unchecking "Use hardware
# acceleration when available" in the browser's advanced/system settings
# (NOTE(review): the exact menu path depends on the browser -- confirm).


def _class_scatter(points, color, name, line_style):
    """Build the 3D marker trace for one class of samples.

    points     -- frame with 'x', 'y' and 'z' columns (the PCA coordinates)
    color      -- marker fill colour as a plotly colour string
    name       -- legend label for the trace
    line_style -- dict describing the marker outline
    """
    return go.Scatter3d(
        x=points['x'],
        y=points['y'],
        z=points['z'],
        mode='markers',
        marker=dict(color=color, size=12, opacity=0.8, line=line_style),
        name=name,
    )


# Thin, nearly transparent outline shared by both marker sets.
line_style = dict(color='rgba(0, 0, 0, 0.14)', width=0.5)

# One trace per class: red for malignant, blue for benign.
malignant_scatter = _class_scatter(malignant, 'rgb(181, 20, 37)', 'Malignant', line_style)
benign_scatter = _class_scatter(benign, 'rgb(5, 99, 226)', 'Benign', line_style)

# Malignant is listed last; the original note's intent is that it is rendered
# above the benign points. TODO: can the two layers be merged somehow?
data = [benign_scatter, malignant_scatter]

# Legend on, no margins around the plot area.
layout = go.Layout(showlegend=True, margin=dict(l=0, r=0, b=0, t=0))

# Render (offline).
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='3d-scatter')