In [2]:
#Basic imports
import numpy as np
import pandas as pd

#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot





In [None]:
def get_breast_cancer_data(path="C:/Users/MMach/Desktop/datascience/breastcancer/data.csv",
                           n_clusters=3, random_state=None):
    """Load the breast-cancer CSV, standardise its features and attach K-Means labels.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the path that used to be
        hard-coded in this function, so existing calls keep working.
    n_clusters : int, optional
        Number of K-Means clusters to fit (default 3).
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. ``None`` (the default) keeps the
        previous unseeded behaviour; pass an int for reproducible labels.

    Returns
    -------
    pandas.DataFrame
        The ``diagnosis`` column, followed by the standardised feature
        columns (labelled with integers, since scaling returns a bare
        array) and a ``cluster`` column holding the K-Means label of each row.
    """
    df = pd.read_csv(path)
    # These two columns are not features; errors="ignore" lets a CSV that
    # already lacks them load as well.
    df = df.drop(columns=["Unnamed: 32", "id"], errors="ignore")

    diagnosis = df["diagnosis"]
    # Drop the label *column*. A bare df.drop(["diagnosis"]) targets row
    # labels and raises KeyError.
    features = df.drop(columns=["diagnosis"])

    scaler = StandardScaler()
    # Keep the original row index so the inner join below lines rows up.
    data_wo_diagnosis = pd.DataFrame(scaler.fit_transform(features), index=features.index)

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(data_wo_diagnosis)

    # Find which cluster each data-point belongs to (computed before the
    # "cluster" column is added, so the labels are not fed back as a feature).
    clusters = kmeans.predict(data_wo_diagnosis)
    data_wo_diagnosis["cluster"] = clusters

    return pd.concat([diagnosis, data_wo_diagnosis], axis=1, join='inner')

In [37]:
def pca3(data):
    """Append the first three principal components of the feature columns to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``cluster`` column and, optionally, a ``diagnosis``
        column; every other column is treated as a numeric feature.

    Returns
    -------
    pandas.DataFrame
        ``data`` with three extra columns: ``PC1_3d``, ``PC2_3d``, ``PC3_3d``.

    Raises
    ------
    KeyError
        If ``data`` has no ``cluster`` column.
    """
    # PCA needs purely numeric input: leave out the K-Means label and, when
    # present, the string-valued diagnosis column ("B"/"M").
    features = data.drop(columns=["cluster"]).drop(columns=["diagnosis"], errors="ignore")

    pca_3d = PCA(n_components=3)
    # Index the components like `data` so the inner join pairs each row with
    # its own projection.
    PCs_3d = pd.DataFrame(
        pca_3d.fit_transform(features),
        columns=["PC1_3d", "PC2_3d", "PC3_3d"],
        index=data.index,
    )

    return pd.concat([data, PCs_3d], axis=1, join='inner')


In [38]:

def split_data(data):
    """Partition ``data`` into one sub-frame per (cluster, diagnosis) pair.

    Rows are selected on the ``cluster`` column (values 0, 1, 2) and the
    ``diagnosis`` column (values "B", "M").

    Returns a dict keyed ``"cluster<k>_<diagnosis>"``, ordered with all "B"
    groups first (clusters 0..2) and then all "M" groups (clusters 0..2).
    A pair with no matching rows maps to an empty frame.
    """
    cluster_col = data["cluster"]
    diagnosis_col = data["diagnosis"]

    return {
        "cluster{}_{}".format(cluster_id, label): data[
            (cluster_col == cluster_id) & (diagnosis_col == label)
        ]
        for label in ("B", "M")
        for cluster_id in (0, 1, 2)
    }

def build_traces_3D(data_dict, columns_to_plot):
    """Build one 3-D marker trace per entry of ``data_dict``.

    ``data_dict`` maps a trace name to a frame; ``columns_to_plot`` must
    contain exactly three column names, used as the x, y and z coordinates
    (anything else fails on unpacking). Returns a list of ``go.Scatter3d``
    traces in the dict's iteration order.
    """
    x_col, y_col, z_col = columns_to_plot

    # No explicit marker styling is passed, so Plotly picks the colours.
    return [
        go.Scatter3d(
            x=frame[x_col],
            y=frame[y_col],
            z=frame[z_col],
            mode="markers",
            name=trace_name,
            text=None,
        )
        for trace_name, frame in data_dict.items()
    ]
        


In [39]:
init_notebook_mode(connected=True)

# Build the 3-D plot: load and cluster the data, project it onto three
# principal components, then create one trace per (cluster, diagnosis) group.
pca_columns = ("PC1_3d", "PC2_3d", "PC3_3d")
plot_data = build_traces_3D(split_data(pca3(get_breast_cancer_data())), pca_columns)

# Axes of a 3-D plot are configured under layout.scene; the top-level
# xaxis/yaxis keys only apply to 2-D cartesian plots.
layout = dict(title = "Visualizing Clusters in Three Dimensions Using PCA",
              scene = dict(
                  xaxis = dict(title='PC1', ticklen=5, zeroline=False),
                  yaxis = dict(title='PC2', ticklen=5, zeroline=False),
                  zaxis = dict(title='PC3', ticklen=5, zeroline=False)
              )
             )

fig = dict(data=plot_data, layout=layout)
iplot(fig)


### t-SNE is kind of magic, so do not misinterpret it
https://distill.pub/2016/misread-tsne/ <br>
t-SNE's compelling low-dimensional maps are easy to over-interpret, so take caution. Above all, t-SNE tries to be a "continuous" map: points that were close to each other in the source space should stay close to each other in the target space, which is enforced by its cost function. It cares much less about keeping points that were far apart in the source space far apart in the target space (i.e. being "open"). Some outliers in the t-SNE plots below show exactly this.

The perplexity parameter is a guess (or prior information, if you will) about the number of close neighbours each point has. It controls how sharp the Gaussian function is, and therefore what the algorithm sees as local structure versus global structure. Since t-SNE cares mainly about local structure, this is really important for the result: if the perplexity is very low, ultra-local features dominate, blowing the data up into bits and pieces with little regard for more macroscopic structure such as clusters. Conversely, if the perplexity is high, it tends to mix all the data together into one chaotic blob.

Also see the scikit-learn example that shows the effect of different perplexity values:
https://scikit-learn.org/stable/auto_examples/manifold/plot_t_sne_perplexity.html#sphx-glr-auto-examples-manifold-plot-t-sne-perplexity-py

In [46]:
def tsne_for_given_perplexity(perp, random_state=None):
    """Embed the breast-cancer features in 3-D with t-SNE and show the plot.

    Parameters
    ----------
    perp : int or float
        Perplexity passed to ``TSNE``; also shown in the plot title.
    random_state : int or None, optional
        Seed forwarded to ``TSNE`` only. ``None`` (the default) keeps the
        previous unseeded behaviour; pass an int for a repeatable embedding.

    Returns
    -------
    None
        The figure is displayed inline via ``iplot``.
    """
    tsne_columns = ["TC1_3d","TC2_3d","TC3_3d"]

    data = get_breast_cancer_data()
    # Embed the features only: neither the K-Means label nor the diagnosis
    # should influence the layout.
    features = data.drop(["cluster", "diagnosis"], axis=1)

    tsne_3d = TSNE(n_components=3, perplexity=perp, random_state=random_state)
    # Index the embedding like `data` so the inner join pairs each row with
    # its own coordinates.
    TCs_3d = pd.DataFrame(tsne_3d.fit_transform(features),
                          columns=tsne_columns, index=data.index)

    data = pd.concat([data, TCs_3d], axis=1, join='inner')
    plot_data = build_traces_3D(split_data(data), tsne_columns)

    # Axes of a 3-D plot are configured under layout.scene, and these are
    # t-SNE components rather than principal components.
    layout = dict(title = "Breast Cancer with tSNE: {}".format(perp),
                  scene = dict(
                      xaxis = dict(title='TC1', ticklen=5, zeroline=False),
                      yaxis = dict(title='TC2', ticklen=5, zeroline=False),
                      zaxis = dict(title='TC3', ticklen=5, zeroline=False)
                  )
                 )

    fig = dict(data = plot_data, layout = layout)
    iplot(fig)
    
# Draw one 3-D t-SNE plot per perplexity value so the embeddings can be compared.
perplexities = (1, 5, 10, 50)
for perplexity in perplexities:
    tsne_for_given_perplexity(perplexity)