In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import KFold
from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans
from matplotlib import colors as mcolors
from sklearn.model_selection import train_test_split as tts



from surprise import SVD
from lightfm import LightFM
from surprise import evaluate, print_perf


from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
import plotly.tools as tls
%matplotlib inline

In [2]:
# Register Plotly credentials for the upload at the end of the notebook.
# Values are read from the environment so real keys never get committed with
# the notebook; the '-' placeholders remain the fallback when the variables
# are not set.
import os

tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', '-'),
    api_key=os.environ.get('PLOTLY_API_KEY', '-'),
)

# DATA IMPORT AND CLEAN

In [3]:
# Load the patient report export. Two frames are derived from it:
#   df2 - keeps 'patientid' (used later to attach cluster labels to patients)
#   df  - same data without 'patientid' (used as clustering features)
df2 = pd.read_csv('printcsvreports .csv')

# Rescale the age column by 1/12.
# NOTE(review): presumably months -> years; under Python 2 an integer column
# is floor-divided here - confirm that is intended.
df2['patient age'] = df2['patient age'].values/ 12

# Columns that are not used by the analysis.
unused_columns = ['patient occupation', 'agriworker', 'homelessstatus',
                  'pblchouspat', 'vetstatus']
df2 = df2.drop(unused_columns, axis=1)
df = df2.drop(['patientid'], axis=1)

In [6]:
# Replace every missing value with 0 (this also puts the integer 0 into
# text columns, where it becomes one more category).
df = df.fillna(0)

# Every column that is not int64/float64 is treated as categorical.
# Built in one expression so `non_vals` is always defined, even for a frame
# with no columns (the original initialised it inside the fill loop).
non_vals = [column for column in df.columns
            if df[column].dtype not in ['int64', 'float64']]

# One-hot encode the categorical columns. `columns=` must be named: the
# second positional argument of get_dummies is `prefix`, so passing the list
# positionally only worked while it happened to match the object columns.
df_cleaned = pd.get_dummies(df, columns=non_vals, drop_first=True)

# Feature matrix handed to the clustering functions.
X = df_cleaned.values

In [7]:
# Number of distinct values in 'race' (missing values, if any, count as one).
df['race'].nunique(dropna=False)

27

In [8]:
# Number of distinct values in 'ethnicity' (missing values, if any, count as one).
df['ethnicity'].nunique(dropna=False)

19

In [9]:
# Preview the first five rows of the one-hot encoded frame.
df_cleaned.head(5)

Unnamed: 0,patient age,patientsex_M,ethnicity_Andalusian,ethnicity_Asturian,ethnicity_Central American,ethnicity_Cuban,ethnicity_Dominican,ethnicity_Ecuadorian,ethnicity_Gallego,ethnicity_Hispanic or Latino/Spanish,...,patient city_WINLOCK,patient city_WINTER HARBOR,patient city_WOBURN,patient city_WOODSTOCK,patient city_WORCESTER,patient city_WORTHINGTON,patient city_WORTHINGTON SPRINGS,patient city_WRAY,patient city_YAKIMA,patient city_YORK BEACH
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def make_Kmeans(arr, n, random_state=None):
    """Cluster the rows of ``arr`` into ``n`` groups with k-means.

    Parameters
    ----------
    arr : array-like
        Feature matrix, passed unchanged to ``KMeans.fit``.
    n : int
        Number of clusters to fit.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the original
        unseeded behaviour; pass an integer for reproducible clusters.

    Returns
    -------
    labels : ndarray
        The fitted model's ``labels_`` (one cluster id per row of ``arr``).
    cluster_centers : ndarray
        The fitted model's ``cluster_centers_``.
    """
    # `n_jobs` is deliberately not passed: scikit-learn deprecated it for
    # KMeans in 0.23 and removed it in 1.0, where it raises a TypeError.
    km = KMeans(n_clusters=n, random_state=random_state, verbose=0)
    km.fit(arr)
    return km.labels_, km.cluster_centers_

def plot_clusters(arr, labels, cluster_centers):
    """Scatter the first two columns of ``arr`` coloured by cluster.

    Parameters
    ----------
    arr : ndarray
        Data points; only columns 0 and 1 are drawn.
    labels : array-like
        Cluster id for each row of ``arr``.
    cluster_centers : ndarray
        Cluster centres; columns 0 and 1 are drawn as large red dots.

    Each cluster is drawn once, in a randomly chosen named colour, so two
    clusters can occasionally receive the same colour.
    """
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)

    # Pool of named matplotlib colours. list(...) keeps this working on
    # Python 3, where dict.keys() cannot be indexed.
    colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
    color_names = list(colors)
    # The lower bound of 40 is carried over unchanged from the original code.
    picks = np.random.randint(40, len(color_names), size=len(unique_labels))
    # Key by the actual label values rather than assuming they are 0..k-1.
    label_colors = dict(zip(unique_labels, (color_names[p] for p in picks)))

    plt.figure(figsize=(10, 10))
    # Draw each cluster's own points once. The original looped over every
    # sample and re-drew the whole array each time in a single colour.
    for label in unique_labels:
        members = arr[labels == label]
        plt.scatter(members[:, 0], members[:, 1],
                    c=label_colors[label], alpha=0.8)
    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
                s=100, c='r', marker='o')
    plt.show()

In [11]:
def make_MeanShift(arr, quantile=0.4):
    """Cluster the rows of ``arr`` with MeanShift.

    Parameters
    ----------
    arr : array-like
        Feature matrix, passed to ``estimate_bandwidth`` and ``MeanShift.fit``.
    quantile : float, optional
        Quantile forwarded to ``estimate_bandwidth``. Defaults to 0.4, the
        value that was previously hard-coded.

    Returns
    -------
    labels : ndarray
        The fitted model's ``labels_`` (one cluster id per row of ``arr``).
    cluster_centers : ndarray
        The fitted model's ``cluster_centers_``.
    """
    bandwidth = estimate_bandwidth(arr, quantile=quantile)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(arr)
    return ms.labels_, ms.cluster_centers_

In [12]:
def add_labels(df, labels):
    """Store ``labels`` in the 'labels' column of ``df``.

    The frame is modified in place (an existing 'labels' column is
    overwritten) and nothing is returned.
    """
    label_column = 'labels'
    df[label_column] = labels

In [13]:
def make_dflist(df, labels):
    """Split ``df`` into one sub-frame per distinct value in ``labels``.

    ``df`` must already contain a 'labels' column; rows are selected by
    comparing that column with each distinct label. The sub-frames are
    returned in the iteration order of ``set(labels)``.
    """
    return [df[df['labels'] == label] for label in set(labels)]

In [14]:
# Cluster the feature matrix with MeanShift, attach the cluster ids to the
# frame that still carries 'patientid', and report the size of each cluster.
labels, clusters = make_MeanShift(X)
add_labels(df2, labels)
df_list = make_dflist(df2, labels)
for data_frame in df_list:
    # Call form prints the same text on Python 2 and Python 3.
    print(data_frame.shape)

(1220, 7)
(26, 7)
(8, 7)
(7, 7)
(1, 7)


In [15]:
# Attach the current cluster ids to the encoded frame as a 'labels' column.
# From here on `df_cleaned.values` contains that column as well.
add_labels(df_cleaned, labels)

In [16]:
# Re-cluster with k-means (4 clusters). This overwrites `labels`, `clusters`,
# `df_list` and the 'labels' column of df2 produced by the MeanShift run.
labels, clusters = make_Kmeans(X, 4)
add_labels(df2, labels)
df_list = make_dflist(df2, labels)
for data_frame in df_list:
    # Call form prints the same text on Python 2 and Python 3.
    print(data_frame.shape)

(397, 7)
(20, 7)
(165, 7)
(680, 7)


In [17]:
# One array of patient ids per cluster, in the same order as df_list.
new_df_list = [dfs['patientid'].values for dfs in df_list]

In [18]:
# Column names for the recommender frames: the id column followed by
# twenty item columns 'd1' .. 'd20'.
index_list = ['patientid'] + ['d%s' % i for i in range(1, 21)]

In [19]:
# Synthetic 0/1 matrix per cluster: one row per patient in the cluster, one
# column per item column in index_list, with 1 drawn at probability 0.10.
# A dedicated seeded generator makes the matrices identical on every run
# without touching NumPy's global random state.
rng = np.random.RandomState(42)
data_list = [
    rng.choice([0, 1], size=(dfs.shape[0], len(index_list) - 1), p=[0.90, 0.10])
    for dfs in df_list
]

In [20]:
# Sanity check: each id array must have as many entries as its data matrix
# has rows. Formatting into one string prints the same on Python 2 and 3.
for patient_ids, item_matrix in zip(new_df_list, data_list):
    print('%s %s' % (patient_ids.shape, item_matrix.shape))

(397,) (397, 20)
(20,) (20, 20)
(165,) (165, 20)
(680,) (680, 20)


In [21]:
# Build one frame per cluster: the patient id column followed by that
# cluster's item columns.
rec_dfs = []
for patient_ids, item_matrix in zip(new_df_list, data_list):
    id_frame = pd.DataFrame(patient_ids, columns=index_list[:1])
    item_frame = pd.DataFrame(item_matrix, columns=index_list[1:])
    rec_dfs.append(pd.concat([id_frame, item_frame], axis=1))
    # TODO: Change NaN values

In [22]:
def get_rec(df_):
    """Evaluate an SVD recommender on the rows of ``df_`` with 2-fold splits.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame whose rows are split into folds (for example one of the
        per-cluster frames in ``rec_dfs``).

    Returns
    -------
    None
        The RMSE/MAE results of every fold are printed via ``print_perf``.
    """
    # `.values` replaces the deprecated DataFrame.as_matrix().
    rows = df_.values
    kf = KFold(n_splits=2, random_state=42, shuffle=True)
    # We'll use the famous SVD algorithm.
    algo = SVD()

    # Evaluate the algorithm on every fold. The original returned from inside
    # the loop, so only the first fold was ever evaluated, and it also made a
    # kf.split(...) call whose result was thrown away.
    # NOTE(review): surprise's evaluate() appears to expect a surprise
    # Dataset rather than raw ndarray rows - confirm before relying on this.
    for train_index, _ in kf.split(rows):
        perf = evaluate(algo, rows[train_index], measures=['RMSE', 'MAE'])
        print_perf(perf)

# TODO: bring up grouping of race/ethnicity categories (27 and 19 distinct values respectively)

In [38]:
# Project the encoded frame to three dimensions with t-SNE (fixed seed).
# NOTE(review): df_cleaned already has the 'labels' column at this point, so
# the cluster ids are part of the embedded features - confirm this is wanted.
tsne = TSNE(random_state=42, n_components=3)
embedding_input = df_cleaned.values
tsne_vectors = tsne.fit_transform(embedding_input)

In [39]:
# Append the cluster ids as a fourth column next to the three t-SNE axes.
# NOTE(review): in notebook order `labels` now holds the 4-cluster KMeans ids,
# while five groups are plotted further down - confirm which labels are meant.
tsne_total = np.hstack((tsne_vectors, np.reshape(labels, (-1, 1))))

In [40]:
# Expect (number of patients, 4): three t-SNE axes plus the cluster id.
np.shape(tsne_total)

(1262, 4)

In [41]:
# Name the embedding axes and the cluster id column for plotting.
tsne_columns = ['D1', 'D2', 'D3', 'Group']
tsne_df = pd.DataFrame(tsne_total, columns=tsne_columns)

In [42]:
def _make_group_trace(group):
    """Build the 3-D scatter trace for the rows of ``tsne_df`` in ``group``.

    ``group`` is the integer cluster id compared against the 'Group' column;
    the trace is named 'Group <id>' and uses the shared marker styling.
    """
    members = tsne_df[tsne_df['Group'] == group]
    return go.Scatter3d(
        x=members['D1'],
        y=members['D2'],
        z=members['D3'],
        name='Group %d' % group,
        mode='markers',
        marker=dict(
            size=12,
            line=dict(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5
            ),
            opacity=0.8
        )
    )


# One trace per cluster id 0..4. Built from a single helper so the five
# traces cannot drift apart; the fifth trace used to be mislabelled 'Group 3'.
trace0, trace1, trace2, trace3, trace4 = [
    _make_group_trace(group) for group in range(5)
]

In [43]:
# Traces in legend order.
data = [trace0, trace1, trace2, trace3, trace4]

# Transparent legend box placed at the top-left of the plot area.
legend_style = dict(
    x=0,
    y=0.90,
    bgcolor='rgba(255, 255, 255, 0)',
    bordercolor='rgba(255, 255, 255, 0)',
    font=dict(size=14, color='#000'),
)

layout = go.Layout(
    title='Cluster Analysis of Patients by Demographics and Injury',
    # The hex colour previously carried a stray trailing space ('#7f7f7f ').
    font=dict(size=16, color='#7f7f7f'),
    height=800,
    width=900,
    legend=legend_style,
)

In [44]:
# Assemble the figure and send it to the Plotly account configured earlier;
# the plot is stored there under the given filename and shown inline.
fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='Hackathon-ClusterAnalysis')

High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~erdos2n/0 or inside your plot.ly account where it is named 'Hackathon-ClusterAnalysis'


In [None]:
# NOTE(review): leftover IPython help lookup (`?` is IPython-only syntax, not
# plain Python); this scratch cell can be deleted before sharing the notebook.
evaluate?