In [1]:
import numpy as np
import pandas as pd

from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, cross_validation

import matplotlib.pyplot as plt

In [2]:
df = pd.read_excel('titanic.xls')
original_df = pd.DataFrame.copy(df)

df.drop(['body','name'], 1, inplace=True)
df.fillna(0, inplace=True)

In [3]:
def handle_non_numerical_data(df):
    
    # handling non-numerical data: must convert.
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        #print(column,df[column].dtype)
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            
            column_contents = df[column].values.tolist()
            #finding just the uniques
            unique_elements = set(column_contents)
            # great, found them. 
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new
                    # id per unique string
                    text_digit_vals[unique] = x
                    x+=1
            # now we map the new "id" vlaue
            # to replace the string. 
            df[column] = list(map(convert_to_int,df[column]))

    return df

In [4]:
df = handle_non_numerical_data(df)
df.drop(['ticket','home.dest'], 1, inplace=True)

X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

In [5]:
clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [6]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_

original_df['cluster_group'] = np.nan

for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
n_clusters_ = len(np.unique(labels))

survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[(original_df['cluster_group'] == float(i))]
    survival_cluster = temp_df[(temp_df['survived'] == 1)]
    survival_rate = len(survival_cluster) / len(temp_df)
    survival_rates[i] = survival_rate
    
print(survival_rates)

{0: 0.37672904800650936, 1: 0.2, 2: 1.0, 3: 0.0, 4: 0.5, 5: 0.6071428571428571}


In [8]:
cluster_0 = original_df[ (original_df['cluster_group']==0) ]
cluster_0_fc = cluster_0[ (cluster_0['pclass']==1) ]
cluster_0_fc.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,275,275.0,237.0,275.0,275.0,275.0,29.0,275
mean,1,0.610909,39.603024,0.370909,0.218182,63.060257,176.37931,0
std,0,0.488433,14.123195,0.520273,0.49416,41.5524,83.340272,0
min,1,0.0,0.9167,0.0,0.0,0.0,16.0,0
25%,1,0.0,29.0,0.0,0.0,29.85,126.0,0
50%,1,1.0,39.0,0.0,0.0,53.1,175.0,0
75%,1,1.0,49.0,1.0,0.0,79.825,245.0,0
max,1,1.0,80.0,2.0,2.0,227.525,307.0,0
