In [2]:
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, cross_validation
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
df = pd.read_excel('data/titanic.xls')

original_df = pd.DataFrame.copy(df)
df.drop(['body','name'], 1, inplace=True)
df.fillna(0,inplace=True)

In [5]:
def handle_non_numerical_data(df):
    
    # handling non-numerical data: must convert.
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        #print(column,df[column].dtype)
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            
            column_contents = df[column].values.tolist()
            #finding just the uniques
            unique_elements = set(column_contents)
            # great, found them. 
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new
                    # id per unique string
                    text_digit_vals[unique] = x
                    x+=1
            # now we map the new "id" vlaue
            # to replace the string. 
            df[column] = list(map(convert_to_int,df[column]))

    return df

In [6]:
df = handle_non_numerical_data(df)
df.drop(['ticket','home.dest'], 1, inplace=True)

X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [9]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_
original_df['cluster_group']=np.nan
for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [10]:
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[ (original_df['cluster_group']==float(i)) ]
    #print(temp_df.head())

    survival_cluster = temp_df[  (temp_df['survived'] == 1) ]

    survival_rate = len(survival_cluster) / len(temp_df)
    #print(i,survival_rate)
    survival_rates[i] = survival_rate
    
print(survival_rates)

{0: 0, 1: 0, 2: 1, 3: 0}


In [12]:
print(original_df[ (original_df['cluster_group']==1) ])

     pclass  survived                                               name  \
1         1         1                     Allison, Master. Hudson Trevor   
2         1         0                       Allison, Miss. Helen Loraine   
3         1         0               Allison, Mr. Hudson Joshua Creighton   
4         1         0    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   
10        1         0                             Astor, Col. John Jacob   
11        1         1  Astor, Mrs. John Jacob (Madeleine Talmadge Force)   
16        1         0                           Baxter, Mr. Quigg Edmond   
17        1         1    Baxter, Mrs. James (Helene DeLaudeniere Chaput)   
23        1         1                              Bidois, Miss. Rosalie   
24        1         1                                  Bird, Miss. Ellen   
35        1         1                           Bowen, Miss. Grace Scott   
54        1         1                Carter, Master. William Thornton II   
55        1 

In [11]:
print(original_df[ (original_df['cluster_group']==0) ].describe())
print(original_df[ (original_df['cluster_group']==2) ].describe())

            pclass     survived         age        sibsp        parch  \
count  1248.000000  1248.000000  987.000000  1248.000000  1248.000000   
mean      2.342147     0.371795   29.429838     0.483974     0.302885   
std       0.813333     0.483478   14.201267     1.049565     0.655315   
min       1.000000     0.000000    0.166700     0.000000     0.000000   
25%       2.000000     0.000000         NaN     0.000000     0.000000   
50%       3.000000     0.000000         NaN     0.000000     0.000000   
75%       3.000000     1.000000         NaN     1.000000     0.000000   
max       3.000000     1.000000   80.000000     8.000000     4.000000   

              fare        body  cluster_group  
count  1247.000000  113.000000         1248.0  
mean     25.413876  162.123894            0.0  
std      28.856902   98.916644            0.0  
min       0.000000    1.000000            0.0  
25%            NaN         NaN            0.0  
50%            NaN         NaN            0.0  
75%   

