In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, cross_validation

import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
%matplotlib inline

In [16]:
'''
X = np.array([[1, 2],[1.5, 1.8],[5, 8],
             [8, 8],[1, 0.6],[9, 11]])

plt.scatter(X[:, 0], X[:, 1], s=150, linewidths=5)
plt.show()

colors = ["g", "r", "c", "b", "k"]
'''

df = pd.read_excel('titanic.xls')

In [17]:
df.drop(['body','name'], 1, inplace=True)
df.convert_objects(convert_numeric=True)
df.fillna(0, inplace=True)

  from ipykernel import kernelapp as app


In [18]:
def handle_non_numerical_data(df):
    columns = df.columns.values
    
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 or df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
                    
            df[column] = list(map(convert_to_int, df[column]))
            
    return df

In [19]:
df = handle_non_numerical_data(df)

df.drop(['ticket','home.dest'], 1, inplace=True)
#df.drop(['boat'], 1, inplace=True)
print(df.head())

X = np.array(df.drop(['survived'],1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

   pclass  survived  sex  age  sibsp  parch  fare  cabin  embarked  boat
0       0         1    0   30      0      0   176     70         3     1
1       0         1    1   92      1      2   133     49         3    22
2       0         0    0    2      1      2   133     49         3     0
3       0         0    1   31      1      2   133     49         3     0
4       0         0    0   25      1      2   133     49         3     0


In [20]:
class K_Means:
    def __init__(self, k=2, tol=0.001, max_iter=300):
        self.k = k
        self. tol = tol
        self. max_iter = max_iter
    
    def fit(self, data):
        self.centroids = {}
        
        for i in range(self.k):
            self.centroids[i] = data[i]
            
        for i in range(self.max_iter):
            self.classifications = {}
            
            for j in range(self.k):
                self.classifications[j] = []
                
            for featureset in data:
                distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)
            
            prev_centroids = dict(self.centroids)
            
            for classification in self.classifications:
                self.centroids[classification] = np.average(self.classifications[classification], axis=0)
            optimized = True
            
            for c in self.centroids:
                original_centroid = prev_centroids[c]
                current_centroid = self.centroids[c]
                if (np.sum((current_centroid - original_centroid) / original_centroid * 100.0)) > self.tol:
                    optimized = False
                    
            if optimized:
                break

    def predict(self, data):
        distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification


In [21]:
clf = K_Means()
clf.fit(X)

correct = 0
for i in range(len(X)):

    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    if prediction == y[i]:
        correct += 1


print(correct/len(X))

'''
for centroid in clf.centroids:
    plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],
               marker='o', color='k', s=150, linewidths=5)
    
for classification in clf.classifications:
    color = colors[classification]
    for featureset in clf.classifications[classification]:
        plt.scatter(featureset[0], featureset[1], marker='x', color=color, s=150, linewidths=5)

unknowns = np.array([[1,3], [8,9], [0,3], [5,4], [6,4]])

for unknown in unknowns:
    classification = clf.predict(unknown)
    #print(classification)
    plt.scatter(unknown[0], unknown[1], marker="*", color=colors[classification], s=150, linewidths=5)
        
plt.show()
'''

0.6371275783040489


'\nfor centroid in clf.centroids:\n    plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],\n               marker=\'o\', color=\'k\', s=150, linewidths=5)\n    \nfor classification in clf.classifications:\n    color = colors[classification]\n    for featureset in clf.classifications[classification]:\n        plt.scatter(featureset[0], featureset[1], marker=\'x\', color=color, s=150, linewidths=5)\n\nunknowns = np.array([[1,3], [8,9], [0,3], [5,4], [6,4]])\n\nfor unknown in unknowns:\n    classification = clf.predict(unknown)\n    #print(classification)\n    plt.scatter(unknown[0], unknown[1], marker="*", color=colors[classification], s=150, linewidths=5)\n        \nplt.show()\n'