# Objective

Predict the survival of Titanic passengers using a K-Means algorithm.

## Data Analysis

### Data Import

In [1]:
import pandas
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans
from pprint import pprint

TITANIC_TRAIN = 'train.csv'
TITANIC_TEST = 'test.csv'
# t_df refers to titanic_dataframe
t_df = pandas.read_csv(TITANIC_TRAIN, header=0)

## Selection of Features

In [2]:
t_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked', 'Sex', 'Pclass'], axis=1, inplace=True)
t_df.info()
t_df.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
dtypes: float64(2), int64(4)
memory usage: 41.8 KB


Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare
0,1,0,22.0,1,0,7.25


## Cleaning Data

In [3]:
t_df.Age.fillna(np.mean(t_df.Age), inplace=True)
t_df.info()
t_df.head(5)
t_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
dtypes: float64(2), int64(4)
memory usage: 41.8 KB


Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,35.0,1.0,0.0,31.0
max,891.0,1.0,80.0,8.0,6.0,512.3292


## Experiment Heueristics (Design)

### Evaluation Function Declarations

F1 score to be used to evaluate algoritm results.

In [4]:
def precision(tp, fp):
    return tp / (tp + fp)
    
def recall(tp, fn):
    return tp / (tp + fn)
    
def f1_score(tp, fn, fp):
    pre = precision(tp, fp)
    rec = recall(tp, fn)
    return (2 * ((pre * rec) / (pre + rec)))

### Representation

In [10]:
train, test = train_test_split(t_df, test_size = 0.2)
y = train['Survived']
# y = test
# x = train['Age'].values[:,np.newaxis]
x = train
print(x)
# print(len(train), len(train['Survived']), len(x))

     PassengerId  Survived        Age  SibSp  Parch      Fare
723          724         0  50.000000      0      0   13.0000
349          350         0  42.000000      0      0    8.6625
297          298         0   2.000000      1      2  151.5500
199          200         0  24.000000      0      0   13.0000
238          239         0  19.000000      0      0   10.5000
713          714         0  29.000000      0      0    9.4833
182          183         0   9.000000      4      2   31.3875
368          369         1  29.699118      0      0    7.7500
643          644         1  29.699118      0      0   56.4958
620          621         0  27.000000      1      0   14.4542
165          166         1   9.000000      0      2   20.5250
300          301         1  29.699118      0      0    7.7500
22            23         1  15.000000      0      0    8.0292
703          704         0  25.000000      0      0    7.7417
172          173         1   1.000000      1      1   11.1333
107     

## Experiment

In [11]:
k = 2
# x.values.reshape(-1, 1)
kmeans = KMeans(n_clusters=k)
results = kmeans.fit_predict(x.values, y.values)
print(results) 
train['Survived']

[0 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1
 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 0 1 0 1 0 1 1 1 0 0 0 0 0
 0 1 1 1 1 1 0 1 1 0 1 1 1 1 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
 0 0 1 0 1 1 1 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 0 0 0 1 0 1 1 0 1 1 1 0 0 1 0
 0 0 0 1 1 1 0 0 1 1 1 0 1 0 0 0 1 0 1 1 1 0 0 0 1 1 1 1 1 0 1 0 0 1 0 1 0
 0 1 0 0 0 0 1 0 1 1 0 1 1 1 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 0 0 1
 0 1 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 0 0 1
 1 0 0 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 1 1 1 0 0 0 0
 1 1 0 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 0 1 1 0 0 0 1 1 0 1 1 0 1 0 0 0 0 1 0
 1 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 0 1 0 0 1 0 1 1 0 0 0 1 1 1 0 1
 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 0 0 1 1 1
 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0
 1 0 1 0 1 0 1 1 1 0 0 0 

723    0
349    0
297    0
199    0
238    0
713    0
182    0
368    1
643    1
620    0
165    1
300    1
22     1
703    0
172    1
107    1
176    0
646    0
852    0
877    0
266    0
179    0
130    0
549    1
222    0
584    0
609    1
854    0
596    1
796    1
      ..
227    0
292    0
10     1
462    0
77     0
866    1
207    1
688    0
229    0
678    0
546    1
56     1
600    1
837    0
520    1
478    0
695    0
379    0
785    0
152    0
347    1
313    0
669    1
758    0
140    0
57     0
348    1
381    1
592    0
555    0
Name: Survived, dtype: int64

In [12]:
# loop over each row
print(x['Survived'].values)

[0 0 0 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 0 1 0
 0 0 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 0 0 0 1 1 1 0 1 0 1 0 1 1 0 1 0 0
 0 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1
 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 0 0 1 1 0
 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 1 0 1 0 0
 1 1 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0
 1 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0 1 1 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 1 1 0
 1 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 1 1 1 0 0
 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 1 0 1 0 0 0 0
 1 0 0 0 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 1 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0
 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 1 0 0 0 1 1 1 1
 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 0 0 1 0 1 1 0 0 0
 1 0 0 0 0 0 0 1 0 0 0 1 

In [13]:
tp = 0
fp = 0
fn = 0
for i in range(len(results)):
    diff = x['Survived'].values[i] - results[i]
    
    if diff == 1:
        fp += 1
    elif diff == 0:
        tp += 1
    else:
        fn += 1
        
print(tp, fp, fn)

363 136 213


In [14]:
# print(tp, fp, fn)
# print(type(tp))
f1 = f1_score(tp, fn, fp)
print(f1)

0.6753488372093024


## Conclusions

K-Means algoritm predicts 65.5% correct results.

In [16]:
df_result = pandas.DataFrame(results, columns=['Survived'])
df_result.to_csv('titanic_day2.csv', index=False)