# Objective

Predict the survival of Titanic passengers using a K-Means algorithm.

## Data Analysis

### Data Import

In [1]:
import pandas
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans
from pprint import pprint

TITANIC_TRAIN = 'train.csv'
TITANIC_TEST = 'test.csv'
# t_df refers to titanic_dataframe
t_df = pandas.read_csv(TITANIC_TRAIN, header=0)

## Selection of Features

In [2]:
t_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked', 'Sex'], axis=1, inplace=True)
t_df.info()
t_df.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.25


## Cleaning Data

In [3]:
t_df.Age.fillna(np.mean(t_df.Age), inplace=True)
t_df.info()
t_df.head(5)
t_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [25]:
test_df = pandas.read_csv(TITANIC_TEST, header=0)
test_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked', 'Sex'], axis=1, inplace=True)
test_df.info()
test_df.head(1)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
dtypes: float64(2), int64(4)
memory usage: 19.7 KB


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,892,3,34.5,0,0,7.8292


In [26]:
test_df.Age.fillna(np.mean(test_df.Age), inplace=True)
test_df.Fare.fillna(np.mean(test_df.Fare), inplace=True)
test_df.info()
test_df.head(5)
test_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
dtypes: float64(2), int64(4)
memory usage: 19.7 KB


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,12.634534,0.89676,0.981429,55.8405
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,23.0,0.0,0.0,7.8958
50%,1100.5,3.0,30.27259,0.0,0.0,14.4542
75%,1204.75,3.0,35.75,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


## Experiment Heueristics (Design)

### Evaluation Function Declarations

F1 score to be used to evaluate algoritm results.

In [4]:
def precision(tp, fp):
    return tp / (tp + fp)
    
def recall(tp, fn):
    return tp / (tp + fn)
    
def f1_score(tp, fn, fp):
    pre = precision(tp, fp)
    rec = recall(tp, fn)
    return (2 * ((pre * rec) / (pre + rec)))

### Representation

In [5]:
train, test = train_test_split(t_df, test_size = 0.2)
y = train['Survived']
# y = test
# x = train['Age'].values[:,np.newaxis]
x = train
print(x)
# print(len(train), len(train['Survived']), len(x))

     PassengerId  Survived  Pclass        Age  SibSp  Parch      Fare
419          420         0       3  10.000000      0      2   24.1500
367          368         1       3  29.699118      0      0    7.2292
111          112         0       3  14.500000      1      0   14.4542
798          799         0       3  30.000000      0      0    7.2292
761          762         0       3  41.000000      0      0    7.1250
562          563         0       2  28.000000      0      0   13.5000
829          830         1       1  62.000000      0      0   80.0000
133          134         1       2  29.000000      1      0   26.0000
847          848         0       3  35.000000      0      0    7.8958
722          723         0       2  34.000000      0      0   13.0000
806          807         0       1  39.000000      0      0    0.0000
117          118         0       2  29.000000      1      0   21.0000
803          804         1       3   0.420000      0      1    8.5167
862          863    

## Experiment

In [23]:
k = 2
# x.values.reshape(-1, 1)
kmeans = KMeans(n_clusters=k)
results = kmeans.fit_predict(x.values, y.values)
print(results) 
train['Survived']

[1 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 1 1 1 0 1 1 0 1 0 1 0 1 0 1 1 0 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 1
 1 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0
 1 1 1 0 1 1 1 0 1 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 1 0 1
 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1
 0 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 1 1
 0 0 1 1 1 0 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 0
 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 0 1 0 1 0 1 1 0 0 1 1 1 0 1 0 1
 1 1 1 0 0 1 1 1 0 1 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 0 0 1 1 1 1 1 1 1 0
 1 0 0 0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 1 0 1 0 1 1 0
 0 0 1 1 0 0 0 1 1 0 0 1 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0
 1 0 0 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1
 0 0 1 1 1 1 1 0 0 1 0 0 

419    0
367    1
111    0
798    0
761    0
562    0
829    1
133    1
847    0
722    0
806    0
117    0
803    1
862    1
786    1
268    1
175    0
35     0
304    0
598    0
686    0
587    1
350    0
91     0
182    0
146    1
679    1
849    1
461    0
740    1
      ..
629    0
312    0
486    1
561    0
116    0
105    0
331    0
1      1
434    0
141    1
45     0
127    1
213    0
273    0
237    1
443    1
400    1
559    1
407    1
265    0
290    1
802    1
362    0
276    0
147    0
834    0
718    0
404    0
610    0
196    0
Name: Survived, dtype: int64

In [12]:
# loop over each row
print(x['Survived'].values)
print(x['PassengerId'].values)

[0 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 1 1 0 0
 0 1 1 0 1 0 0 1 1 0 1 0 1 0 1 0 1 1 0 0 1 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1
 0 0 1 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 1 1 1 0
 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 0 1 0
 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0
 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0
 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 0 1 1 0 0 0 0
 1 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1
 0 1 1 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0
 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 1 0 1 1 0 1 0 0 0 1 0 0 1 1 0 0 1
 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1
 1 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 0 0 0 1 1 1
 0 1 1 0 1 1 1 1 0 1 1 0 

In [8]:
tp = 0
fp = 0
fn = 0
for i in range(len(results)):
    diff = x['Survived'].values[i] - results[i]
    
    if diff == 1:
        fp += 1
    elif diff == 0:
        tp += 1
    else:
        fn += 1
        
print(tp, fp, fn)

342 148 222


In [9]:
# print(tp, fp, fn)
# print(type(tp))
f1 = f1_score(tp, fn, fp)
print(f1)

0.6489563567362429


## Conclusions

K-Means algoritm predicts 68.5% correct results.

In [15]:
df_result = pandas.DataFrame(results, columns=['Survived'])
df_result.to_csv('titanic_day2.csv', index=False)