# Echo Pod KNN


In [1]:
import pandas
import numpy as np
import scipy.stats as sp
from sklearn import neighbors
from sklearn.neighbors import DistanceMetric
from pprint import pprint

titanic_data = pandas.read_csv("train.csv", header=0)
titanic_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Removing Excess Columns

In [2]:
titanic_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
titanic_data.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S


In [3]:
titanic_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB


## Pre-Processing

Taking discrete values and making them integers.

In [5]:
titanic_data['Embarked'].unique()
titanic_data['Port'] = titanic_data['Embarked'].map({'C':1, 'S':2, 'Q':3}).astype(float)
titanic_data['Sex'].unique()
titanic_data['Gender'] = titanic_data['Sex'].map({'female': 0, 'male': 1}).astype(int)
titanic_data = titanic_data.drop(['Sex', 'Embarked'], axis=1)
titanic_data.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Port,Gender
0,1,0,3,22.0,1,0,7.25,2.0,1


In [6]:
cols = titanic_data.columns.tolist()
print(cols)
cols = [cols[1]] + cols[0:1] + cols[2:]
print(cols)
titanic_data = titanic_data[cols]
print(titanic_data.head(2))
train_data = titanic_data[cols[2: ]]
train_target = titanic_data[cols[1]]
print(train_target.head(2))

pprint('column_list: {0}'.format(cols))


['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Port', 'Gender']
['Survived', 'PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Port', 'Gender']
   Survived  PassengerId  Pclass   Age  SibSp  Parch     Fare  Port  Gender
0         0            1       3  22.0      1      0   7.2500   2.0       1
1         1            2       1  38.0      1      0  71.2833   1.0       0
0    1
1    2
Name: PassengerId, dtype: int64
("column_list: ['Survived', 'PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', "
 "'Fare', 'Port', 'Gender']")


In [7]:
df_test = pandas.read_csv('test.csv')

df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)


df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)
df_test['Port'] = df_test['Embarked'].map({'C':1, 'S':2, 'Q':3}).astype(int)

df_test = df_test.drop(['Sex', 'Embarked'], axis=1)
df_test.Fare.fillna(np.mean(df_test.Fare), inplace=True)
df_test.Age.fillna(np.mean(df_test.Age), inplace=True)
df_test.info()
test_data = df_test.values
test_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Gender         418 non-null int64
Port           418 non-null int64
dtypes: float64(2), int64(6)
memory usage: 26.2 KB


array([[  8.92000000e+02,   3.00000000e+00,   3.45000000e+01, ...,
          7.82920000e+00,   1.00000000e+00,   3.00000000e+00],
       [  8.93000000e+02,   3.00000000e+00,   4.70000000e+01, ...,
          7.00000000e+00,   0.00000000e+00,   2.00000000e+00],
       [  8.94000000e+02,   2.00000000e+00,   6.20000000e+01, ...,
          9.68750000e+00,   1.00000000e+00,   3.00000000e+00],
       ..., 
       [  1.30700000e+03,   3.00000000e+00,   3.85000000e+01, ...,
          7.25000000e+00,   1.00000000e+00,   2.00000000e+00],
       [  1.30800000e+03,   3.00000000e+00,   3.02725904e+01, ...,
          8.05000000e+00,   1.00000000e+00,   2.00000000e+00],
       [  1.30900000e+03,   3.00000000e+00,   3.02725904e+01, ...,
          2.23583000e+01,   1.00000000e+00,   1.00000000e+00]])

## Normalize & Fill

In [8]:
titanic_data.Age.fillna(np.mean(titanic_data.Age), inplace=True)
titanic_data.Port.fillna(3.0, inplace=True)

train_data = titanic_data[cols[2: ]]
train_target = titanic_data[cols[1]]

print(titanic_data.Age)                        
titanic_data.info()

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
5      29.699118
6      54.000000
7       2.000000
8      27.000000
9      14.000000
10      4.000000
11     58.000000
12     20.000000
13     39.000000
14     14.000000
15     55.000000
16      2.000000
17     29.699118
18     31.000000
19     29.699118
20     35.000000
21     34.000000
22     15.000000
23     28.000000
24      8.000000
25     38.000000
26     29.699118
27     19.000000
28     29.699118
29     29.699118
         ...    
861    21.000000
862    48.000000
863    29.699118
864    24.000000
865    42.000000
866    27.000000
867    31.000000
868    29.699118
869     4.000000
870    26.000000
871    47.000000
872    33.000000
873    47.000000
874    28.000000
875    15.000000
876    20.000000
877    19.000000
878    29.699118
879    56.000000
880    25.000000
881    33.000000
882    22.000000
883    28.000000
884    25.000000
885    39.000000
886    27.000000
887    19.000000
888    29.6991

## SciKit Learn

In [9]:
model = neighbors.KNeighborsClassifier()
print(train_data.values)
train_data.info()
print(train_target.values)
print(train_data.values)
model.fit(train_data.values, train_target.values)
print(model.fit)

output = model.predict(test_data).astype(int)

result = np.c_[test_data[:,0].astype(int), output.astype(int)]
# predictions_file = open("myfirstforest.csv", "wb")
# open_file_object = csv.writer(predictions_file)
# open_file_object.writerow(["PassengerId","Survived"])
# open_file_object.writerows(zip(ids, output))
# predictions_file.close()
# print 'Done.'

df_result = pandas.DataFrame(result[:,0:2], columns=['PassengerId', 'Survived'])
df_result.to_csv('titanic.csv', index=False) 

[[  3.          22.           1.         ...,   7.25         2.           1.        ]
 [  1.          38.           1.         ...,  71.2833       1.           0.        ]
 [  3.          26.           0.         ...,   7.925        2.           0.        ]
 ..., 
 [  3.          29.69911765   1.         ...,  23.45         2.           0.        ]
 [  1.          26.           0.         ...,  30.           1.           1.        ]
 [  3.          32.           0.         ...,   7.75         3.           1.        ]]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass    891 non-null int64
Age       891 non-null float64
SibSp     891 non-null int64
Parch     891 non-null int64
Fare      891 non-null float64
Port      891 non-null float64
Gender    891 non-null int64
dtypes: float64(3), int64(4)
memory usage: 48.8 KB
[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  2

ValueError: query data dimension must match training data dimension