## Preliminaries

In [4]:
# Install a conda package in the current Jupyter kernel
# import sys
# !conda install --yes --prefix {sys.prefix} seaborn

In [5]:
# Install a pip package in the current Jupyter kernel
# import sys
# !{sys.executable} -m pip install scipy

In [6]:
# !echo $PATH

In [8]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets
from sklearn.preprocessing import StandardScaler

from sklearn import preprocessing

## Load Titanic: Machine Learning from Disaster


In [9]:
df_train = pd.read_csv('../train.csv')

In [10]:
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### Change data types for categorical features

In [11]:
df_train['Sex'].astype('category')
df_train['Pclass'].astype('category')
df_train['Embarked'].astype('category')

0      S
1      C
2      S
3      S
4      S
5      Q
6      S
7      S
8      S
9      C
10     S
11     S
12     S
13     S
14     S
15     S
16     Q
17     S
18     S
19     C
20     S
21     S
22     Q
23     S
24     S
25     S
26     C
27     S
28     Q
29     S
      ..
861    S
862    S
863    S
864    S
865    S
866    C
867    S
868    S
869    S
870    S
871    S
872    S
873    S
874    C
875    C
876    S
877    S
878    S
879    C
880    S
881    S
882    S
883    S
884    S
885    Q
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: category
Categories (3, object): [C, Q, S]

## Preprocessing

In [12]:
df_train.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

### Count grouped by PassengerID and survival

In [13]:
df_train['PassengerId'].groupby(df_train['Survived']).count()

Survived
0    549
1    342
Name: PassengerId, dtype: int64

### Count grouped by Pclass and survival

In [14]:
df_train['PassengerId'].groupby([df_train['Pclass'], df_train['Survived']]).count()

Pclass  Survived
1       0            80
        1           136
2       0            97
        1            87
3       0           372
        1           119
Name: PassengerId, dtype: int64

### Count grouped by Sex and survival

In [15]:
df_train['PassengerId'].groupby([df_train['Sex'], df_train['Survived']] ).count()

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: PassengerId, dtype: int64

### Count grouped by Age and survival

### Filtered Age for missing data

In [16]:
df_train_age = df_train[df_train.Age.isna() == False]

In [17]:
df_train_age['PassengerId'].groupby(df_train_age['Age']).count().sort_values(ascending = False)

Age
24.00    30
22.00    27
18.00    26
30.00    25
19.00    25
28.00    25
21.00    24
25.00    23
36.00    22
29.00    20
26.00    18
27.00    18
32.00    18
35.00    18
16.00    17
31.00    17
23.00    15
33.00    15
20.00    15
34.00    15
39.00    14
17.00    13
42.00    13
40.00    13
45.00    12
38.00    11
2.00     10
4.00     10
50.00    10
48.00     9
         ..
28.50     2
30.50     2
40.50     2
45.50     2
55.00     2
0.83      2
57.00     2
59.00     2
63.00     2
64.00     2
0.75      2
70.00     2
71.00     2
10.00     2
0.92      1
0.67      1
80.00     1
12.00     1
14.50     1
20.50     1
23.50     1
24.50     1
74.00     1
34.50     1
36.50     1
53.00     1
55.50     1
66.00     1
70.50     1
0.42      1
Name: PassengerId, Length: 88, dtype: int64

In [18]:
df_train_age['PassengerId'].groupby([df_train_age['Age'], df_train_age['Survived']] ).count().sort_values(ascending = False)

Age    Survived
21.00  0           19
28.00  0           18
25.00  0           17
18.00  0           17
19.00  0           16
22.00  0           16
24.00  1           15
       0           15
30.00  0           15
20.00  0           12
26.00  0           12
29.00  0           12
27.00  1           11
16.00  0           11
35.00  1           11
22.00  1           11
36.00  0           11
       1           11
23.00  0           10
30.00  1           10
31.00  0            9
32.00  0            9
33.00  0            9
34.00  0            9
32.00  1            9
39.00  0            9
19.00  1            9
18.00  1            9
29.00  1            8
31.00  1            8
                   ..
30.50  0            2
40.50  0            2
7.00   1            1
0.67   1            1
3.00   0            1
6.00   0            1
0.92   1            1
80.00  1            1
11.00  1            1
12.00  1            1
70.50  0            1
66.00  0            1
55.50  0            1
55.00  1        

### Count grouped by # of siblings and survival

In [19]:
df_train['PassengerId'].groupby([df_train['SibSp'], df_train['Survived']] ).count()

SibSp  Survived
0      0           398
       1           210
1      0            97
       1           112
2      0            15
       1            13
3      0            12
       1             4
4      0            15
       1             3
5      0             5
8      0             7
Name: PassengerId, dtype: int64

### Count grouped by # of Parents and survival

In [20]:
df_train['PassengerId'].groupby([df_train['Parch'], df_train['Survived']] ).count()

Parch  Survived
0      0           445
       1           233
1      0            53
       1            65
2      0            40
       1            40
3      0             2
       1             3
4      0             4
5      0             4
       1             1
6      0             1
Name: PassengerId, dtype: int64

### Count grouped by cabin and survival

In [21]:
df_train_cabin = df_train[df_train.Cabin.isna() == False]

In [22]:
df_train_cabin['PassengerId'].groupby(df_train_cabin['Cabin']).count().sort_values(ascending = False)

Cabin
B96 B98            4
G6                 4
C23 C25 C27        4
F2                 3
C22 C26            3
D                  3
F33                3
E101               3
C125               2
D33                2
B58 B60            2
D36                2
B49                2
D26                2
B5                 2
D20                2
D17                2
B51 B53 B55        2
B57 B59 B63 B66    2
D35                2
B77                2
C93                2
C92                2
B28                2
C83                2
C78                2
C68                2
C65                2
C52                2
C2                 2
                  ..
D47                1
D45                1
C148               1
D37                1
C30                1
C32                1
C45                1
C46                1
C47                1
C49                1
C50                1
C54                1
C7                 1
C70                1
C82                1
C85                1
C86    

In [23]:
df_train_cabin['Cabin'].groupby([df_train_cabin['Cabin'], df_train_cabin['Survived']] ).count()

Cabin  Survived
A10    0           1
A14    0           1
A16    1           1
A19    0           1
A20    1           1
A23    1           1
A24    0           1
A26    1           1
A31    1           1
A32    0           1
A34    1           1
A36    0           1
A5     0           1
A6     1           1
A7     0           1
B101   1           1
B102   0           1
B18    1           2
B19    0           1
B20    1           2
B22    0           1
       1           1
B28    1           2
B3     1           1
B30    0           1
B35    1           2
B37    0           1
B38    0           1
B39    1           1
B4     1           1
                  ..
E25    1           2
E31    0           1
E33    1           2
E34    1           1
E36    1           1
E38    0           1
E40    1           1
E44    0           1
       1           1
E46    0           1
E49    1           1
E50    1           1
E58    0           1
E63    0           1
E67    0           1
       1          

### Count grouped by embarked and survival

In [24]:
df_train_embarked = df_train[df_train.Embarked.isna() == False]

In [25]:
df_train_embarked['Embarked'].groupby([df_train_embarked['Embarked'], df_train_embarked['Survived']] ).count()

Embarked  Survived
C         0            75
          1            93
Q         0            47
          1            30
S         0           427
          1           217
Name: Embarked, dtype: int64

In [26]:
# Load data with only certain features
X = df_train[['Pclass', 'Sex', 'Age','SibSp','Parch', 'Embarked']][df_train.Age.isna() == False]
y = df_train.Survived[df_train.Age.isna() == False]

### Convert categorial data to numerical code

### Sex
male = 1
female = 0

### Embarked
C = 0
Q = 1
S = 2

In [27]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,22.0,1,0,S
1,1,female,38.0,1,0,C
2,3,female,26.0,0,0,S
3,1,female,35.0,1,0,S
4,3,male,35.0,0,0,S


In [28]:
X[X.isnull().any(axis=1)]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
61,1,female,38.0,0,0,
829,1,female,62.0,0,0,


In [29]:
X["Embarked"].value_counts()

S    554
C    130
Q     28
Name: Embarked, dtype: int64

For the sake of simplicity, just fill in the value with the S (since that is the most common value)

In [30]:
X = X.fillna({"Embarked": "S"})

It worked

In [31]:
X[X.isnull().any(axis=1)]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked


In [32]:
X.dtypes

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Embarked     object
dtype: object

Convert the objects to category to encode

In [33]:
X["Sex"] = X["Sex"].astype('category')
X["Embarked"] = X["Embarked"].astype('category')
X.dtypes

Pclass         int64
Sex         category
Age          float64
SibSp          int64
Parch          int64
Embarked    category
dtype: object

In [34]:
X["Sex_cat"] = X["Sex"].cat.codes
X["Embarked_cat"] = X["Embarked"].cat.codes
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Sex_cat,Embarked_cat
0,3,male,22.0,1,0,S,1,2
1,1,female,38.0,1,0,C,0,0
2,3,female,26.0,0,0,S,0,2
3,1,female,35.0,1,0,S,0,2
4,3,male,35.0,0,0,S,1,2


In [35]:
X_encoded = X[['Pclass', 'Sex_cat', 'Age','SibSp','Parch', 'Embarked_cat']]

### Standardize Features

In [36]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X_encoded)

In [38]:
print('Dimension of data is ', np.shape(X_std))
print('Dimension of target is ', np.shape(y))

Dimension of data is  (714, 6)
Dimension of target is  (714,)


### Load Testing data set

In [41]:
df_test = pd.read_csv('../test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [42]:
print(np.shape(df_test))

(418, 11)


In [43]:
X_test = df_test[['Pclass', 'Sex', 'Age', 'SibSp','Parch', 'Embarked']][df_test.Age.isna() == False]
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,34.5,0,0,Q
1,3,female,47.0,1,0,S
2,2,male,62.0,0,0,Q
3,3,male,27.0,0,0,S
4,3,female,22.0,1,1,S


In [45]:
X_test[X_test.isnull().any(axis=1)]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked


In [46]:
X_test.dtypes

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Embarked     object
dtype: object

In [47]:
X_test["Sex"] = X_test["Sex"].astype('category')
X_test["Embarked"] = X_test["Embarked"].astype('category')
X_test.dtypes

Pclass         int64
Sex         category
Age          float64
SibSp          int64
Parch          int64
Embarked    category
dtype: object

In [48]:
X_test["Sex_cat"] = X_test["Sex"].cat.codes
X_test["Embarked_cat"] = X_test["Embarked"].cat.codes
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Sex_cat,Embarked_cat
0,3,male,34.5,0,0,Q,1,1
1,3,female,47.0,1,0,S,0,2
2,2,male,62.0,0,0,Q,1,1
3,3,male,27.0,0,0,S,1,2
4,3,female,22.0,1,1,S,0,2


In [49]:
X_test_encoded = X_test[['Pclass', 'Sex_cat', 'Age','SibSp','Parch', 'Embarked_cat']]
X_test_encoded.head()

Unnamed: 0,Pclass,Sex_cat,Age,SibSp,Parch,Embarked_cat
0,3,1,34.5,0,0,1
1,3,0,47.0,1,0,2
2,2,1,62.0,0,0,1
3,3,1,27.0,0,0,2
4,3,0,22.0,1,1,2


In [50]:
X_test_std = scaler.fit_transform(X_test_encoded)
X_test_std

array([[ 1.01232494,  0.78709097,  0.29854934, -0.55218394, -0.49119871,
        -0.5110897 ],
       [ 1.01232494, -1.27050117,  1.18132793,  0.59359773, -0.49119871,
         0.65111428],
       [-0.17109717,  0.78709097,  2.24066224, -0.55218394, -0.49119871,
        -0.5110897 ],
       ...,
       [ 1.01232494, -1.27050117, -0.16049553, -0.55218394, -0.49119871,
         0.65111428],
       [-1.35451929, -1.27050117,  0.61634963, -0.55218394, -0.49119871,
        -1.67329369],
       [ 1.01232494,  0.78709097,  0.58103849, -0.55218394, -0.49119871,
         0.65111428]])

### Save preprocessed data

In [52]:
np.savetxt('data_preproc_train.txt', X_std, fmt='%d')
np.savetxt('target_train.txt', y, fmt='%d')

In [51]:
np.savetxt('data_preproc_test.txt', X_test_std, fmt='%d')