In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.style as mplstyle
mplstyle.use('ggplot')
%matplotlib inline

from sklearn.preprocessing import StandardScaler

# Second Attempt

## Loading the data

In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


## Wrangling

In [4]:
cols = ['Sex', 'Cabin', 'Embarked', 'Ticket']

for i in cols:
    df[i] = df[i].astype('category')

## Scaling

Normalize each of these then add them together to create a new feature.

$$
Pclass + Sex + Age + Embarked
$$

In [5]:
df1 = df.copy()

In [6]:
cols

['Sex', 'Cabin', 'Embarked', 'Ticket']

In [7]:
for i in cols:
    df1[i] = df1[i].values.codes

In [8]:
df1.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,523,7.25,-1,2


In [9]:
scaler = StandardScaler()

In [10]:
data = df1[cols].values
data[:5]

array([[  1,  -1,   2, 523],
       [  0,  81,   0, 596],
       [  0,  -1,   2, 669],
       [  0,  55,   2,  49],
       [  1,  -1,   2, 472]], dtype=int16)

In [11]:
scaled = scaler.fit_transform(data)
scaled[:5]



array([[ 0.73769513, -0.4624902 ,  0.58796609,  0.91896631],
       [-1.35557354,  1.6886722 , -1.91264387,  1.28262456],
       [-1.35557354, -0.4624902 ,  0.58796609,  1.64628282],
       [-1.35557354,  1.00659632,  0.58796609, -1.44232155],
       [ 0.73769513, -0.4624902 ,  0.58796609,  0.66490369]])

In [12]:
df2 = pd.DataFrame(scaled, columns=cols)
df2.head()

Unnamed: 0,Sex,Cabin,Embarked,Ticket
0,0.737695,-0.46249,0.587966,0.918966
1,-1.355574,1.688672,-1.912644,1.282625
2,-1.355574,-0.46249,0.587966,1.646283
3,-1.355574,1.006596,0.587966,-1.442322
4,0.737695,-0.46249,0.587966,0.664904


In [13]:
for i in cols:
    df1[i] = df2[i]

In [14]:
df1.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0.737695,22.0,1,0,0.918966,7.25,-0.46249,0.587966


# Principal Components

In [20]:
principal_candidates = [i for i in df1.columns if i not in 'PassengerId Survived Name']
principal_candidates

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

## Null values

In [48]:
values = {i:np.mean(df1[i].values) for i in principal_candidates}
values

{'Age': nan,
 'Cabin': 1.9936664864200341e-17,
 'Embarked': 1.2958832161730222e-17,
 'Fare': 32.204207968574636,
 'Parch': 0.38159371492704824,
 'Pclass': 2.308641975308642,
 'Sex': -1.1563265621236197e-16,
 'SibSp': 0.52300785634118963,
 'Ticket': 6.7784660538281162e-17}

In [49]:
df2 = df1.fillna(value=values)

In [50]:
X = df2[principal_candidates].values
X[0]

array([  3.        ,   0.73769513,  22.        ,   1.        ,
         0.        ,   0.91896631,   7.25      ,  -0.4624902 ,   0.58796609])

In [51]:
from sklearn.decomposition import PCA

In [52]:
pca = PCA(n_components=2)

In [53]:
X2D = pca.fit_transform(X)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### I found no `np.nan` in `X.flatten()`. What's going on?

## Viz

Disregard this for now.

In [15]:
survival_colors = ['r', 'b']
colors = pd.Categorical.from_codes(df1['Survived'].values, survival_colors)
print(set(list(zip(colors[:5],df1['Survived'].head()))))

{('r', 0), ('b', 1)}


This means

- Red = Died
- Blue = Survived