# INITIAL CLEANING

In [42]:
import pandas as pd
import numpy as np

In [43]:
df = pd.read_csv('data/source/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Initial Drop

In [44]:
column_drop = [
                'PassengerId',
                'Ticket',
                'Name',
              ]
df.drop(column_drop, axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


## Cabin Manipulation

In [45]:
cabin_map = lambda x: str(x[0]) if not pd.isnull(x) else np.NaN
df['Cabin'] = df['Cabin'].apply(cabin_map)
df = pd.concat([df, pd.get_dummies(df['Cabin'], prefix='Cabin')], axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T
0,0,3,male,22.0,1,0,7.25,,S,0,0,0,0,0,0,0,0
1,1,1,female,38.0,1,0,71.2833,C,C,0,0,1,0,0,0,0,0
2,1,3,female,26.0,0,0,7.925,,S,0,0,0,0,0,0,0,0
3,1,1,female,35.0,1,0,53.1,C,S,0,0,1,0,0,0,0,0
4,0,3,male,35.0,0,0,8.05,,S,0,0,0,0,0,0,0,0


## Sex Manipulation

In [46]:
gender_map = lambda x: 1 if str(x) == 'female' else 0
df['Sex'] = df['Sex'].apply(gender_map)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T
0,0,3,0,22.0,1,0,7.25,,S,0,0,0,0,0,0,0,0
1,1,1,1,38.0,1,0,71.2833,C,C,0,0,1,0,0,0,0,0
2,1,3,1,26.0,0,0,7.925,,S,0,0,0,0,0,0,0,0
3,1,1,1,35.0,1,0,53.1,C,S,0,0,1,0,0,0,0,0
4,0,3,0,35.0,0,0,8.05,,S,0,0,0,0,0,0,0,0


## Embarked Manipulation

In [47]:
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Emb')], axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Emb_C,Emb_Q,Emb_S
0,0,3,0,22.0,1,0,7.25,,S,0,0,0,0,0,0,0,0,0,0,1
1,1,1,1,38.0,1,0,71.2833,C,C,0,0,1,0,0,0,0,0,1,0,0
2,1,3,1,26.0,0,0,7.925,,S,0,0,0,0,0,0,0,0,0,0,1
3,1,1,1,35.0,1,0,53.1,C,S,0,0,1,0,0,0,0,0,0,0,1
4,0,3,0,35.0,0,0,8.05,,S,0,0,0,0,0,0,0,0,0,0,1


## Fare Manipulation

In [48]:
df['Fare_round'] = df['Fare'].apply(np.around)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Cabin_A,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Emb_C,Emb_Q,Emb_S,Fare_round
0,0,3,0,22.0,1,0,7.25,,S,0,...,0,0,0,0,0,0,0,0,1,7.0
1,1,1,1,38.0,1,0,71.2833,C,C,0,...,1,0,0,0,0,0,1,0,0,71.0
2,1,3,1,26.0,0,0,7.925,,S,0,...,0,0,0,0,0,0,0,0,1,8.0
3,1,1,1,35.0,1,0,53.1,C,S,0,...,1,0,0,0,0,0,0,0,1,53.0
4,0,3,0,35.0,0,0,8.05,,S,0,...,0,0,0,0,0,0,0,0,1,8.0


In [49]:
column_drop_final = [
                'Cabin',
                'Embarked',
              ]
df.drop(column_drop_final, axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Emb_C,Emb_Q,Emb_S,Fare_round
0,0,3,0,22.0,1,0,7.25,0,0,0,0,0,0,0,0,0,0,1,7.0
1,1,1,1,38.0,1,0,71.2833,0,0,1,0,0,0,0,0,1,0,0,71.0
2,1,3,1,26.0,0,0,7.925,0,0,0,0,0,0,0,0,0,0,1,8.0
3,1,1,1,35.0,1,0,53.1,0,0,1,0,0,0,0,0,0,0,1,53.0
4,0,3,0,35.0,0,0,8.05,0,0,0,0,0,0,0,0,0,0,1,8.0


In [50]:
df.isnull().any()

Survived      False
Pclass        False
Sex           False
Age            True
SibSp         False
Parch         False
Fare          False
Cabin_A       False
Cabin_B       False
Cabin_C       False
Cabin_D       False
Cabin_E       False
Cabin_F       False
Cabin_G       False
Cabin_T       False
Emb_C         False
Emb_Q         False
Emb_S         False
Fare_round    False
dtype: bool

In [51]:
df.to_csv('data/temp/stage1.csv', index=False)

# DATA INTERPOLATION

In [52]:
from sklearn.linear_model import LinearRegression

In [53]:
df_interpol = df
df_interpol.dropna(inplace=True)
df_interpol.reset_index()
df_interpol.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin_A       0
Cabin_B       0
Cabin_C       0
Cabin_D       0
Cabin_E       0
Cabin_F       0
Cabin_G       0
Cabin_T       0
Emb_C         0
Emb_Q         0
Emb_S         0
Fare_round    0
dtype: int64

In [54]:
regr = LinearRegression()

input_list = [
    'Survived',
    'Pclass',
    'Sex',
    'Sibsp',
    'Parch',
    'Fare',
    'Emb_C',
    'Emb_Q',
    'Emb_S'
]

# Train the model using the training sets
regr.fit(df_interpol.loc[:,input_list], df_interpol.loc[:,['Age']])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  app.launch_new_instance()
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').