In [99]:
import datetime
import pandas as pd

from google.cloud import storage

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

BUCKET_NAME = 'sandbox-226501-mlengine'


In [100]:
# Public bucket holding the census data
bucket = storage.Client().bucket(BUCKET_NAME)

# Path to the data inside the public bucket
blob = bucket.blob('titanic-train.csv')
# Download the data
blob.download_to_filename('train.csv')

# Path to the data inside the public bucket
blob = bucket.blob('titanic-test.csv')
# Download the data
blob.download_to_filename('test.csv')

In [101]:
with open('./train.csv', 'r') as train_data:
    train = pd.read_csv(train_data)
with open('./test.csv', 'r') as train_data:
    test = pd.read_csv(train_data)


In [102]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [103]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [104]:
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)

In [105]:
train['Sex'] = train['Sex'].map({'male':1, 'female':0, 1:1, 0:0})
test['Sex'] = train['Sex'].map({'male':1, 'female':0, 1:1, 0:0})

In [110]:
from sklearn.preprocessing import OneHotEncoder

train = pd.concat([train, pd.get_dummies(train['Embarked'], prefix='Embarked')],axis=1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.2500,S,0,0,1
1,1,1,0,38.0,1,0,71.2833,C,1,0,0
2,1,3,0,26.0,0,0,7.9250,S,0,0,1
3,1,1,0,35.0,1,0,53.1000,S,0,0,1
4,0,3,1,35.0,0,0,8.0500,S,0,0,1
5,0,3,1,,0,0,8.4583,Q,0,1,0
6,0,1,1,54.0,0,0,51.8625,S,0,0,1
7,0,3,1,2.0,3,1,21.0750,S,0,0,1
8,1,3,0,27.0,0,2,11.1333,S,0,0,1
9,1,2,0,14.0,1,0,30.0708,C,1,0,0


In [81]:
train['Embarked'] = train['Embarked'].fillna('U')
test['Embarked'] = test['Embarked'].fillna('U')

In [82]:
le = LabelEncoder()
train['Embarked'] = le.fit_transform(train['Embarked'])
test['Embarked'] = le.transform(test['Embarked'])

In [83]:
le.classes_

array(['C', 'Q', 'S', 'U'], dtype=object)

In [84]:
le.transform(['C','Q','S','U'])

array([0, 1, 2, 3])

In [85]:
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [86]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [87]:
train_not_null = train[-train['Age'].isnull()]

In [88]:
X = train_not_null.drop(['Age','Survived'], axis=1)
y = train_not_null['Age']
age_regressor = RandomForestRegressor(n_estimators=100)
age_regressor.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [89]:
train.loc[train['Age'].isnull(), 'Age'] = age_regressor.predict(train.loc[train['Age'].isnull()].drop(['Age','Survived'],axis=1))


In [90]:
test.loc[test['Age'].isnull(), 'Age'] = age_regressor.predict(test.loc[test['Age'].isnull()].drop(['Age'],axis=1))


In [96]:
median_fare = train['Fare'].median()
test['Fare'].fillna(median_fare, inplace=True)

In [97]:
train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [98]:
test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64