In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

In [4]:
# Makes it worse, don't scale
minmax_scaler = MinMaxScaler(feature_range=(0,1))

In [23]:
titanic_train = pd.read_csv('data/train.csv')

## EDA / Cleaning

In [24]:
# Turn Sex into Bool values
titanic_train['Sex'] = titanic_train['Sex'].map(lambda x: 0 if x == 'female' else 1)

In [25]:
# Turn Embarked into dummy variables, drop first dummy variable
embarked_dummies = pd.get_dummies(titanic_train['Embarked'], drop_first=True, prefix='Embarked_')

titanic_train = pd.concat([titanic_train, embarked_dummies], axis=1)

In [26]:
# drop unnecessary columns: ['Embarked', 'Name', 'Ticket', 'Cabin', 'Survived']
titanic_train.drop(columns=['Embarked', 'Name', 'Ticket', 'Cabin', 'Survived'], inplace=True)

In [27]:
# Scale Fare values
# titanic_train['scaler_fare'] = minmax_scaler.fit_transform(np.array(titanic_train[['Fare']]))

In [28]:
# Separate NaN values
titanic_age_null = pd.DataFrame(titanic_train[titanic_train.isnull().any(axis=1)])
titanic_age_null.drop(columns='Age', inplace=True)
titanic_age_null.head()

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Fare,Embarked__Q,Embarked__S
5,6,3,1,0,0,8.4583,1,0
17,18,2,1,0,0,13.0,0,1
19,20,3,0,0,0,7.225,0,0
26,27,3,1,0,0,7.225,0,0
28,29,3,0,0,0,7.8792,1,0


In [29]:
# Drop NaN and scale Age values
titanic_train.dropna(inplace=True)
# titanic_train['scaler_age'] = minmax_scaler.fit_transform(np.array(titanic_train[['Age']]))

In [30]:
titanic_train

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked__Q,Embarked__S
0,1,3,1,22.0,1,0,7.2500,0,1
1,2,1,0,38.0,1,0,71.2833,0,0
2,3,3,0,26.0,0,0,7.9250,0,1
3,4,1,0,35.0,1,0,53.1000,0,1
4,5,3,1,35.0,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...,...
885,886,3,0,39.0,0,5,29.1250,1,0
886,887,2,1,27.0,0,0,13.0000,0,1
887,888,1,0,19.0,0,0,30.0000,0,1
889,890,1,1,26.0,0,0,30.0000,0,0


In [31]:
# Separate and split X and y values
X = titanic_train.drop(columns='Age')
y = titanic_train['Age']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [32]:
X

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Fare,Embarked__Q,Embarked__S
0,1,3,1,1,0,7.2500,0,1
1,2,1,0,1,0,71.2833,0,0
2,3,3,0,0,0,7.9250,0,1
3,4,1,0,1,0,53.1000,0,1
4,5,3,1,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...
885,886,3,0,0,5,29.1250,1,0
886,887,2,1,0,0,13.0000,0,1
887,888,1,0,0,0,30.0000,0,1
889,890,1,1,0,0,30.0000,0,0


### Creating Model to Predict Age NaN Values

In [33]:
lr_model = LinearRegression().fit(X_train,y_train)

In [34]:
lr_model.score(X_test,y_test)

0.22461494839524598

In [35]:
gb_model = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01)

In [36]:
gb_model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [37]:
gb_model.score(X_test, y_test)

0.26588403713361997

In [None]:
y_predict = list(np.around(np.array(gb_model.predict(X_test)),2))