In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
df = pd.read_csv('to_find_age.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 20 columns):
Unnamed: 0               1309 non-null int64
Age                      1046 non-null float64
Cabin                    1309 non-null object
Parch                    1309 non-null int64
Pclass                   1309 non-null int64
SibSp                    1309 non-null int64
Survived                 1309 non-null int64
C                        1309 non-null int64
Q                        1309 non-null int64
S                        1309 non-null int64
Cabin_letter             1309 non-null int64
title                    1309 non-null int64
sex                      1309 non-null int64
Fare_bin_id              1309 non-null int64
Fare_0                   1309 non-null int64
Fare_1                   1309 non-null int64
Fare_2                   1309 non-null int64
Fare_3                   1309 non-null int64
Fare_4                   1309 non-null int64
Pclass_Fare_Category2    1309 non-n

In [4]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,Age,Cabin,Parch,Pclass,SibSp,Survived,C,Q,S,Cabin_letter,title,sex,Fare_bin_id,Fare_0,Fare_1,Fare_2,Fare_3,Fare_4,Pclass_Fare_Category2
0,22.0,U0,0,3,1,0,0,0,1,0,0,1,0,1,0,0,0,0,5
1,38.0,C85,0,1,1,1,1,0,0,1,1,0,1,0,1,0,0,0,1
2,26.0,U0,0,3,0,1,0,0,1,0,2,0,2,0,0,1,0,0,5
3,35.0,C123,0,1,1,1,0,0,1,1,1,0,1,0,1,0,0,0,1
4,35.0,U0,0,3,0,0,0,0,1,0,0,1,2,0,0,1,0,0,5


In [6]:
missing_age_train = df[df['Age'].notnull()]
missing_age_test = df[df['Age'].isnull()]

In [7]:
missing_age_train.head()

Unnamed: 0,Age,Cabin,Parch,Pclass,SibSp,Survived,C,Q,S,Cabin_letter,title,sex,Fare_bin_id,Fare_0,Fare_1,Fare_2,Fare_3,Fare_4,Pclass_Fare_Category2
0,22.0,U0,0,3,1,0,0,0,1,0,0,1,0,1,0,0,0,0,5
1,38.0,C85,0,1,1,1,1,0,0,1,1,0,1,0,1,0,0,0,1
2,26.0,U0,0,3,0,1,0,0,1,0,2,0,2,0,0,1,0,0,5
3,35.0,C123,0,1,1,1,0,0,1,1,1,0,1,0,1,0,0,0,1
4,35.0,U0,0,3,0,0,0,0,1,0,0,1,2,0,0,1,0,0,5


In [8]:
X_train = missing_age_train[['Cabin_letter', 'C', 'Q', 'S', 'title', 'SibSp', 'Fare_0', 'Fare_1', 'Fare_2', 'Fare_3', 'Fare_4', 'Pclass_Fare_Category2']]
y_train = missing_age_train['Age']

In [9]:
X_test = missing_age_test[['Cabin_letter', 'C', 'Q', 'S', 'title', 'SibSp', 'Fare_0', 'Fare_1', 'Fare_2', 'Fare_3', 'Fare_4', 'Pclass_Fare_Category2']]

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [11]:
RF_para_grid = {'n_estimators' : [150, 200, 250,300], 
                'criterion': ['mse'],
                'max_depth' : [6,7,8],
                'max_features': ['auto',6,7,8]}

In [12]:
RF = RandomForestRegressor() 
RF_Grid = GridSearchCV(RF, RF_para_grid, cv = 5, scoring = 'neg_mean_squared_error')
RF_Grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [150, 200, 250, 300], 'criterion': ['mse'], 'max_depth': [6, 7, 8], 'max_features': ['auto', 6, 7, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [13]:
print('Age Feature Best RF Params:' + str(RF_Grid.best_params_))
print('Age Feature Best Score:' + str(RF_Grid.best_score_))

Age Feature Best RF Params:{'criterion': 'mse', 'max_depth': 7, 'max_features': 6, 'n_estimators': 200}
Age Feature Best Score:-118.67142752855567


In [14]:
RF_Grid_Result = pd.DataFrame(RF_Grid.cv_results_)



In [15]:
RF_Grid_Result = RF_Grid_Result[['params', 'rank_test_score', 'mean_test_score', 'mean_train_score']]

In [16]:
RF_Grid_Result.sort_values('rank_test_score', axis=0, ascending=True, inplace=True)

In [17]:
RF_Grid_Result.head()

Unnamed: 0,params,rank_test_score,mean_test_score,mean_train_score
21,"{'criterion': 'mse', 'max_depth': 7, 'max_feat...",1,-118.671428,-90.8896
26,"{'criterion': 'mse', 'max_depth': 7, 'max_feat...",2,-118.863129,-90.194273
9,"{'criterion': 'mse', 'max_depth': 6, 'max_feat...",3,-118.9104,-97.466195
23,"{'criterion': 'mse', 'max_depth': 7, 'max_feat...",4,-118.99378,-91.065944
27,"{'criterion': 'mse', 'max_depth': 7, 'max_feat...",5,-119.050369,-90.228684


In [18]:
RF_Grid_Result.to_csv('RF_Grid_Result.csv')

In [19]:
RF_Grid_Best = RandomForestRegressor(n_estimators = 150, criterion='mse', max_depth = 7, max_features = 6)

In [20]:
RF_Grid_Best.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=150, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [21]:
from sklearn.externals import joblib

In [23]:
joblib.dump(RF_Grid_Best, 'RF_BEST_AGE.pkl')

['RF_BEST_AGE.pkl']

In [24]:
from sklearn.svm import SVR

In [25]:
SVR_Grid_Param = [{'kernel' : ['rbf'], 'gamma': ['scale'],'C': [0.01, 0.1, 1, 5, 10, 20, 100]},
                  {'kernel' : ['poly'], 'degree': [4, 5, 6, 7], 'gamma': ['scale'],'coef0':[-1, 0, 1], 'C':[0.01, 0.1, 1, 5, 10, 20, 100]
                  }]

In [26]:
svr = SVR()

In [27]:
SVR_Grid = GridSearchCV(svr, SVR_Grid_Param, cv = 5, scoring = 'neg_mean_squared_error')
SVR_Grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['rbf'], 'gamma': ['scale'], 'C': [0.01, 0.1, 1, 5, 10, 20, 100]}, {'kernel': ['poly'], 'degree': [4, 5, 6, 7], 'gamma': ['scale'], 'coef0': [-1, 0, 1], 'C': [0.01, 0.1, 1, 5, 10, 20, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [126]:
print('Age Feature Best SVR Params:' + str(SVC_Grid.best_params_))
print('Age Feature Best Score:' + str(SVC_Grid.best_score_))

Age Feature Best SVR Params:{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Age Feature Best Score:-134.94492204126803


In [29]:
SVR_Grid_Result = pd.DataFrame(SVR_Grid.cv_results_)



In [30]:
SVR_Grid_Result = SVR_Grid_Result[['params', 'rank_test_score', 'mean_test_score', 'mean_train_score']]

In [31]:
SVR_Grid_Result.sort_values('rank_test_score', axis=0, ascending=True, inplace=True)

In [32]:
SVR_Grid_Result.head(10)

Unnamed: 0,params,rank_test_score,mean_test_score,mean_train_score
6,"{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}",1,-134.944922,-107.698185
51,"{'C': 5, 'coef0': 1, 'degree': 4, 'gamma': 'sc...",2,-140.355832,-115.903465
29,"{'C': 0.1, 'coef0': 1, 'degree': 6, 'gamma': '...",3,-140.550982,-121.779873
40,"{'C': 1, 'coef0': 1, 'degree': 5, 'gamma': 'sc...",4,-140.609994,-115.669851
5,"{'C': 20, 'gamma': 'scale', 'kernel': 'rbf'}",5,-141.043206,-121.819869
39,"{'C': 1, 'coef0': 1, 'degree': 4, 'gamma': 'sc...",6,-142.650359,-126.811376
63,"{'C': 10, 'coef0': 1, 'degree': 4, 'gamma': 's...",7,-143.192659,-111.607325
4,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",8,-143.945107,-128.716422
28,"{'C': 0.1, 'coef0': 1, 'degree': 5, 'gamma': '...",9,-144.733701,-131.884582
3,"{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}",10,-147.415259,-135.911984


In [33]:
SVR_Grid_Result.to_csv('SVR_Grid_Result.csv')

In [34]:
SVR_Grid_Best = SVR(kernel = 'rbf', C = 100, gamma = 'scale') 

In [35]:
SVR_Grid_Best.fit(X_train, y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [36]:
joblib.dump(SVR_Grid_Best, 'SVR_BEST_AGE.pkl')

['SVR_BEST_AGE.pkl']

In [37]:
from sklearn.ensemble import GradientBoostingRegressor

In [45]:
GBR_Grid_param = [{'loss' : ['ls'],
                  'learning_rate' : [0.01, 0.1, 1],
                  'n_estimators' : [100, 150, 200, 300, 350],
                  'criterion': ['friedman_mse'],
                  'max_depth': [5, 6, 7],
                  'max_features': ['auto', 6,7,8]},
                 {'loss' : ['lad'],
                  'learning_rate' : [0.01, 0.1, 1],
                  'n_estimators' : [100, 150, 200, 300, 350],
                  'criterion': ['mae'],
                  'max_depth': [ 5, 6, 7],
                  'max_features': ['auto', 6,7,8]}
                 ]

In [46]:
GBR = GradientBoostingRegressor(random_state = 42)
GBR_grid = GridSearchCV(GBR, GBR_Grid_param, cv = 5, scoring = 'neg_mean_squared_error')

In [47]:
GBR_grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...te=42, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'loss': ['ls'], 'learning_rate': [0.01, 0.1, 1], 'n_estimators': [100, 150, 200, 300, 350], 'criterion': ['friedman_mse'], 'max_depth': [5, 6, 7], 'max_features': ['auto', 6, 7, 8]}, {'loss': ['lad'], 'learning_rate': [0.01, 0.1, 1], 'n_estimators': [100, 150, 200, 300, 350], 'criterion': ['mae'], 'max_depth': [5, 6, 7], 'max_features': ['auto', 6, 7, 8]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_err

In [48]:
GBR_grid_result = pd.DataFrame(GBR_grid.cv_results_)



In [50]:
GBR_grid_result.sort_values('rank_test_score', ascending=True, inplace= True)

In [53]:
GBR_grid_result = GBR_grid_result[['params', 'rank_test_score', 'mean_test_score', 'mean_train_score']]

In [54]:
GBR_grid_result.to_csv('GBR_Grid_Result.csv')

In [55]:
joblib.dump(GBR_grid.best_estimator_, 'GBR_BEST_AGE.pkl')

['GBR_BEST_AGE.pkl']

In [56]:
RF_best_model = joblib.load('RF_BEST_AGE.pkl')
SVR_best_model = joblib.load('SVR_BEST_AGE.pkl')
GBR_best_model = joblib.load('GBR_BEST_AGE.pkl')

In [59]:
X_test.head()

Unnamed: 0,Cabin_letter,C,Q,S,title,SibSp,Fare_0,Fare_1,Fare_2,Fare_3,Fare_4,Pclass_Fare_Category2,RF_Age
5,0,0,1,0,0,0,0,0,1,0,0,5,31.556833
17,0,0,0,1,0,0,0,0,0,1,0,3,32.113511
19,0,1,0,0,1,0,1,0,0,0,0,5,32.09339
26,0,1,0,0,0,0,1,0,0,0,0,5,27.645623
28,0,0,1,0,2,0,0,0,1,0,0,5,22.268607


In [64]:
X_test['RF_Age'] = RF_best_model.predict(X_test[['Cabin_letter', 'C', 'Q', 'S', 'title', 'SibSp', 'Fare_0', 'Fare_1', 'Fare_2', 'Fare_3', 'Fare_4', 'Pclass_Fare_Category2']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [61]:
X_test['SVR_Age'] = SVR_best_model.predict(X_test[['Cabin_letter', 'C', 'Q', 'S', 'title', 'SibSp', 'Fare_0', 'Fare_1', 'Fare_2', 'Fare_3', 'Fare_4', 'Pclass_Fare_Category2']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [63]:
X_test['GBR_Age'] = GBR_best_model.predict(X_test[['Cabin_letter', 'C', 'Q', 'S', 'title', 'SibSp', 'Fare_0', 'Fare_1', 'Fare_2', 'Fare_3', 'Fare_4', 'Pclass_Fare_Category2']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [65]:
X_test.head()

Unnamed: 0,Cabin_letter,C,Q,S,title,SibSp,Fare_0,Fare_1,Fare_2,Fare_3,Fare_4,Pclass_Fare_Category2,RF_Age,SVR_Age,GBR_Age
5,0,0,1,0,0,0,0,0,1,0,0,5,31.556833,29.954672,30.355797
17,0,0,0,1,0,0,0,0,0,1,0,3,32.113511,30.10035,32.134562
19,0,1,0,0,1,0,1,0,0,0,0,5,32.09339,24.129579,31.436754
26,0,1,0,0,0,0,1,0,0,0,0,5,27.645623,23.600305,27.690705
28,0,0,1,0,2,0,0,0,1,0,0,5,22.268607,19.099905,22.658978


In [71]:
X_test.drop('SVR_Age', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [72]:
X_test.head()

Unnamed: 0,Cabin_letter,C,Q,S,title,SibSp,Fare_0,Fare_1,Fare_2,Fare_3,Fare_4,Pclass_Fare_Category2,RF_Age,GBR_Age,Age
5,0,0,1,0,0,0,0,0,1,0,0,5,31.556833,30.355797,28.746285
17,0,0,0,1,0,0,0,0,0,1,0,3,32.113511,32.134562,28.746285
19,0,1,0,0,1,0,1,0,0,0,0,5,32.09339,31.436754,28.746285
26,0,1,0,0,0,0,1,0,0,0,0,5,27.645623,27.690705,28.746285
28,0,0,1,0,2,0,0,0,1,0,0,5,22.268607,22.658978,28.746285


In [77]:
X_test.loc[5,'RF_Age']

31.556832760070854

In [80]:
X_test['Age'] = X_test.apply(lambda x : np.mean([x['RF_Age'], x['GBR_Age']]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [81]:
X_test.head()

Unnamed: 0,Cabin_letter,C,Q,S,title,SibSp,Fare_0,Fare_1,Fare_2,Fare_3,Fare_4,Pclass_Fare_Category2,RF_Age,GBR_Age,Age
5,0,0,1,0,0,0,0,0,1,0,0,5,31.556833,30.355797,30.956315
17,0,0,0,1,0,0,0,0,0,1,0,3,32.113511,32.134562,32.124037
19,0,1,0,0,1,0,1,0,0,0,0,5,32.09339,31.436754,31.765072
26,0,1,0,0,0,0,1,0,0,0,0,5,27.645623,27.690705,27.668164
28,0,0,1,0,2,0,0,0,1,0,0,5,22.268607,22.658978,22.463793
