In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# Polyfit - Estimating Ages via Polynomial Fit

We try to estimate the missing ages by fitting a polynomial to other columns of the data. The function `titanic.polyfit.polyfit()` implements the code written in this notebook.

See the `polyfit()` documentation for more information.

In this notebook we demonstrate fitting the `Age` data using the `Fare` and `Pclass` data with a degree-four polynomial. We use both the training and test data to create the model since, in this example, we don't use `Survived` for the prediction. Note that one `Fare` value is missing and must be filled in beforehand; we simply forward-fill it.

In [2]:
from titanic.polyfit import polyfit

#polyfit?

In [3]:
# Load the training and test sets and stack them into a single frame.
# `Age` is read as float64 in both files. One `Fare` value is missing in
# the combined data (see the info() summary below).
#
# Note: the row labels of the two files are kept as-is by the
# concatenation, so index labels repeat in `df`.
#
age_dtype = {"Age": np.float64}
df_train = pd.read_csv("../input/train.csv", dtype=age_dtype)
df_test = pd.read_csv("../input/test.csv", dtype=age_dtype)
df = pd.concat([df_train, df_test])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [4]:
# Fill the single missing fare by carrying the previous row's value
# forward. The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the `df['Fare']` selection, so the update
# does not depend on that selection being a view of `df`.
#
df['Fare'] = df['Fare'].ffill()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [5]:
# Call polyfit: model `ylabel` (Age) as a polynomial of degree `deg` in
# the `xlabels` columns. With inplace=False the filled data comes back
# as a new frame (`df_filled`); `df` is inspected again in the next cell
# for comparison.
#
xlabels = ['Fare', 'Pclass']
ylabel = 'Age'
deg = 4
df_filled = polyfit(df, xlabels, ylabel, deg, inplace=False)

df_filled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1309 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [6]:
# Inspect the original frame again to compare it against `df_filled`:
# the polyfit(..., inplace=False) call above returned its result as a
# separate frame, so `Age` in `df` should still contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


# Notebook Version

In [7]:
# Split the combined data into rows whose age is known (used to fit the
# model) and rows whose age is missing (to be predicted).
#
age_is_known = pd.notnull(df.Age)
known_ages = df[age_is_known]
unknown_ages = df[~age_is_known]

# Report the shapes. A single pre-formatted argument is passed to print
# so the text is identical under both the Python 2 print statement and
# the Python 3 print function.
#
for label, frame in [('Concatenated data:', df),
                     ('Known ages:       ', known_ages),
                     ('Unknown ages:     ', unknown_ages)]:
    print('%s %s' % (label, frame.shape))

Concatenated data: (1309, 12)
Known ages:        (1046, 12)
Unknown ages:      (263, 12)


In [9]:
# NOTE: MinMaxScaler is imported but not used in this cell; no feature
# scaling is applied below.
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression

# Columns used to predict the age, and the degree of the polynomial.
# NOTE(review): the polyfit() call earlier in this notebook uses deg = 4,
# whereas this cell uses degree 3 -- confirm which one is intended.
#
features = ['Fare', 'Pclass']
poly_degree = 3

# Extract the feature matrices and the target column as numpy arrays.
# Any gap in the feature columns is forward-filled on a copy of just
# those columns, so `known_ages` / `unknown_ages` themselves (and their
# other columns, e.g. Cabin or Survived) are left unmodified.
#
X_known = known_ages.loc[:, features].ffill().values
Y_known = known_ages.loc[:, 'Age'].values.reshape(-1, 1)
X_unknown = unknown_ages.loc[:, features].ffill().values

# PolynomialFeatures - computes all monomials required for the
# requested degree from the X-data and stores the result in a matrix
# that is then used for linear regression. The transformer is fitted on
# the known-age rows only and merely applied to the unknown-age rows.
#
poly = PolynomialFeatures(poly_degree)
_X_known = poly.fit_transform(X_known)
_X_unknown = poly.transform(X_unknown)

# The model is now linear in the monomials: fit on the known ages and
# predict the unknown ones.
#
clf = LinearRegression()
clf.fit(_X_known, Y_known)
Y_unknown = clf.predict(_X_unknown)

# Score (R^2) evaluated on the same rows the model was fitted on. A
# single pre-formatted argument keeps the printed text identical on
# Python 2 and Python 3.
#
print('Score: %s' % clf.score(_X_known, Y_known))

Score: 0.219170622732
