In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

In [3]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

In [100]:
# Load the training set; the path is relative to the notebook's working directory
titanic_train = pd.read_csv('data/train.csv')

In [101]:
titanic_train.shape

(891, 12)

## Functions
***

In [102]:
def fill_nulls(df, min_limit=0.20, max_limit=0.75):
    """Fill or drop the null-containing columns of ``df`` in place and report what was done.

    For every column that has at least one null, the fraction of null rows decides the action:

    * fraction >= ``max_limit``: the column is dropped from ``df``.
    * ``min_limit`` <= fraction < ``max_limit``: the column is left untouched.
    * fraction < ``min_limit``: nulls are filled with the median of the non-null values
      for numeric columns, or with the most frequent non-null value (mode) otherwise.

    Thresholds are compared against the exact fraction; the value stored in the report
    is rounded to 3 decimals for display only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place (columns filled and/or dropped).
    min_limit : float, default 0.20
        Columns whose null fraction is below this value are filled.
    max_limit : float, default 0.75
        Columns whose null fraction is at or above this value are dropped.

    Returns
    -------
    dict
        Maps each null-containing column name to a dict describing the outcome:
        ``{'Filled': False, 'Dropped': bool, 'Null_Percentage': float}`` for columns
        that were dropped or left alone, and
        ``{'Filled': True, 'Type': 'Median' | 'Mode', 'Fill_Value': ..., 'Null_Percentage': float}``
        for columns that were filled.
    """
    # Local import keeps the function self-contained
    from pandas.api.types import is_numeric_dtype

    result = {}
    n_rows = df.shape[0]

    # Snapshot the null counts up front, since columns are filled/dropped inside the loop
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]

    for col, n_null in null_counts.items():
        ratio = n_null / n_rows
        null_pct = round(float(ratio), 3)

        # Mostly-empty column: remove it from the frame
        if ratio >= max_limit:
            df.drop(columns=col, inplace=True)
            result[col] = {'Filled': False,
                           'Dropped': True,
                           'Null_Percentage': null_pct}
            continue

        non_null = df[col].dropna()

        # Too many nulls to impute safely (or nothing to impute from): report only
        if ratio >= min_limit or non_null.empty:
            result[col] = {'Filled': False,
                           'Dropped': False,
                           'Null_Percentage': null_pct}
            continue

        # is_numeric_dtype covers every float/int width, not just the platform defaults
        if is_numeric_dtype(df[col]):
            fill_type, fill_value = 'Median', non_null.median()
        else:
            fill_type, fill_value = 'Mode', non_null.mode().iloc[0]

        # Assign back instead of `df[col].fillna(..., inplace=True)`: the chained in-place
        # form operates on a temporary and does not update `df` under pandas Copy-on-Write
        df[col] = df[col].fillna(fill_value)
        result[col] = {'Filled': True,
                       'Type': fill_type,
                       'Fill_Value': fill_value,
                       'Null_Percentage': null_pct}

    return result

In [103]:
def name_to_title(df):
    """Replace the ``Name`` column of ``df`` with a normalized ``Title`` column.

    The title is the text between the first comma and the following period of each
    name (``"Braund, Mr. Owen Harris"`` -> ``"Mr"``). Titles outside the accepted set
    (or names from which no title can be extracted) are replaced by ``'Ms'`` for female
    passengers and ``'Mr'`` for everyone else.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Name`` column of strings and a ``Sex`` column. A row counts as
        female when ``Sex`` is ``0`` or the raw string ``'female'``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: ``df`` is modified in place (``Title`` added,
        ``Name`` dropped) and then returned.
    """
    # Titles kept as-is. Only the keys are consulted here; the integer codes are not
    # used by this function.
    accepted_titles = {'Mr': 0, 'Master': 1, 'Sir': 2, 'Mrs': 3, 'Ms': 4, 'Lady': 5}

    # Vectorized extraction; rows without a "<comma> title <period>" pattern become NaN
    # and are handled by the gender fallback below instead of raising
    raw_titles = df['Name'].str.extract(r',\s*([^.]+?)\s*\.', expand=False)

    # Boolean masks share df's index, so this works for any index (not only 0..n-1)
    is_unaccepted = ~raw_titles.isin(list(accepted_titles))
    is_female = (df['Sex'] == 0) | (df['Sex'] == 'female')

    # Create Title column, then overwrite unaccepted titles based on gender
    df['Title'] = raw_titles
    df.loc[is_unaccepted & is_female, 'Title'] = 'Ms'
    df.loc[is_unaccepted & ~is_female, 'Title'] = 'Mr'

    df.drop(columns='Name', inplace=True)

    # Return the (mutated) input frame
    return df

## EDA / Cleaning
***
- Age, Cabin, and Embarked contain NaN values
    - Use function to handle NaN values based on each column's NaN percentage (below .2: fill with median/mode; at or above .75: remove column; in between: leave as is)
        - Age (177) - 19.9%
        - Cabin (687) - 77%
        - Embarked (2) - 0.2%
    - Age and Embarked filled; Cabin removed
- Create dummy variables with Embarked
- Change Sex to 1/0 (male = 1, female = 0)
- Possibly remove PassengerId and Ticket
- Change Name to Titles for simpler predictions

In [104]:
# Summarize per-column null counts and dtypes to decide which columns need cleaning
print('Null Value Count:\n--------------\n', titanic_train.isnull().sum())
print('_______________________\n')
print('DTypes:\n--------------\n', titanic_train.dtypes)

Null Value Count:
--------------
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
_______________________

DTypes:
--------------
 PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [105]:
# Fill null values if null percentage is below limit (here, limit is .2); columns at or above .75 are dropped
# NOTE: fill_nulls modifies titanic_train in place and returns only a report dictionary
# Could go into more detail with the Age column, but filling its null values with the median value is acceptable
autofill_null_vals = fill_nulls(titanic_train)

In [106]:
for k, v in autofill_null_vals.items():
    print(k, v)

Age {'Filled': True, 'Type': 'Median', 'Fill_Value': 28.0, 'Null_Percentage': 0.199}
Cabin {'Filled': False, 'Dropped': True, 'Null_Percentage': 0.771}
Embarked {'Filled': True, 'Type': 'Mode', 'Fill_Value': 'S', 'Null_Percentage': 0.002}


In [107]:
# Encode Sex as an integer flag: 'female' -> 0, every other value -> 1
titanic_train['Sex'] = titanic_train['Sex'].ne('female').astype('int64')

# One-hot encode Embarked (first level dropped) and append the indicator columns.
# The prefix 'Embarked_' plus the default '_' separator yields names like 'Embarked__Q'.
embarked_dummies = pd.get_dummies(
    titanic_train['Embarked'],
    prefix='Embarked_',
    drop_first=True,
)
titanic_train = pd.concat([titanic_train, embarked_dummies], axis=1)

In [108]:
# Drop unnecessary columns - ['PassengerId', 'Ticket', 'Embarked']
# (Embarked is already represented by the dummy columns concatenated in the previous step)
titanic_train.drop(columns=['PassengerId', 'Ticket', 'Embarked'], inplace=True)

In [109]:
titanic_train.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked__Q,Embarked__S
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,0,0
2,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,0,1


In [110]:
# NOTE: name_to_title modifies its argument in place and returns it, so titanic_clean
# and titanic_train refer to the same DataFrame after this call
titanic_clean = name_to_title(titanic_train)

In [111]:
titanic_clean['Title'].unique()

array(['Mr', 'Mrs', 'Ms', 'Master', 'Lady', 'Sir'], dtype=object)