In [2]:
import pandas as pd
import numpy as np

from os import sep, makedirs
from os.path import dirname, realpath, join, isdir

uppath = lambda _path, n: sep.join(_path.split(sep)[:-n]) # moves the path 'n' levels up the directory

__file__ = dirname(realpath('__file__'))
data_parent_directory = uppath(__file__, 2)
data_directory = join(data_parent_directory, 'Data')
titanic_directory = join(data_directory, 'titanic')

Exracts the csv files from the 'titanic.zip' file and stores it in the 'titanic' directory

In [3]:
from zipfile import ZipFile

def extract_titanic_data(zipfile_directory, titanic_directory):
    
    if not isdir(titanic_directory):
        makedirs(titanic_directory)
    
    titanic_path = join(zipfile_directory, 'titanic.zip')
    with ZipFile(titanic_path, 'r') as zip:
            zip.extractall(path=titanic_directory)

Loads the extracted titanic data into two Pandas DFs: train and test

In [4]:
def load_titanic_data(titanic_directory):
    train_csv = join(titanic_directory, "train.csv")
    test_csv = join(titanic_directory, "test.csv")
    return pd.read_csv(train_csv), pd.read_csv(test_csv)

In [5]:
extract_titanic_data(data_directory, titanic_directory)

train, test = load_titanic_data(titanic_directory)

In [6]:
from collections import Counter

def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers   

# detect outliers from Age, SibSp , Parch and Fare
Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])

In [7]:
train.loc[Outliers_to_drop]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
159,160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
180,181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
324,325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
846,847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0,C23 C25 C27,S
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S


In [8]:
train = train.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)

In [9]:
trainSize = train['Survived'].value_counts()[1] / train['Survived'].value_counts()[0]  
trainSize

0.6284658040665434

In [10]:
train['Survived'].value_counts()
train[train['Survived'] == 0]['Survived'].value_counts()

0    541
Name: Survived, dtype: int64

In [11]:
train['Survived']

0      0
1      1
2      1
3      1
4      0
5      0
6      0
7      0
8      1
9      1
10     1
11     1
12     0
13     0
14     0
15     1
16     0
17     1
18     0
19     1
20     0
21     1
22     1
23     1
24     0
25     1
26     0
27     1
28     0
29     0
      ..
851    0
852    0
853    1
854    0
855    1
856    1
857    0
858    0
859    1
860    0
861    1
862    0
863    0
864    1
865    1
866    0
867    0
868    0
869    1
870    1
871    0
872    0
873    0
874    0
875    0
876    0
877    1
878    0
879    1
880    0
Name: Survived, Length: 881, dtype: int64

In [12]:
test.shape

(418, 11)

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 12 columns):
PassengerId    881 non-null int64
Survived       881 non-null int64
Pclass         881 non-null int64
Name           881 non-null object
Sex            881 non-null object
Age            711 non-null float64
SibSp          881 non-null int64
Parch          881 non-null int64
Ticket         881 non-null object
Fare           881 non-null float64
Cabin          201 non-null object
Embarked       879 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 82.7+ KB


There seem to be missing values in Cabin, Age and Embarked.  Age can be predicted with SibSp, Parch, Fare.

In [14]:
train.corr().Age

PassengerId    0.034172
Survived      -0.076867
Pclass        -0.374495
Age            1.000000
SibSp         -0.307129
Parch         -0.186457
Fare           0.110219
Name: Age, dtype: float64

---------------------------------------------------------------------------------------------------------------------------
Pclass and SibSp seem to have a weak correlation with Age. I'll nevertheless use them to try and predict it. I also want to consider the effects of Parch, although the data may suggest otherwise. 

#### So here's how I'll do it:
I'll convert the numerical Age attribute into categorical: Young, middle-aged, old.
Then I'll try to predict this new catgeory with the ones mentioned before.

In [15]:
import re

# Extracts the part of the name between the comma (,) and the full-stop (.)
def extract_title(x):
    title = re.search('(.*), (.*?)\.(.*)', x)
    return title.group(2)

train['Title']  = train['Name'].apply(lambda x: extract_title(x))
test['Title'] = test['Name'].apply(lambda x: extract_title(x))

In [16]:
train['Title'].value_counts()

Mr              513
Miss            177
Mrs             125
Master           39
Dr                7
Rev               6
Major             2
Col               2
Mlle              2
Mme               1
Jonkheer          1
Ms                1
Capt              1
Lady              1
Don               1
the Countess      1
Sir               1
Name: Title, dtype: int64

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin

def get_age_category(x):
    if np.isnan(x):
        return x
    if x <=18:
        return 1
    elif x > 18 and x <= 50:
        return 2
    else:
        return 3
    
train['Age_Category'] = train['Age'].apply(lambda x:  get_age_category(x))
test['Age_Category'] = test['Age'].apply(lambda x: get_age_category(x))

def get_name_category(x):
    if x == 'Mr':
        return 0
    elif x == 'Miss':
        return 1
    elif x == 'Mrs':
        return 2
    elif x == 'Master':
        return 3
    elif type(x) == str:
        return 4
    else:
        return 5

train['Name_Category'] = train['Title'].apply(lambda x:  get_name_category(x))
test['Name_Category'] = test['Title'].apply(lambda x: get_name_category(x))


#cats = {'a':1, 'p':2, 's':3, 'c':3, '1':2, '2':4, '3':5}
cats = {'a':1, 'p':2, 's':1, 'c':1, '1':2, '2':1, '3':3}

train['Ticket_Category'] = train['Ticket'].apply(lambda x: cats[x.lower()[0]] if x.lower()[0] in cats.keys() else 3)
test['Ticket_Category'] = test['Ticket'].apply(lambda x: cats[x.lower()[0]] if x.lower()[0] in cats.keys() else 3)

train['Ticket_Category'] = train['Ticket'].apply(lambda x: 1 if x.lower()[0]  == 'a' else 0)
test['Ticket_Category'] = test['Ticket'].apply(lambda x: 1 if x.lower()[0] == 'a' else 0)

In [18]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Age_Category,Name_Category,Ticket_Category
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,2.0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,2.0,2,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,2.0,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,2.0,2,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,2.0,0,0


Loading non-null Age data.

In [23]:
age_not_null = train[~np.isnan(train['Age'])]

age_null = train[np.isnan(train['Age'])]

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit


split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(age_not_null, age_not_null["Age_Category"]):
    strat_train_set = age_not_null.iloc[train_index]
    strat_test_set = age_not_null.iloc[test_index]

predictors = ['Pclass', 'Name_Category']
targets = ['Age_Category']


age_train_pred = prepare_age_data(strat_train_set[predictors])
age_train_tar = strat_train_set[targets]

age_test_pred = prepare_age_data(strat_test_set[predictors])
age_test_tar = strat_test_set[targets]

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
#forest_clf =  GradientBoostingClassifier(subsample = 0.75, random_state=42)
scores = cross_val_score(forest_clf, age_train_pred, age_train_tar, cv=3, scoring="accuracy")
scores.mean()

'''The Accuracy is low. But that is expected, considering the low correlation. I'll anyway continue with the imputation and see how it goes. But it sure needs improvement.'''

forest_clf.fit(age_train_pred, age_train_tar)
result = forest_clf.predict(age_test_pred)

accuracy_score(age_test_tar, result)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.7762237762237763

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def prepare_age_data(X):
    
    encoded = pd.get_dummies(X, columns=[ 'Name_Category'])
    for i in range(1, 6):
        if 'Name_Category_'+str(i) not in encoded.columns:
            encoded['Name_Category_'+str(i)] = 0
    
    return encoded


def predict_age_category(X):
    predictors = ['Pclass', 'Name_Category']
    targets = ['Age_Category']

    age_not_null = train[~np.isnan(train['Age_Category'])]
    age_train_pred = prepare_age_data(age_not_null[predictors])
    age_train_tar = age_not_null[targets]
    
    forest_clf = RandomForestClassifier(n_estimators=10, random_state=42, bootstrap=False)
    forest_clf.fit(age_train_pred, age_train_tar)
    
    age_null = X[np.isnan(X['Age_Category'])]
    age_test_pred = prepare_age_data(age_null[predictors])
    
    predictions = forest_clf.predict(age_test_pred)
    X.drop(['Name_Category', 'Pclass'], axis=1, inplace=True)
    
    return predictions 


class Age_Imputer(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):

        age_result_category = predict_age_category(X)
        #X.drop('Fare', axis=1, inplace=True)

        young_median = X[X['Age'] <= 18].median().Age
        middle_aged_median = X[X['Age'] > 18][X['Age'] <= 50].median().Age
        old_median = X[X['Age'] > 50].median().Age

        age_map = {1:young_median, 2:middle_aged_median, 3:old_median}

        result_pos = -1
        for index, row in X.iterrows():
            if np.isnan(row['Age']):
                result_pos += 1
                X.at[index, 'Age'] = age_map[age_result_category[result_pos]]
        return X
    
class Age_Category_Imputer(BaseEstimator, TransformerMixin):
        def __init__(self): # no *args or **kargs
            pass
        def fit(self, X, y=None):
            return self  # nothing else to do
        def transform(self, X, y=None):

            age_result_category = predict_age_category(X)
            #X.drop(['Fare', 'Parch'], axis=1, inplace=True)
        
            result_pos = -1
            for index, row in X.iterrows():
                if np.isnan(row['Age_Category']):
                    result_pos += 1
                    X.at[index, 'Age_Category'] = age_result_category[result_pos]
            return X
        
class Child_Or_Old(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        
        age_result_category = predict_age_category(X)
            
        result_pos = -1
        for index, row in X.iterrows():
            if np.isnan(row['Age_Category']):
                result_pos += 1
                X.at[index, 'Age_Category'] = age_result_category[result_pos]
                
        X['Child_Or_Old'] = X['Age_Category'].apply(lambda x: 0 if x == 2 else 1)
        X.drop(['Age_Category'], axis=1, inplace=True)
        return X

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class Embarked_Imputer(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        
        embarked_mode = X['Embarked'].mode()[0]
        for index, row in X.iterrows():
            if not isinstance(row['Embarked'], str):
                X.at[index, 'Embarked'] = embarked_mode
        X['Embarked'] = X['Embarked'].apply(lambda x: 1 if x == 'S' else 0)
        return X


class Parch_Transform(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        X['Parch_Transformed'] = X['Parch'].apply(lambda x: 1 if x == 0 else 0)
        X.drop('Parch', inplace=True, axis=1)
        return X
        
        
class SibSp_Transform(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        X['SibSp_Transformed'] = X['SibSp'].apply(lambda x: 1 if x == 0 else 0)
        X.drop('SibSp', inplace=True, axis=1)
        return X

class SibSp_Parch_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        X['Parch_SibSp_Transformed'] = X[['Parch', 'SibSp']].apply(lambda x: 1 if x['SibSp'] == 0 and x['Parch'] == 0 else 0, axis=1)
        X.drop(['SibSp', 'Parch'], inplace=True, axis=1)
        return X

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    #('sibsp_transformer', SibSp_Transform()),
    #('parch_transformer', Parch_Transform()),
    #('sibsp_parch_transformer', SibSp_Parch_Transformer()),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ("embarked_imputer", Embarked_Imputer()),
    ("age_category_imputer", Age_Category_Imputer()),
    ("one_hot_encoding", OneHotEncoder())
])

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

age_predictors = [ 'Pclass', 'Name_Category']
num_attribs = [ 'Family_Size']
cat_attribs = ['Embarked', 'Sex', 'Age_Category', 'Ticket_Category', 'Cabin_Category']
cat_total_attribs = cat_attribs + age_predictors                   # Age_Category, Name_Category, Fare, Parch and Sex are used 
                                                                   # only for creating Child_Or_Old and Female columns

full_pipeline = ColumnTransformer([
        ("categorical", cat_pipeline, cat_total_attribs),
        ("numerical", num_pipeline, num_attribs)
    ])

In [39]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

age_not_null = train[~np.isnan(train['Age'])]

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(age_not_null, age_not_null['Age_Category']):
    not_null_train = train.iloc[train_index]
    not_null_test = train.iloc[test_index]
    
age_null = train[np.isnan(train['Age'])]
null_train, null_test = train_test_split(age_null, test_size=0.2, shuffle=True, random_state=42)

total_train = [not_null_train, null_train]
total_test = [not_null_test, null_test]

strat_train_set = pd.concat(total_train, ignore_index=True)
strat_test_set = pd.concat(total_test, ignore_index=True)

def survived_category_equalizer(train_data):
    
    trainSize = train_data['Survived'].value_counts()[1] / train_data['Survived'].value_counts()[0]
    
    trainSplit = StratifiedShuffleSplit(n_splits=1, train_size=trainSize, random_state=42)
    for train_index, test_index in split.split(train_data[train['Survived'] == 0], train_data[train_data['Survived'] == 0][['Age_Category','SibSp','Parch','Fare']]):
        updated_train_data = train_data.iloc[train_index]
    
    return updated_train_data
    

# train_prepared = full_pipeline.fit_transform(strat_train_set[num_attribs + cat_attribs + age_predictors])
# train_prepared = survived_category_equalizer(train_prepared)
train_labels = strat_train_set['Survived'].copy()

test_prepared = full_pipeline.fit_transform(strat_test_set[num_attribs + cat_attribs + age_predictors])
test_labels = strat_test_set['Survived'].copy()



In [40]:
len(not_null_test)

143

In [41]:
from sklearn.ensemble import RandomForestClassifier

final_train_prepared = full_pipeline.fit_transform(train[num_attribs + cat_attribs + age_predictors])
final_test_prepared = full_pipeline.fit_transform(test[num_attribs + cat_attribs + age_predictors])
final_train_labels = train['Survived'].copy()

forest_clf = RandomForestClassifier(n_estimators=10, random_state=42, oob_score=True) 
forest_clf.fit(final_train_prepared, final_train_labels)
final_result = forest_clf.predict(final_test_prepared)
submission_data = {'PassengerId': test['PassengerId'], 'Survived': final_result}
submission = pd.DataFrame(data=submission_data)

submission.to_csv(path_or_buf=join(titanic_directory, "submission.csv"), index=False)

submission.head()

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [42]:
submission['Survived'].value_counts()

0    260
1    158
Name: Survived, dtype: int64

Testing accuracy with train_test_split

In [43]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier
from sklearn.svm import SVC

forest_clf = RandomForestClassifier(n_estimators=10, random_state=42, oob_score=True) 
forest_clf.fit(final_train_prepared, final_train_labels)
final_result = forest_clf.predict(final_test_prepared)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [46]:
forest_clf.feature_importances_
forest_clf.oob_score_

0.7922814982973894

# Further Data Analysis

In [None]:
# print(train[train['Embarked'] == 'S'].SibSp.value_counts())
# print(train[train['Embarked'] == 'C'].SibSp.value_counts())
# print(train[train['Embarked'] == 'Q'].SibSp.value_counts())

counts = {}
count = 0
non_null = 0
for index, row in train.iterrows():
    if isinstance(row['Cabin'], str):
        non_null += 1
        count = len(row['Cabin'].split(' '))
        if count in counts.keys():
            counts[count] += 1
        else:
            counts[count] = 1
print(count, non_null, counts)

In [None]:
train['Cabin'].count()

In [29]:
def get_cabin_category(x):
    if isinstance(x, str):
        x = x[0]
        if x == 'A' or x == 'G' or x == 'T':
            return 1
        else:
            return 2
    else:
        return 3


train['Cabin_Category'] = train['Cabin'].apply(lambda x: get_cabin_category(x))
test['Cabin_Category'] = test['Cabin'].apply(lambda x: get_cabin_category(x))

In [30]:
train['Cabin_Category'].value_counts()

3    680
2    181
1     20
Name: Cabin_Category, dtype: int64

In [32]:
train['Family_Size'] = train[['Parch', 'SibSp']].apply(lambda x:  x['SibSp'] + x['Parch'] + 1, axis=1)
test['Family_Size'] = test[['Parch', 'SibSp']].apply(lambda x:  x['SibSp'] + x['Parch'] + 1, axis=1)