<a href="https://colab.research.google.com/github/paton838/works/blob/master/data-science/titanic/titanic_clean_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Source: https://www.kaggle.com/c/titanic

In [0]:
import pandas as pd
import numpy as np
import re as re

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

In [0]:
# read in the data
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
TestPassengerId = test['PassengerId']

In [0]:
# data cleaning: for both train & test dataset =======================
full_data = [train, test]

In [0]:
# Companion should have an effect on survival rate
# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['Companion'] = dataset['SibSp'] + dataset['Parch']

In [0]:
# Ability to get helps from family should have an effect on survival rate
# Create new feature IsAlone from Companion
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['Companion'] == 0, 'IsAlone'] = 1

In [0]:
# Fill NaN values in Embarked to S
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [0]:
median_fare = (pd.DataFrame(full_data[0]))['Fare'].median()
median_fare

# Fill NaN values in Fare with median values
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(median_fare)

In [0]:
# Create a New feature CategoricalAge
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

In [0]:
# change the Title
# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Create a new feature Title, containing the titles of passenger names
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

In [0]:
# find the special people
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

In [0]:
# Encoding our features
for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 10, 'Fare']                             = 0
    dataset.loc[(dataset['Fare'] > 10) & (dataset['Fare'] <= 20), 'Fare']   = 1
    dataset.loc[(dataset['Fare'] > 20) & (dataset['Fare'] <= 30), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 30, 'Fare']                              = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age']                          = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 35), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 35) & (dataset['Age'] <= 50), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 50) & (dataset['Age'] <= 65), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 65, 'Age']

In [0]:
# Feature Selection
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp','Parch']
train = train.drop(drop_elements, axis=1)
test  = test.drop(drop_elements, axis=1)

In [138]:
# get some sample to ensure the df makes sense
train.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Companion,IsAlone,Title
355,0,3,1,1,0,0,0,1,1
634,0,3,0,0,2,0,5,0,2
13,0,3,1,2,3,0,6,0,1
869,1,3,1,0,1,0,2,0,4
777,1,3,0,0,1,0,0,1,2


In [0]:
x_train = train[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Companion', 'IsAlone', 'Title']]
y_train = train['Survived'].ravel()

In [140]:
# model =========================
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import math

gbm = LogisticRegression(random_state=0).fit(x_train, y_train)

my_pipeline = make_pipeline(gbm)
scores = (-cross_val_score(my_pipeline, x_train, y_train, scoring='neg_mean_absolute_error', cv=15))
print('Score of the 15 runs:')
print(scores)

Score of the 15 runs:
[0.21666667 0.21666667 0.18333333 0.2        0.26666667 0.13333333
 0.20338983 0.16949153 0.22033898 0.18644068 0.16949153 0.20338983
 0.23728814 0.16949153 0.22033898]


In [141]:
print('Average score:')
print(scores.mean())

Average score:
0.19975517890772126


In [142]:
x_test = test[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Companion', 'IsAlone', 'Title']]

predictions = gbm.predict(x_test)
print(predictions)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 1 1 0 0 1 0 0 1 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [143]:
# for competition
submission = pd.DataFrame({ 'PassengerId': TestPassengerId, 'Survived': predictions })
print(submission.sample(5))

     PassengerId  Survived
213         1105         1
314         1206         1
165         1057         1
204         1096         0
300         1192         0


In [0]:
# submission.to_csv("submission.csv", index=False)