# Import libraries

In [73]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Load the files

In [74]:
# Read the training and test sets; the relative paths are resolved against the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Define a title-correcting function

In [75]:
def title_corr(t):
    """Collapse a raw title into one of a few coarse groups.

    Parameters
    ----------
    t : str
        Title as extracted from a passenger name, e.g. ``'Mr'`` or ``'Capt'``.

    Returns
    -------
    str
        ``'Mr'``, ``'Mrs'``, ``'Miss'``, ``'Crew'`` or ``'Noble'`` for known titles.
        An unknown title is reported on stdout and returned unchanged.
    """
    # NOTE(review): 'Master' is grouped under 'Noble' here; confirm that this is intended.
    grouped = {
        'Mrs': 'Mrs', 'Mr': 'Mr', 'Miss': 'Miss',
        'Capt': 'Crew', 'Col': 'Crew', 'Major': 'Crew', 'Dr': 'Crew', 'Rev': 'Crew',
        'Jonkheer': 'Noble', 'Sir': 'Noble', 'the Countess': 'Noble',
        'Lady': 'Noble', 'Master': 'Noble',
        'Don': 'Mr',
        'Dona': 'Mrs', 'Ms': 'Mrs', 'Mme': 'Mrs',
        'Mlle': 'Miss',
    }
    if t in grouped:
        return grouped[t]
    # Unknown titles are passed through so the caller still gets a value.
    print('Title not included', t)
    return t

# Apply the title-correcting function to both datasets

In [76]:
def add_titles(df):
    """Derive the grouped title of every passenger and store it in a 'Titles' column.

    Each entry of ``df['Name']`` is split on the first comma and the following period;
    the text in between is taken as the raw title and passed through ``title_corr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Name' column. It is modified in place.

    Returns
    -------
    list of str
        The grouped titles, in row order.

    Raises
    ------
    IndexError
        If a name contains no comma.
    """
    extracted = [title_corr(name.split(',')[1].split('.')[0].strip()) for name in df['Name']]
    # DataFrame.insert raises ValueError when the column already exists, which made the
    # cell fail on a second run. Overwrite the column in that case instead.
    if 'Titles' in df.columns:
        df['Titles'] = extracted
    else:
        df.insert(3, 'Titles', extracted)
    return extracted


# Names themselves are not used as a feature, but the title embedded in them is.
titles = add_titles(train_data)
test_titles = add_titles(test_data)


In [63]:
# Summary statistics of the training set; used below to decide how to fill missing values.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [77]:
# In the describe() output the mean fare is close to the 75% quantile, so the mean is used
# to fill missing fares. The test set is filled with the mean of the training fares.
train_data['Fare'] = train_data['Fare'].fillna(value=train_data['Fare'].mean())
mean_train_fare = train_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(value=mean_train_fare)

In [78]:
# Ages below 1 are not typos: the Titanic data dictionary records the age of passengers
# younger than one year as a fraction (0.42 is roughly five months). Multiplying those
# values by 100 would turn infants into adults, so the column is kept as recorded and
# only checked against a generous plausible range (0 to 120 years).
assert train_data['Age'].dropna().between(0, 120).all(), 'implausible Age in train_data'
assert test_data['Age'].dropna().between(0, 120).all(), 'implausible Age in test_data'

# Create a function that computes the median age of a group, used to fill the missing ages

In [79]:
# in the age column mean is not close to 75% quantile, so we use median instead
def calc_age(df, cl, sx, tl):
    """Return the median age of one (class, sex, title) group of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Pclass', 'Sex', 'Titles' and 'Age' columns.
    cl, sx, tl
        Values of 'Pclass', 'Sex' and 'Titles' that identify the group.

    Returns
    -------
    float
        Median of the ages in that group.

    Raises
    ------
    KeyError
        If no row of ``df`` has that combination of class, sex and title.
    """
    # Passengers sharing class, sex and title are assumed to have similar ages.
    median_by_group = df.groupby(['Pclass', 'Sex', 'Titles'])['Age'].median()
    return median_by_group.loc[(cl, sx, tl)]

def fill_missing_age(df):
    """Fill the missing 'Age' values of ``df`` in place.

    A missing age is replaced by the median age of the passengers that share the same
    'Pclass', 'Sex' and 'Titles'. If nobody in that group has a known age, the median
    of all known ages in ``df`` is used so that no NaN is left for the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Pclass', 'Sex', 'Titles' and 'Age' columns. It is modified in place.
    """
    # transform() computes every group median once and broadcasts it back to the rows,
    # instead of re-running the group-by for each missing row.
    group_median = df.groupby(['Pclass', 'Sex', 'Titles'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median).fillna(df['Age'].median())


# Each set is filled from its own group medians.
fill_missing_age(train_data)
fill_missing_age(test_data)

In [80]:
# Merge the sibling/spouse count and the parent/children count into a single 'Family' feature.
for passengers in (train_data, test_data):
    passengers["Family"] = passengers["SibSp"] + passengers["Parch"]

In [81]:
# get_dummies one-hot encodes the categorical columns and leaves numeric columns untouched.
y = train_data["Survived"]
features = ["Pclass", "Sex", "Family", "Fare", "Titles", "Age", "Embarked"]
X = pd.get_dummies(train_data[features])
# Encoding the two sets separately can produce different columns when a category occurs in
# only one of them. Align the test matrix to the training columns: dummies missing from the
# test set become all-zero columns, and dummies unseen in training are dropped.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Create a model, fit it and predict!

In [84]:
# Hyper-parameters are collected in one place; random_state is fixed for reproducible results.
forest_params = {"n_estimators": 100, "max_depth": 5, "random_state": 1}
model = RandomForestClassifier(**forest_params)
model.fit(X, y)
my_predictions = model.predict(X_test)

In [85]:
# Write the predictions next to the passenger ids, without the index column.
output = test_data[['PassengerId']].assign(Survived=my_predictions)
output.to_csv('submission_file.csv', index=False)