In [388]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing




In [389]:
# Load the Kaggle Titanic train and test CSVs into DataFrames.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run on another machine without editing them.
train = pd.read_csv('/Users/mk/Downloads/titanic/train.csv')
test = pd.read_csv('/Users/mk/Downloads/titanic/test.csv')

# Preview the first rows of the raw training data.
train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [390]:
def impute_missing(df):
    """Fill missing Age, Fare and Embarked values of ``df`` in place.

    * Age      -> the mean Age of this frame, then truncated to int.
    * Fare     -> 0, then truncated to int.
    * Embarked -> 'S'.

    The frame is modified directly; nothing is returned.
    """
    # The mean is taken before filling, so it only reflects the observed ages.
    mean_age = df['Age'].mean()

    # astype(int) discards the fractional part of each value.
    df['Age'] = df['Age'].fillna(mean_age).astype(int)
    df['Fare'] = df['Fare'].fillna(0).astype(int)
    df['Embarked'] = df['Embarked'].fillna('S')

def encode_labels(df):
    """Replace the 'Sex' and 'Embarked' columns of ``df`` with integer codes, in place.

    The codes are fixed rather than learned from ``df``:

    * Sex:      female -> 0, male -> 1
    * Embarked: C -> 0, Q -> 1, S -> 2

    These are the codes an alphabetically sorted label encoder assigns when
    every category is present. Fitting a fresh encoder on each frame makes the
    codes depend on which categories that frame happens to contain, so the
    train and test frames could be encoded inconsistently. A fixed mapping
    guarantees the same value always gets the same code.

    Raises:
        ValueError: if either column holds a value outside the mapping
            (including missing values). Nothing is modified in that case.
    """
    category_codes = {
        'Sex': {'female': 0, 'male': 1},
        'Embarked': {'C': 0, 'Q': 1, 'S': 2},
    }

    # Validate every column before touching any, so a failure leaves df intact.
    for column, codes in category_codes.items():
        is_known = df[column].isin(list(codes))
        if not is_known.all():
            unexpected = df.loc[~is_known, column].unique().tolist()
            raise ValueError(
                "Unexpected value(s) in column '%s': %r (expected one of %r)"
                % (column, unexpected, sorted(codes))
            )

    for column, codes in category_codes.items():
        df[column] = df[column].map(codes).astype('int64')

def calc_fare_per_passenger(df):
    """Add 'Relatives' and 'Fare_Per_Passenger' columns to ``df`` in place.

    'Relatives' is SibSp + Parch. 'Fare_Per_Passenger' is the fare divided by
    the party size (relatives plus the passenger), truncated to int, which
    approximates a per-person cost instead of a per-family one.
    """
    df['Relatives'] = df['SibSp'] + df['Parch']

    # +1 counts the passenger themselves alongside their relatives.
    party_size = df['Relatives'] + 1
    per_person_fare = df['Fare'] / party_size
    df['Fare_Per_Passenger'] = per_person_fare.astype(int)
    
def log_age_fare(df):
    """Replace 'Age' and 'Fare' in ``df`` with log(1 + value), in place.

    np.log1p is used, so a value of 0 maps to 0.
    """
    for column in ('Age', 'Fare'):
        df[column] = np.log1p(df[column])

def drop_features(df):
    """Remove the Name, Parch, SibSp, Ticket and Cabin columns from ``df`` in place.

    PassengerId is kept. Nothing is returned.
    """
    unused_columns = ['Name', 'Parch', 'SibSp', 'Ticket', 'Cabin']
    df.drop(columns=unused_columns, inplace=True)




In [391]:
def final(df):
    """Run the whole preprocessing pipeline on ``df`` in place.

    Steps, in order: impute_missing, encode_labels, calc_fare_per_passenger,
    log_age_fare, drop_features. Nothing is returned.
    """
    # Order matters: the per-passenger fare is computed from the un-logged Fare
    # and needs SibSp/Parch, so it runs before log_age_fare and drop_features.
    pipeline = (
        impute_missing,
        encode_labels,
        calc_fare_per_passenger,
        log_age_fare,
        drop_features,
    )
    for step in pipeline:
        step(df)

In [392]:
# Preprocess both frames. final() returns nothing and modifies each frame in place.
# NOTE: this cell cannot be re-run as is. The pipeline drops columns it reads
# (e.g. SibSp, Parch), so the CSVs must be reloaded first.
final(train)
final(test)
# Preview the preprocessed training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Relatives,Fare_Per_Passenger
0,1,0,3,1,3.135494,2.079442,2,1,3
1,2,1,1,0,3.663562,4.276666,0,1,35
2,3,1,3,0,3.295837,2.079442,2,0,7
3,4,1,1,0,3.583519,3.988984,2,1,26
4,5,0,3,1,3.583519,2.197225,2,0,8


In [393]:
# Baseline: a random forest with fixed, un-tuned hyperparameters, fitted on
# every feature currently present in the preprocessed frames.

# PassengerId is an identifier and Survived is the target; neither is a feature.
X_train = train.drop(columns=['PassengerId', 'Survived'])
y_train = train['Survived']
X_test = test.drop(columns=['PassengerId'])

baseline_model = RandomForestClassifier(n_estimators=1000, max_depth=6, random_state=42)
baseline_model.fit(X_train, y_train)
predictions = baseline_model.predict(X_test)

# Index the predictions by PassengerId so it is written as the first CSV column.
output = pd.DataFrame({'Survived': predictions}, index=test['PassengerId'].astype(int))
output.to_csv('Titanic_0411_Baseline_Prediction.csv')



In [394]:
# #drops Survived from train to get X
# X = train.drop(['Survived'], axis = 1)

# #Takes Survived column from train to get y
# y = train['Survived']

In [395]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [396]:
# Show the fitted baseline model's feature importances.
# NOTE(review): the values presumably follow the column order of X_train;
# confirm against X_train.columns before interpreting them.
baseline_model.feature_importances_

array([0.1113031 , 0.40854762, 0.12523105, 0.13452743, 0.02766215,
       0.07123269, 0.12149596])

In [397]:
#Notes:

#The baseline model above yields the best accuracy when submitted to Kaggle. The hyperparameters from gridsearch result 
#in a slightly lower score. 

#taking log of age and fare, and then including another feature for fare per passenger, yields the best results so far. 

#just having age(log) and then fare yields better results than age(log) and fare per passenger

#age(log), then just having fare per passenger yields worse results

#hyperparameter grid search suggests max_features = 'sqrt', but it yields worse or equal results

#taking log of fare_per_passenger has little effect

#adding side lowers score by a small amount

#Removing all features but age sex and fare yields poor results

#mean better than median for age
