In [1]:
import numpy as np
import pandas as pd

### get data

In [2]:
# Read the raw training and test tables from disk.
train_data = pd.read_csv('../data/train.csv')
X_test = pd.read_csv('../data/test.csv')
print(X_test.head())

# Separate the target column from the training features.
y = train_data["Survived"]
X = train_data.drop(columns="Survived")

# Keep both feature frames in one list so the same transformation
# can be applied to each of them in a single loop.
datasets = [X, X_test]

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


#### preprocessing data ####

In [3]:
# Categorical encodings, built once rather than on every loop iteration.
sex_encoder = {'male': 0, 'female': 1}
# All codes are plain ints ('C' used to be the string '2', which only worked
# because the trailing astype(int) coerced it).
embarked_encoder = {'S': 1, 'C': 2, 'Q': 3}

for df in datasets:
    # Each frame is modified in place so the objects held in `datasets` are updated.
    df.set_index("PassengerId", inplace=True)
    # ticket is useless and cabin has too many missing values
    df.drop(["Ticket", "Cabin"], axis=1, inplace=True)

    # encoding
    df['Sex'] = df['Sex'].map(sex_encoder).astype(int)

    # Missing (or unmapped) embarkation values fall back to code 2.
    df["Embarked"] = (df["Embarked"]
        .map(embarked_encoder)
        .fillna(2)
        .astype(int)
    )

    # handle missing values
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df['Age'] / df['Fare']: the in-place form operates on a column selection,
    # warns on recent pandas and is a no-op under Copy-on-Write.
    # Each frame is imputed with its own mean / median.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

#### feature engineering ####

In [4]:
# extract title from the name
# Lookup tables are defined once, outside the loop.
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Variant spellings folded into the common titles (previously 'Mlle' was
# replaced three times and 'Ms' / 'Mme' were never handled).
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_encoder = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for df in datasets:
    # The title is the word that directly precedes a '.' in the name.
    df['Title'] = (df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        # Must be 'Rare' (capitalised) to match the key in title_encoder;
        # the lowercase 'rare' made every rare title fall through to 0.
        .replace(rare_titles, 'Rare')
        .replace(title_synonyms)
        .map(title_encoder)
        # Titles that are missing or not in the encoder get code 0.
        .fillna(0)
        .astype(int)
    )

    # The raw name is no longer needed once the title has been extracted.
    df.drop('Name', axis=1, inplace=True)

In [5]:
# discretize age
#
# Right-closed bands and their integer codes:
#   Age <= 16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, Age > 64 -> 4

age_edges = [float('-inf'), 16, 32, 48, 64, float('inf')]

for frame in datasets:
    # labels=False makes pd.cut return the band number instead of an interval.
    frame['Age'] = pd.cut(frame['Age'], bins=age_edges, labels=False).astype(int)

In [6]:
# Family Size
# Single passengers and large families (>4) have a lower chance of surviving

for frame in datasets:
    # Family size is SibSp + Parch + 1; a size of exactly 1 flags IsSingle.
    family_size = frame['SibSp'] + frame['Parch'] + 1
    frame['IsSingle'] = (family_size == 1).astype('int64')

    # Only the IsSingle flag is kept; the source columns are removed.
    frame.drop(columns=['SibSp', 'Parch'], inplace=True)
    


In [7]:
# discretize Fare
#
# Right-closed bands and their integer codes:
#   Fare <= 7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, Fare > 31 -> 3

fare_edges = [float('-inf'), 7.91, 14.454, 31, float('inf')]

for frame in datasets:
    # labels=False makes pd.cut return the band number instead of an interval.
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=False).astype(int)

In [8]:
# Preview the first frame in `datasets` after all transformations.
train_features = datasets[0]
print(train_features.head())

# NOTE(review): only the 'Age' column is written to X.csv, not the whole
# frame - confirm that this is intended.
train_features['Age'].to_csv('../data/X.csv')

             Pclass  Sex  Age  Fare  Embarked  Title  IsSingle
PassengerId                                                   
1                 3    0    1     0         1      1         0
2                 1    1    2     3         2      3         0
3                 3    1    1     1         1      2         1
4                 1    1    2     3         1      3         0
5                 3    0    2     1         1      1         1


In [9]:
# Rebind the two processed frames held in `datasets` to their own names.
X, X_test = datasets
print(X_test.head())

             Pclass  Sex  Age  Fare  Embarked  Title  IsSingle
PassengerId                                                   
892               3    0    2     0         3      1         1
893               3    1    2     0         1      3         0
894               2    0    3     1         3      1         1
895               3    0    1     1         1      1         1
896               3    1    1     1         1      3         0


### model and prediction

In [10]:
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier

# Fixed seed: without random_state the forest (and therefore the saved
# predictions) differs on every run of the notebook.
RANDOM_STATE = 42

model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X, y)
pred = model.predict(X_test)

# save prediction
# One 'Survived' column, indexed like X_test.
pred_df = pd.DataFrame(pred, index=X_test.index, columns=["Survived"])
prediction_path = Path("../predictions/randomForest4.csv")
# Create the output directory if needed so to_csv does not fail on a fresh checkout.
prediction_path.parent.mkdir(parents=True, exist_ok=True)
pred_df.to_csv(prediction_path)

