In [63]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [64]:
# Read both CSV splits from the working directory in one pass.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [65]:
# wrapper func for data processing
def preprocess(data, train=True):
    """Clean a raw passenger frame and return model-ready features.

    Steps, in order:
      1. drop the ``Ticket``, ``Cabin`` and ``Name`` columns;
      2. fill missing ``Age`` and ``Fare`` with the mean of that column in
         ``data`` itself (so each frame passed in is imputed with its own mean);
      3. fill missing ``Embarked`` with ``'S'``;
      4. integer-encode ``Embarked`` (C=0, Q=1, S=2) and ``Sex``
         (female=0, male=1).

    The caller's frame is never modified: ``drop`` returns a new frame and
    all later assignments happen on that copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above plus ``PassengerId``
        (and ``Survived`` when ``train`` is true).
    train : bool, default True
        When true, ``Survived`` is split off as the label vector.

    Returns
    -------
    (X, Y) when ``train`` is true, where ``X`` is the feature frame without
    ``Survived`` / ``PassengerId`` and ``Y`` is ``data['Survived'].values``;
    otherwise just ``X`` (the feature frame without ``PassengerId``).

    Raises
    ------
    ValueError
        If ``Embarked`` or ``Sex`` contains a value outside the known
        categories (after the ``Embarked`` fill), since it could not be
        encoded consistently.
    """
    # drop useless data
    data = data.drop(['Ticket', 'Cabin', 'Name'], axis=1)

    # Assign the filled column back instead of `data[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary under pandas
    # Copy-on-Write and would silently leave the NaNs in `data`.
    data['Age'] = data['Age'].fillna(data['Age'].mean())
    data['Embarked'] = data['Embarked'].fillna('S')
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

    # Fixed codes rather than a per-call fitted encoder: fitting separately on
    # train and test would assign different integers to the same category
    # whenever one split is missing a value. These codes equal what an
    # alphabetical label encoding yields when every category is present.
    category_codes = {
        'Embarked': {'C': 0, 'Q': 1, 'S': 2},
        'Sex': {'female': 0, 'male': 1},
    }
    for column, codes in category_codes.items():
        encoded = data[column].map(codes)
        unmapped = encoded.isna()
        if unmapped.any():
            unknown = sorted(set(data.loc[unmapped, column].astype(str)))
            raise ValueError(
                "Unexpected value(s) in column %r: %s" % (column, unknown)
            )
        data[column] = encoded.astype('int64')

    if train:
        Y = data['Survived'].values
        # identifiers and the label are not features
        X = data.drop(["Survived", "PassengerId"], axis=1)
        return X, Y

    # test-style frame: no label column, only the identifier to remove
    X = data.drop(["PassengerId"], axis=1)
    return X

In [66]:
# Run the shared cleaning step on both splits: the test split yields features
# only, the training split yields features plus the label vector.
X_test = preprocess(test, train=False)
X, Y = preprocess(train, train=True)

In [67]:
# split and train
# Fixed seed so the hold-out split, the fitted forest and the accuracy shown
# below are identical on every Restart & Run All.
RANDOM_STATE = 42

X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_STATE
)

clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=RANDOM_STATE,
)

clf.fit(X_train, Y_train)

# Accuracy on the 20% hold-out; kept as the last expression so the cell displays it.
pred = clf.predict(X_val)
accuracy_score(Y_val, pred)

0.8156424581005587

In [68]:
# Score the test features with the fitted classifier and pair each prediction
# with the passenger id it belongs to.
pred = clf.predict(X_test)
passenger_ids = test["PassengerId"].values
df = pd.DataFrame({"PassengerId": passenger_ids, "Survived": pred})

In [69]:
# Persist the submission table without the row index column.
df.to_csv(path_or_buf="submissions.csv", index=False)