In [None]:
import pandas as pd
from io import StringIO

In [None]:
def read_data_from_zip(file='titanic.zip', dataset='train'):
    """Load one CSV member of a zip archive into a DataFrame.

    Parameters
    ----------
    file : str, path-like or file-like
        The zip archive; passed unchanged to ``zipfile.ZipFile``.
    dataset : str
        Member name without its extension; the member that is read is
        ``dataset + '.csv'``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the requested CSV member (decoded as UTF-8).

    Raises
    ------
    KeyError
        If the archive has no member named ``dataset + '.csv'``.
    """
    from zipfile import ZipFile

    member = dataset + '.csv'
    # Named `archive` (not `zip`) so the builtin zip() is not shadowed.
    with ZipFile(file, 'r') as archive:
        # Parse straight from the member's byte stream instead of first
        # materialising the whole file as bytes and then as a decoded string.
        with archive.open(member) as handle:
            return pd.read_csv(handle, encoding='utf-8')

In [None]:
# Load the default training split and sanity-check its size and schema.
df_train = read_data_from_zip()

expected_columns = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age',
                    'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
missing_columns = [col for col in expected_columns if col not in df_train.columns]

assert len(df_train) == 891, 'expected 891 rows, got {}'.format(len(df_train))
assert not missing_columns, 'missing columns: {}'.format(missing_columns)

# Pick rows and column in one .loc call rather than chaining two indexers.
fem = df_train.loc[df_train.Sex == 'female', 'Survived']
# 'Survived' is compared against a rate, so its mean is the share of survivors.
survival_rate = fem.mean()
assert abs(survival_rate - 0.74) < 0.03, \
    'female survival rate {} is not within 0.03 of 0.74'.format(survival_rate)

In [None]:
def preprocess_data(dataframe, features=None):
    """Select columns, drop rows without an age, and integer-encode categoricals.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is not modified.
    features : list of str, optional
        Columns to keep. Defaults to every column of ``dataframe``. The
        selection must contain 'Age', 'Sex' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        An independent copy restricted to ``features`` and to the rows whose
        'Age' is not null, with 'Sex' and 'Embarked' replaced by the integer
        codes produced by ``LabelEncoder``. Row labels of the input are kept.

    Raises
    ------
    KeyError
        If 'Age', 'Sex' or 'Embarked' is not among the selected columns.
    """
    from sklearn.preprocessing import LabelEncoder

    if features is None:
        features = list(dataframe.columns)

    selected = dataframe[features]
    # Take an explicit copy of the filtered rows: assigning columns on the raw
    # result of a .loc selection is the pattern that raises
    # SettingWithCopyWarning and leaves ownership of the data ambiguous.
    df = selected.loc[selected['Age'].notnull()].copy()

    # NOTE(review): missing values in these columns are handed to LabelEncoder
    # as-is; how (or whether) they are encoded depends on the installed
    # scikit-learn version -- confirm against the data.
    for column in ('Sex', 'Embarked'):
        # fit_transform refits on every call, so one encoder per column
        # yields the same codes as reusing a single instance.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df
    

In [None]:
# Model input columns. The target column is appended only for the
# preprocessing call so that inputs and label go through the same row filter.
features = [
    'Sex', 'Age', 'SibSp', 'Parch',
    'Pclass', 'Fare', 'Embarked',
]
columns_with_target = features + ['Survived']
df_train_clean = preprocess_data(df_train, features=columns_with_target)

# Row count expected once preprocessing has removed incomplete rows.
assert df_train_clean.shape[0] == 714

In [None]:
def train_model(data, features, target='Survived', test_size=0.25,
                random_state=42, n_estimators=100):
    """Fit a random-forest classifier and score it on a held-out split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    features : list of str
        Columns of ``data`` used as model inputs.
    target : str, default 'Survived'
        Column of ``data`` used as the label.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to both the split and the classifier.
    n_estimators : int, default 100
        Forwarded to ``RandomForestClassifier``.

    Returns
    -------
    tuple
        ``(model, accuracy)``: the fitted classifier and its accuracy score
        on the held-out part of the split.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    X = data[features]
    Y = data[target]
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    model = RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state)
    model.fit(X_train, Y_train)

    # Accuracy is measured only on rows the model did not see while fitting.
    accuracy = accuracy_score(Y_test, model.predict(X_test))
    return model, accuracy
    

In [None]:
# Fit on the cleaned frame, then require the reported accuracy to lie
# within 0.03 of the 0.78 reference value.
model, accuracy = train_model(data=df_train_clean, features=features)

accuracy_error = abs(0.78 - accuracy)
assert accuracy_error < 0.03

In [None]:

# Predict for the passenger with row label 0 and compare the prediction with
# the outcome recorded in the raw data.
passenger = df_train_clean.loc[0, features]

# Hand the model a one-row DataFrame: it keeps the column names, whereas a
# bare list of Series is converted to a nameless array (recent scikit-learn
# versions warn about that when the model was fitted on a DataFrame).
survival = model.predict(passenger.to_frame().T)

# Single .loc lookup instead of chaining .loc[0]['Survived'].
expected = df_train.loc[0, 'Survived']
assert expected == survival[0], \
    'predicted {} but recorded outcome is {}'.format(survival[0], expected)