# Kaggle: Titanic Survival Prediction

## Import Libraries

In [515]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Import Dataset

In [516]:
# Load the Titanic training data from the local CSV file.
train_dataset = pd.read_csv("data/titanic_train.csv")

# Column list noted for the CSV (the "Survived" target used below is not
# part of this note):
# PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
important_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Cabin"]
# Positions inside important_columns of Pclass, Age, SibSp, Parch and Fare,
# i.e. the columns that are later median-imputed.
int_columns = [0, 2, 3, 4, 5]

# Feature matrix (selected columns only) and target vector as NumPy arrays.
X = train_dataset.loc[:, important_columns].to_numpy()
y = train_dataset["Survived"].to_numpy()

## Data Preprocessing

### Account for Missing Data

In [517]:
from sklearn.impute import SimpleImputer

# Fill missing values in the numeric columns with the per-column median.
# NOTE(review): the imputer is fitted on the whole dataset, i.e. before the
# train/test split performed further down in the notebook.
imputer = SimpleImputer(strategy="median", missing_values=np.nan)
X[:, int_columns] = imputer.fit_transform(X[:, int_columns])

# Replace every Cabin entry (last column) by the number of space-separated
# tokens it contains, or by 0 when the entry is missing.
X[:, -1] = [len(cabin.split(' ')) if pd.notnull(cabin) else 0 for cabin in X[:, -1]]

# Discard the rows whose Embarked value (second-to-last column) is missing,
# applying the same mask to y so features and targets stay aligned.
X_embarked = pd.isnull(X[:, -2])
X = X[np.logical_not(X_embarked)]
y = y[np.logical_not(X_embarked)]

### Encode Categorical Data

In [518]:
def encode_labels(column):
    """Build a deterministic ``value -> integer code`` mapping for a column.

    Each distinct value in ``column`` is assigned a code in ``0..k-1``.
    Codes are handed out in sorted order of the distinct values rather than
    in ``set`` iteration order: string hashing is randomised per interpreter
    run, so relying on set order would make the encoding differ between
    kernel restarts.

    Parameters
    ----------
    column : iterable of hashable
        The raw categorical values (duplicates allowed, may be empty).

    Returns
    -------
    dict
        Mapping from each distinct value to its integer code. Empty when
        ``column`` is empty.
    """
    unique_values = set(column)
    try:
        ordered_values = sorted(unique_values)
    except TypeError:
        # Values of mutually incomparable types (e.g. str mixed with numbers):
        # fall back to a stable ordering by type name, then string form.
        ordered_values = sorted(
            unique_values, key=lambda value: (type(value).__name__, str(value))
        )
    return {value: code for code, value in enumerate(ordered_values)}

In [519]:
# Build value -> integer code lookups for the two categorical features:
# column 1 holds "Sex" and column -2 holds "Embarked" (see important_columns).
sex_map = encode_labels(X[:, 1])
embarked_map = encode_labels(X[:, -2])

In [520]:
# Overwrite the categorical columns with their integer codes. dict.get is
# used for the lookup, so a value absent from the mapping becomes None.
X[:, -2] = list(map(embarked_map.get, X[:, -2]))
X[:, 1] = list(map(sex_map.get, X[:, 1]))

### Split Dataset

In [521]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Feature Scaling

In [522]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Train Model

In [523]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 estimators; the seed is fixed so training is
# repeatable.
classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
# Left as the cell's final bare expression so the fitted estimator is displayed.
classifier.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

### Make Predictions

In [524]:
# Predict labels for the held-out test features with the fitted classifier.
y_pred = classifier.predict(X_test)

In [525]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Print the accuracy on the test split, then the confusion matrix, each
# starting on its own line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

0.7752808988764045
[[85 24]
 [16 53]]
