### Titanic dataset from https://www.kaggle.com/c/titanic

### Exploratory analysis and machine learning

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


### Load dataset

In [3]:
# Read the Kaggle train and test tables from CSV files in the working directory.
data_train, data_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

### Data analysis

In [None]:
# Preview the first rows of the training frame.
data_train.head()

In [None]:
# Print the summary of the training frame (columns, dtypes, non-null counts).
data_train.info()

In [None]:
# Split the target from the features and drop the identifier-like columns that
# are not used as model inputs. The axis is passed by keyword: the positional
# form `drop(labels, 1)` is deprecated and rejected by recent pandas releases.
X_train = data_train.drop(['Survived', 'PassengerId', 'Name', 'Ticket'], axis=1)
y_train = data_train['Survived']

# The test table has no 'Survived' column, so only the identifiers are dropped.
X_test = data_test.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

In [None]:
# Histogram of the 'Age' column of the training data.
data_train.plot(
    kind='hist',
    y='Age',
    title='Age distribution',
    color='blue',
    figsize=(7, 5),
)

In [None]:
# Scatter of fare against age; axis labels are set on the returned axes object.
# NOTE(review): the name `plot` rebinds the `plot` function that `%pylab inline`
# places in the namespace - consider a different name if nothing depends on it.
plot = data_train.plot(
    kind='scatter',
    x='Age',
    y='Fare',
    title='scatter plot',
    figsize=(7, 5),
)
plot.set_xlabel(u'Age')
plot.set_ylabel(u'Fare')

In [None]:
# Age distribution for each value of 'Survived'.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
box = sns.boxplot(x=data_train['Survived'], y=data_train['Age'])
box.set_ylabel(u'Age')
box.set_xlabel(u'Survived')

In [None]:
# Pairwise relationships between the columns of the training frame.
# Missing values are replaced with 0 before plotting.
# NOTE(review): the 0 fill shows up as real data points in the plots - confirm intended.
sns.pairplot(data=data_train.fillna(value=0))

### Prepare data

In [None]:
# Columns treated as numeric features; every other feature column is
# treated as categorical.
numeric_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
# Built in frame order rather than via a set difference, so the column order
# is the same on every run instead of depending on string hashing.
categorical_cols = [col for col in X_train.columns if col not in numeric_cols]

In [None]:
def calculate_means(numeric_data):
    """Return the mean of every column of ``numeric_data``, ignoring NaNs.

    Parameters
    ----------
    numeric_data : pandas.DataFrame
        Frame whose columns are all numeric. It is not modified.

    Returns
    -------
    pandas.Series
        Float means indexed by the column names of ``numeric_data``. A column
        with no non-missing values yields NaN instead of raising.
    """
    # DataFrame.mean skips missing values, works for any index labels and
    # leaves the caller's data untouched.
    return numeric_data.mean(axis=0, skipna=True).astype(float)

In [None]:
# Row positions of the train and test parts inside the stacked frame. They are
# derived from the frame lengths, so they are correct for any index labels.
n_train, n_test = len(X_train), len(X_test)
ids_train = range(n_train)
ids_test = range(n_train, n_train + n_test)

# Stack train and test so both parts get the same imputation and encoding.
X_all = pd.concat([X_train, X_test], ignore_index=True)

# Numeric features: missing values are replaced with the column mean.
# NOTE(review): the means are computed over train and test rows together - confirm intended.
X_real_mean = X_all[numeric_cols].fillna(calculate_means(X_all[numeric_cols]))

# Categorical features: missing values become the literal category "NA" and
# every value is converted to a string.
X_cat = X_all[categorical_cols].fillna("NA").astype(str)

In [None]:
from sklearn.feature_extraction import DictVectorizer as DV

# One-hot encode the categorical columns. The rows are handed over as a list
# of {column: value} records, which keeps them in frame order; going through
# `X_cat.T.to_dict().values()` relied on dict iteration order to keep the
# encoded rows aligned with the numeric block.
encoder = DV(sparse = False)
X_cat_oh = encoder.fit_transform(X_cat.to_dict(orient='records'))

In [None]:
# Join the imputed numeric columns and the one-hot columns side by side
# (numeric columns first).
X_all_matrix = np.concatenate((X_real_mean, X_cat_oh), axis=1)

In [None]:
# Cut the stacked matrix back into its train and test rows.
# NOTE(review): this rebinds X_train / X_test from the earlier frames to arrays.
X_train, X_test = X_all_matrix[ids_train], X_all_matrix[ids_test]

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that scores obtained with this forest are reproducible
# from one run of the notebook to the next.
rf = RandomForestClassifier(random_state=42)

### Evaluation

In [None]:
# `sklearn.cross_validation` was removed from scikit-learn; the same helpers
# live in `sklearn.model_selection`. The fallback keeps old installs working.
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:  # very old scikit-learn releases only ship the old module
    from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import f1_score

# Mean score over 10 cross-validation folds on the full training matrix.
mean_accuracy = cross_val_score(rf, X_train, y_train, cv=10).mean()

# Hold out 40% of the training rows for validation; the seed fixes the split.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X_train, y_train, test_size=0.4, random_state=42)

# Fresh, seeded forest fitted on the 60% part and scored on the held-out part.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_val, y_train_val)
roc_auc = roc_auc_score(y_test_val, rf.predict_proba(X_test_val)[:, 1])
f1 = f1_score(y_test_val, rf.predict(X_test_val))

# Single-argument print calls produce the same output on Python 2 and Python 3.
print("mean accuracy = {}".format(mean_accuracy))
print("roc_auc = {}".format(roc_auc))
print("F1 = {}".format(f1))

#### ROC curve

In [None]:
plt.figure(figsize=(6, 4))
plt.subplot(1, 1, 1)

# The curve is drawn from the held-out rows only. `rf` was fitted on
# X_train_val, so scoring the whole of X_train would mix in rows the model
# has already seen and would not match the hold-out roc_auc reported above.
fpr, tpr, thr = roc_curve(y_test_val, rf.predict_proba(X_test_val)[:, 1])
plt.plot(fpr, tpr, label="auc roc curve")

plt.xlabel("false positive rate")
plt.ylabel("true positive rate")
plt.legend(loc=4)
# Small margin around the unit square so the curve end points stay visible.
plt.axis([-0.1, 1.1, -0.1, 1.1])
plt.show()

In [None]:
def get_meshgrid(data, step=.05, border=.5,):
    """Build a coordinate grid covering the first two columns of ``data``.

    The grid spans from ``min - border`` to ``max + border`` (end excluded)
    of column 0 and column 1, sampled every ``step``. Returns the pair of
    coordinate arrays produced by ``np.meshgrid``.
    """
    first_axis = np.arange(data[:, 0].min() - border, data[:, 0].max() + border, step)
    second_axis = np.arange(data[:, 1].min() - border, data[:, 1].max() + border, step)
    return np.meshgrid(first_axis, second_axis)

In [None]:
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score

# Two-colour maps used by the decision-surface plot: `colors` for the scatter
# points and `light_colors` for the background mesh.
colors = ListedColormap(colors=['blue', 'yellow'])
light_colors = ListedColormap(colors=['lightblue', 'lightyellow'])

def plot_decision_surface(estimator, train_data, train_labels, test_data, test_labels, 
                          colors = colors, light_colors = light_colors):
    """Fit ``estimator`` and draw its decision regions for two features.

    The estimator is (re)fitted on ``train_data`` / ``train_labels``. Its
    predictions over a grid built from the first two columns of
    ``train_data`` are drawn as a coloured background, once with the training
    points on top and once with the test points. Each panel title reports the
    accuracy on the points it shows.

    Parameters
    ----------
    estimator : object with ``fit`` and ``predict``
        Model to fit and visualise; it is modified by the call to ``fit``.
    train_data, test_data : 2-D arrays
        Columns 0 and 1 are used as plot coordinates.
    train_labels, test_labels : array-like
        Labels used to colour the points and to compute accuracy.
    colors, light_colors : matplotlib colormaps
        Colormaps for the points and for the background mesh.
    """
    estimator.fit(train_data, train_labels)

    # `plt` is the alias imported by the file; the bare name `pyplot` is only
    # available when `%pylab inline` has populated the namespace.
    plt.figure(figsize = (16, 6))

    # The grid and its predictions come from the training data and are reused
    # as the background of both panels.
    xx, yy = get_meshgrid(train_data)
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    mesh_predictions = np.array(estimator.predict(grid_points)).reshape(xx.shape)

    panels = (('Train', train_data, train_labels), ('Test', test_data, test_labels))
    for position, (panel_name, data, labels) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.pcolormesh(xx, yy, mesh_predictions, cmap = light_colors)
        plt.scatter(data[:, 0], data[:, 1], c = labels, s = 100, cmap = colors)
        accuracy = accuracy_score(labels, estimator.predict(data))
        plt.title('{} data, accuracy={:.2f}'.format(panel_name, accuracy))

In [None]:
# Decision surface of a forest trained on the first two feature columns only.
plot_decision_surface(
    RandomForestClassifier(min_samples_leaf=3),
    X_train_val[:, :2],
    y_train_val,
    X_test_val[:, :2],
    y_test_val,
)

### Prepare the submission file for upload to Kaggle

In [None]:
# Passenger ids of the test rows, in file order.
ids_arr = data_test['PassengerId'].tolist()

# Refit on the complete training matrix for the final predictions; the fixed
# seed makes the submitted answers reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predicted labels as plain Python ints.
answers_arr = rf.predict(X_test).astype("int").tolist()

In [None]:
# Assemble the two submission columns and write them to answer.csv without
# the index column.
d = dict(PassengerId=ids_arr, Survived=answers_arr)

df = pd.DataFrame(d, index=ids_arr)
df.to_csv("answer.csv", columns=['PassengerId', 'Survived'], index=False)