In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Read in the files __`train.csv`__ and __`test.csv`__ from the data directory into two separate dataframes

In [None]:
# Load the training and test splits from the local data directory.
TRAIN_CSV, TEST_CSV = 'data/train.csv', 'data/test.csv'
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
# Compact overview of the raw training frame (columns, dtypes, non-null counts)
# instead of rendering the whole table; the next section shows sample rows.
train.info()

In [None]:
# Compact overview of the raw test frame (columns, dtypes, non-null counts)
# instead of rendering the whole table.
test.info()

### Investigate the structure of the data

In [None]:
# Show the first five rows of the training data.
train.head(n=5)

# Do some Data Cleanup on the Training data
### Capture the __`healed`__ column as a separate data frame and remove the __`PatientId`__, __`Name`__, and __`healed`__ columns from the training data
* (You will need to specify the axis)

In [None]:
# Keep the `healed` column as its own one-column DataFrame, then remove it
# together with the PatientId and Name columns from the training frame.
columns_to_drop = ['PatientId', 'Name', 'healed']
output = train.loc[:, ['healed']]
train = train.drop(labels=columns_to_drop, axis='columns')

### Turn the training data __`Sex`__ column into binary features

In [None]:
# Replace the Sex column with indicator columns; drop_first=True omits the
# indicator for the first category level.
categorical_columns = ['Sex']
train = pd.get_dummies(data=train, columns=categorical_columns, drop_first=True)

### Verify the new structure of the training data

In [None]:
# A handful of rows is enough to verify the new column layout.
train.head()

### Fill the null age values with something reasonable

In [None]:
# Fill every missing value with the mean of the column it belongs to.
column_means = train.mean()
train = train.fillna(value=column_means)

# Create a RandomForestClassifier and train it on the training data and output __`healed`__ column

In [None]:
# Fix the seed so the fitted forest -- and the predictions, importances and
# crosstab derived from it -- are reproducible across Restart & Run All.
forest = RandomForestClassifier(n_jobs=2, n_estimators=50, random_state=42)
# Train on the cleaned feature frame against the `healed` labels.
forest.fit(train, output['healed'])

### Clean the Test Data Like you did for the Training data
1. Pull the __`healed`__ column out of the test data, then remove it along with the __`PatientId`__ and __`Name`__ columns from the test data
2. Turn the test data __`Sex`__ column into binary features

In [None]:
# Keep the true `healed` labels aside for evaluating the predictions later.
test_output = test[['healed']]
# Apply the same column removal and Sex encoding used on the training data.
test = test.drop(['PatientId', 'Name', 'healed'], axis=1)
test = pd.get_dummies(test, columns=['Sex'], drop_first=True)
# Impute missing values exactly as was done for training. The *training*
# column means are used so no statistic is derived from the test set and both
# frames are filled with the same values.
test = test.fillna(train.mean())

### Verify the structure of the test data

In [None]:
# A handful of rows is enough to verify the cleaned test column layout.
test.head()

# Generate predictions for the test data from the RandomForestClassifier and look at them

In [None]:
# Predicted `healed` label for every row of the cleaned test frame.
preds = forest.predict(X=test)
# Echo the prediction array as the cell output.
preds

### Copy the code from the earlier RandomForestClassifier for getting the importances, indices, and features

In [None]:
# Per-feature importance scores from the fitted forest.
importances = forest.feature_importances_
# Positions that order the importances from smallest to largest.
indices = np.argsort(importances)
# Label every column the forest was trained on. A fixed `[0:4]` slice only
# works for exactly four features; with any other count, `features[indices]`
# raises IndexError or mislabels the bars.
features = train.columns

### Copy the code from the earlier RandomForestClassifier example for plotting the Features importances

In [None]:
# Horizontal bar chart of the importances, one bar per feature, in the order
# given by `indices`.
fig, ax = plt.subplots()
bar_positions = range(len(indices))
ax.barh(bar_positions, importances[indices], color='b', align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(features[indices])
ax.set_title('Feature Importances')
ax.set_xlabel('Relative Importance');

### Generate a crosstab table as in the earlier RandomForestClassifier example 

In [None]:
# Cross-tabulate the true `healed` labels (rows) against the predictions (columns).
actual_labels = test_output['healed']
pd.crosstab(
    index=actual_labels,
    columns=preds,
    rownames=['actual'],
    colnames=['preds'],
)

# If you have time, change the manner in which you handled missing age data and see if that impacts the predictions