In [None]:
import time

import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, log_loss

In [None]:
# Load the dataset from transformed.csv; the path is relative to the notebook's working directory.
dframe = pd.read_csv('../data/transformed.csv')

In [None]:
# Feature matrix: every column of dframe except the target column 'OutcomeType'.
X = dframe.drop('OutcomeType', axis=1)
# Display the feature column names.
X.columns

In [None]:
# Target vector: the 'OutcomeType' column.
y = dframe['OutcomeType']

In [None]:
# Random forest with class weights balanced against class frequency.
# random_state is fixed (same seed as the train/test split and gbm) so that
# re-running the notebook reproduces the same forest and the same scores.
rfc = RandomForestClassifier(class_weight="balanced", random_state=7)

In [None]:
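## not working: X is a DataFrame, so X[train] looks up column labels rather than row positions; positional row selection would need X.iloc[train] / y.iloc[train]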

# kfold = cross_validation.KFold(len(X), n_folds=5)

# [rfc.fit(X[train], y[train]).score(X[test], y[test]) for train, test in kfold]

In [None]:
# cross_val_score() does the 5-fold cross-validation attempted in the previous cell.
# OutcomeType has more than two classes, so the averaging is spelled out
# ("recall_weighted"); plain "recall" is the binary scorer.
# scores look as good as the best scores on Kaggle, so random forest classifier seems a good
# choice -- NOTE(review): these are recall values; only compare them with leaderboard
# numbers if the leaderboard uses the same metric.
cross_validation.cross_val_score(rfc, X, y, cv=5, n_jobs=-1, scoring="recall_weighted")

In [None]:
# Hold out 25% of the rows as a test set; random_state=7 makes the split repeatable.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=7)

In [None]:
# Fit the random forest on the training split (the fitted estimator is displayed as the cell output).
rfc.fit(X_train, y_train)

In [None]:
# Per-class predicted probabilities from the random forest for the held-out rows.
probas = rfc.predict_proba(X_test)

Notice that probas has 5 columns. This is due to the fact that we have 5 outcomes. For each observation, we pick the outcome that has the highest probability out of the 5 probabilities we calculate.

In [None]:
# Dimensions of the probability array: one row per held-out observation, one column per class.
probas.shape

In [None]:
# Log loss of the random forest's predicted probabilities on the held-out split.
log_loss(y_test, probas)

In [None]:
# Distinct outcome labels present in the held-out split.
y_test.unique()

In [None]:
# Hard class predictions of the random forest for the held-out rows.
# NOTE(review): the original cell held only the unfinished fragment `pr`, which
# raises NameError; `preds` is the name the later cells expect -- confirm intent.
preds = rfc.predict(X_test)

In [None]:
# easy_cf(y_test, preds)

In [None]:
def easy_cf(actuals, predicted):
    """Print a binary confusion-matrix summary and the rates derived from it.

    Terminology:
        true positive (TP); eqv. with hit
        false positive (FP); eqv. with false alarm, Type I error
        true negative (TN); eqv. with correct rejection
        false negative (FN); eqv. with miss, Type II error

    Parameters
    ----------
    actuals : sequence of true labels.
    predicted : sequence of predicted labels, same length as ``actuals``.

    The confusion matrix is read as ``[[TN, FP], [FN, TP]]``, i.e. the label
    in row/column 0 is treated as the negative class and the label in
    row/column 1 as the positive class.

    Returns
    -------
    None. Everything is written to stdout.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (the metrics below are only
        meaningful for exactly two classes).
    """
    cf_matrix = confusion_matrix(actuals, predicted)

    # Only the top-left 2x2 corner is read below; with any other number of
    # classes the printed figures would be silently wrong, so refuse instead.
    if cf_matrix.shape != (2, 2):
        raise ValueError(
            "easy_cf expects exactly two classes; got a confusion matrix of "
            "shape %s" % (cf_matrix.shape,)
        )

    # Plain ints so that arithmetic and printing do not depend on array scalar types.
    tn = int(cf_matrix[0, 0])
    fn = int(cf_matrix[1, 0])
    tp = int(cf_matrix[1, 1])
    fp = int(cf_matrix[0, 1])
    total = len(actuals)

    pred_pos = tp + fp
    act_pos = tp + fn
    act_neg = tn + fp

    accuracy = (tp + tn) / float(total)

    # Precision, or positive predictive value (PPV).
    # If my test predicts positive, how certain can I be that it is a true positive?
    # Undefined when nothing was predicted positive.
    if pred_pos == 0:
        precision = "No predicted +"
    else:
        precision = float(tp) / pred_pos

    # Recall, or sensitivity (true positive rate), quantifies the avoiding of
    # false negatives. Its denominator is the number of ACTUAL positives, so
    # that is what must be non-zero (not the number of predicted positives).
    if act_pos == 0:
        recall = "No actual +"
    else:
        recall = float(tp) / act_pos

    # Specificity, or true negative rate, quantifies the avoiding of false
    # positives. Its denominator is the number of ACTUAL negatives.
    if act_neg == 0:
        specificity = "No actual -"
    else:
        specificity = float(tn) / act_neg

    # print(...) with a single pre-formatted string produces the same text
    # under Python 2 and Python 3.
    print("True Positive is: %s" % (tp,))
    print("False Positive is: %s" % (fp,))
    print("True Negative is: %s" % (tn,))
    print("False Negative is: %s" % (fn,))
    print("")

    print("Predicting + and - correctly")
    print("Acccuracy: %s" % (accuracy,))
    print("")

    print("How much can I count on my + prediction?")
    print("Precision: %s" % (precision,))
    print("")

    print("Capturing actual + and avoiding FN")
    print("Sensitivity / Hit Rate / Recall: %s" % (recall,))
    print("")

    print("Capturing actual - and avoiding FP")
    print("Specificity / True Negative Rate: %s" % (specificity,))
    print("")




In [None]:
def feat_importance(rfc_model, X_test):
    """Print and plot the feature-importance ranking of a fitted tree ensemble.

    Parameters
    ----------
    rfc_model : fitted ensemble exposing ``feature_importances_`` and an
        iterable ``estimators_`` whose members each expose
        ``feature_importances_``.
    X_test : 2-D feature matrix. Only ``X_test.shape[1]`` and, when present,
        ``X_test.columns`` are read.

    Feature names come from ``X_test.columns`` when that attribute exists, so
    the printed names match the data the model was scored on. Inputs without
    column labels (e.g. a bare array) fall back to the original hard-coded
    name list.

    Prints one line per feature, most important first, then draws a bar chart
    of the importances with the per-estimator standard deviation as error
    bars. Returns None.

    NOTE(review): relies on module-level ``np``, ``plt`` (presumably
    matplotlib.pyplot) and ``sns`` (presumably seaborn); make sure the
    notebook imports them before calling this.
    """
    importances = rfc_model.feature_importances_
    # Spread of each feature's importance across the individual estimators.
    std = np.std([tree.feature_importances_ for tree in rfc_model.estimators_],
                 axis=0)
    # Feature positions sorted from most to least important.
    indices = np.argsort(importances)[::-1]
    n_features = X_test.shape[1]

    if hasattr(X_test, "columns"):
        columns = np.asarray(X_test.columns)
    else:
        # Legacy fallback: the name list this function was originally written
        # with. NOTE(review): these look like features of a different
        # (text-based) dataset -- confirm before relying on them.
        columns = np.array([
            'length', 'percent_nouns', 'subjectivity', 'polarity',
            'has_reason', 'has_therefore', 'has_reject', 'has_applied',
            'has_standard', 'has_fact', 'has_argue', 'act', 'action',
            'appeals', 'case', 'cases', 'certiorari', 'circuit', 'claim',
            'claims', 'court', 'courts', 'did', 'district', 'does',
            'evidence', 'federal', 'filed', 'general', 'held', 'information',
            'joined', 'judgment', 'jury', 'law', 'legal', 'make', 'official',
            'opinion', 'petitioner', 'petitioners', 'pp', 'respondents',
            'rule', 'site', 'state', 'states', 'supreme', 'tax', 'trial',
            'united'])

    col_importance = columns[indices]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(n_features):
        print("%d. feature %d: %s (%f) " % (f + 1, indices[f], col_importance[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure(num=None, figsize=(20, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(n_features), indices)
    plt.xlim([-1, n_features])
    plt.show()

    # Switch seaborn to the "whitegrid" style (takes effect for later plots).
    sns.set_style("whitegrid")


# Gradient Boosting Classifier

In [None]:
# trying gradient boost model: 100 boosting stages, seeded with random_state=7,
# constructed and fitted on the training split in one expression
gbm = GradientBoostingClassifier(n_estimators=100,random_state=7).fit(X_train,y_train)

In [None]:
# Per-class predicted probabilities from the gradient boosting model for the held-out rows.
gbm_probas = gbm.predict_proba(X_test)

In [None]:
# Log loss of the gradient boosting model on the held-out split.
log_loss(y_test, gbm_probas)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

It makes sense that a gradient boosting classifier works very well: each new tree is fit to the errors (the gradient of the loss) left by the ensemble built so far, so every iteration concentrates on the observations that the previous estimators predicted poorly. Because the model keeps focusing on its earlier mistakes, GBC copes reasonably well with class imbalance, even though it does not re-weight the classes explicitly.

# GridSearch w/ Gradient Boosting Classifier

In [None]:
# names of the gradient boosting hyperparameters available for tweaking
# (these are the keys a grid-search param_grid can use)
gbm.get_params().keys()

In [None]:
# Number of rows whose OutcomeType is 'Died' or 'Euthanasia'.
dframe[(dframe['OutcomeType']=='Died') | (dframe['OutcomeType']=='Euthanasia')].shape[0]

In [None]:
# number of features (columns of X, i.e. everything except the target)
len(X.columns)

In [None]:
# Base estimator for the first two grid searches. random_state is fixed (same
# seed as gbm) so that repeated searches are comparable and reproducible.
gsearch_gbc = GradientBoostingClassifier(learning_rate=0.1,
                                         max_depth=8,
                                         max_features="auto",
                                         min_samples_leaf=250,  # 1623 died or euthanized
                                         random_state=7)

In [None]:
# first set of params: a 2 x 2 grid over the number of boosting stages and
# the minimum samples required to split a node
param_test1 = {"n_estimators": [50,80],
               "min_samples_split": [500, 1000]}

# run grid search on the training split, using all cores (n_jobs=-1).
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than by the log loss reported afterwards -- confirm intended.
gsearch1 = GridSearchCV(gsearch_gbc, param_grid=param_test1, n_jobs=-1)
# start1/end1 bracket only the fit call so their difference is the search's wall-clock time
start1 = time.time()
gsearch1.fit(X_train, y_train)
end1 = time.time()

In [None]:
# Per-class predicted probabilities from the first grid search for the held-out rows.
gsearch1_probas = gsearch1.predict_proba(X_test)

In [None]:
# Log loss of the first grid search's predictions on the held-out split.
log_loss(y_test, gsearch1_probas)

In [None]:
end1 - start1 # elapsed seconds for the param_test1 search (just over a minute when originally run)

In [None]:
# Parameter combination from param_test1 that scored best in the search.
gsearch1.best_params_

In [None]:
# second set of params: same min_samples_split values as the first search,
# with larger numbers of boosting stages
param_test2 = {"n_estimators": [120, 200],
               "min_samples_split": [500, 1000]}

# run grid search on the training split, using all cores (n_jobs=-1).
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than by the log loss reported afterwards -- confirm intended.
gsearch2 = GridSearchCV(gsearch_gbc, param_grid=param_test2, n_jobs=-1)
# start2/end2 bracket only the fit call so their difference is the search's wall-clock time
start2 = time.time()
gsearch2.fit(X_train, y_train)
end2 = time.time()

In [None]:
end2 - start2 # elapsed seconds for the param_test2 search (just over 2 minutes when originally run)

In [None]:
# Per-class predicted probabilities from the second grid search for the held-out rows.
gsearch2_probas = gsearch2.predict_proba(X_test)

In [None]:
# Log loss of the second grid search's predictions on the held-out split.
log_loss(y_test, gsearch2_probas)

In [None]:
# Parameter combination from param_test2 that scored best in the search.
gsearch2.best_params_

In [None]:
# third set of params: tune the learning rate and the number of features
# considered per split. random_state is fixed (same seed as gbm) because
# max_features below subsamples features, so unseeded runs would not be
# reproducible.
gsearch_gbc3 = GradientBoostingClassifier(max_depth=8,
                                          min_samples_leaf=1000,  # 1623 died or euthanized
                                          random_state=7)

param_test3 = { "learning_rate": [0.1, 0.05, 0.01],
                "max_features": ["sqrt", 3, 10]}

gsearch3 = GridSearchCV(gsearch_gbc3, param_grid=param_test3, n_jobs=-1)
# start3/end3 bracket only the fit call so their difference is the search's wall-clock time
start3 = time.time()
gsearch3.fit(X_train, y_train)
end3 = time.time()

In [None]:
# Elapsed seconds for the param_test3 search.
end3 - start3

In [None]:
# Parameter combination from param_test3 that scored best in the search.
gsearch3.best_params_

In [None]:
# Per-class predicted probabilities from the third grid search for the held-out rows.
gsearch3_probas = gsearch3.predict_proba(X_test)

In [None]:
# Log loss of the third grid search's predictions on the held-out split.
log_loss(y_test, gsearch3_probas)

## Prepare Submission

In [None]:
# Load the sample submission file; the path is relative to the notebook's working directory.
sample = pd.read_csv('../data/sample_submission.csv')

In [None]:
# Preview the first rows of the sample submission to see its column layout.
sample.head()

In [None]:
# get predictions
# TODO(review): this cell was left unfinished -- the original `test = ` had no
# right-hand side, which is a SyntaxError. Load the features to be scored here
# and compute the predictions for the submission; None is only a placeholder.
test = None

In [None]:
# create submission dframe: a single 'OutcomeType' column built from `preds`
# (expects `preds` to already hold one predicted label per row)
submission = pd.DataFrame(preds, columns=['OutcomeType'])

In [None]:
# dummify predictions as integers: one indicator column per predicted label,
# cast to int so the cells hold 0/1
dummy_sub = pd.get_dummies(submission).astype(int)

In [None]:
# shorten column names: drop everything up to and including the first
# underscore (the prefix that precedes each label). Splitting only once keeps
# labels that themselves contain underscores whole instead of truncating them
# at their first underscore.
dummy_sub.columns = [col.split("_", 1)[1] for col in dummy_sub.columns]

In [None]:
# Display the held-out target labels.
y_test