# Random Forests Notebook
[Return to project overview](final_project_overview.ipynb)

### Andrew Larimer, Deepak Nagaraj, Daniel Olmstead, Michael Winton (W207-4-Summer 2018 Final Project)

In [None]:
# import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
try:
    # scikit-learn >= 0.20: SimpleImputer supersedes preprocessing.Imputer,
    # which was deprecated in 0.20 and removed in 0.22.
    from sklearn.impute import SimpleImputer
    Imputer = SimpleImputer  # keep the legacy name available
except ImportError:
    # Older scikit-learn: only the legacy class exists.
    from sklearn.preprocessing import Imputer
    SimpleImputer = Imputer
from util import our_train_test_split

# set default options
# Passing None removes the column limit, so wide DataFrames are displayed
# with every column instead of being truncated.
pd.set_option('display.max_columns', None)
# Allow up to 200 rows to be displayed before pandas truncates the output.
pd.set_option('display.max_rows', 200)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load our cleaned, merged dataset.
merged_df = pd.read_csv('data_merged/combined_data_2018-07-18.csv')

# Columns that must not be used as features: they are either strings
# ('dbn', 'school_name') or related to calculating our dependent variable.
# NOTE(review): an earlier version of this comment also mentioned an index
# column named 'Unnamed: 0', but it is not in this list -- confirm the CSV
# does not contain it, otherwise it is being used as a feature.
FEATURES_TO_DROP = ['dbn', 'school_name',
                    'num_shsat_test_takers', 'offers_per_student', 'pct_test_takers']

# Identify columns with 10 or more missing values. These are kept for
# reference only: they are NOT added to the drop list, because the remaining
# missing values are imputed below instead.
bool_of_too_many_missing_values = merged_df.isna().sum() >= 10
more_columns_to_drop = merged_df.columns[bool_of_too_many_missing_values]

# Copy the list so later changes to the drop list cannot mutate the constant.
total_columns_to_drop = list(FEATURES_TO_DROP)

# Drop the excluded columns (raises KeyError if any of them is absent).
intermediate_df = merged_df.drop(columns=total_columns_to_drop)

# Fill the remaining missing values with the imputer's default strategy
# (the per-column mean).
# NOTE(review): the imputer is fit on every row *before* the train/test split
# below, so the fill values are computed from rows that later end up in the
# test set. The label column 'high_registrations' is still part of
# intermediate_df here, so it passes through the imputer as well. Consider
# splitting first and fitting the imputer on the training features only.
imp = SimpleImputer()
na_free_df = pd.DataFrame(imp.fit_transform(intermediate_df),
                          columns=intermediate_df.columns,
                          index=intermediate_df.index)

# Split the dependent variable from the independent variables.
X = na_free_df.drop(columns='high_registrations')
y = na_free_df['high_registrations']

# Use our_train_test_split utility function to generate test and training
# data and labels, stratified on the label.
train_data, test_data, train_labels, test_labels = our_train_test_split(X, y, stratify=y)

### Random Forest Model

We now move into training our random forest model.

In [None]:
# Candidate forest sizes to evaluate, from a single tree up to 10,000 trees.
n_trees_to_try = [1,3,10,30,100,300,1000,3000,10000]

# Fit one forest per candidate size and print its scores on the test split.
for n_trees in n_trees_to_try:
    forest = RandomForestClassifier(
        n_estimators=n_trees,
        max_features=None,   # consider every feature at each split
        n_jobs=-1,           # train on all available cores
        random_state=207,    # fixed seed so repeated runs give the same forest
    )
    forest.fit(train_data, train_labels)

    predictions = forest.predict(test_data)
    f1 = f1_score(test_labels, predictions, average='weighted')

    # Accuracy is the fraction of test rows whose prediction matches the label.
    n_correct = np.sum(predictions == test_labels)
    accuracy = n_correct / len(test_labels)

    print("\nN_Trees:", n_trees)
    print("F1 Score: {0:.4f}".format(f1))
    print("Accuracy: {0:.4f}".format(accuracy))

With a random state of 207, our best results so far are:

- F1 of .8815
- Accuracy of 88.04%

We achieve that at 100 trees and see no improvement over that with 300, 1000, 3000 or 10,000 trees.