# <center> RANDOM FORESTS <br/>
## <center> CSCAR WORKSHOP - Data Science Skills Series <br/><br/> 10/20/2017
### <center> Marcio Mourao


## Some info about the dataset

This data was extracted from the 1994 Census bureau database by Ronny Kohavi and Barry Becker (Data Mining and Visualization, Silicon Graphics). A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && (AFNLWGT>1) && (HRSWK>0)). 

<b>The prediction task is to determine whether a person makes over $50K a year!</b>

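<b>Attributes</b> (listed with the hyphenated names of the original dataset description; the code below addresses the columns of adult.csv by dotted names instead, e.g. education.num, capital.gain, native.country):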

income: >50K, <=50K

age: continuous

workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked

fnlwgt: continuous

education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool

education-num: continuous

marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse

occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces

relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried

race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black

sex: Female, Male

capital-gain: continuous

capital-loss: continuous

hours-per-week: continuous

native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands

## Import relevant general modules

In [None]:
#Load some relevant modules
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load and describe the data

In [None]:
#Read the file "adult.csv" into a dataframe named "adults";
#cells that contain "?" are treated as missing values (NaN)
adults = pd.read_csv('adult.csv', na_values='?')

#Preview the first rows
adults.head()

In [None]:
#Displays the dimensions of the dataframe as a (number of rows, number of columns) tuple
adults.shape

In [None]:
#Displays the data type (dtype) of each dataframe column
adults.dtypes

In [None]:
#Collect (as a numpy array) the names of the columns whose dtype is "object"
objFeatures = adults.select_dtypes(include=['object']).columns.values
print(objFeatures)

#Re-type all of those columns as pandas categoricals in a single call
adults = adults.astype({name: 'category' for name in objFeatures})

In [None]:
#Displays the data types again, to confirm that the object columns are now categoricals
adults.dtypes

In [None]:
#Summary statistics for every column of the dataframe
#(include='all' asks for the non-numeric columns to be summarised as well)
adults.describe(include='all')

In [None]:
#Flags, column by column, whether at least one value is missing
adults.isnull().any()

In [None]:
#Count the number of missing values in each column of the dataframe
#(vectorized: isnull() builds a boolean frame, sum() counts the True entries per column)
adults.isnull().sum()

In [None]:
#Total number of missing values in the dataframe:
#the first sum() counts them per column, the second adds the per-column counts up
adults.isnull().sum().sum()

In [None]:
#Count the number of rows that contain at least one NaN
#(any(axis=1) reduces each row to a single flag, avoiding a Python call per row)
adults.isnull().any(axis=1).sum()

In [None]:
#Fraction of observations with NaNs (potentially for removal).
#Computed from the dataframe itself rather than from a hard-coded row count,
#so it stays correct if the data or the NaN handling changes:
#the mean of the per-row boolean flags is the share of rows that are flagged.
adults.isnull().any(axis=1).mean()

In [None]:
#Keep only the complete rows: dropna() with its defaults discards every row
#that has at least one NaN
#(be careful about what you decide to do with missing values)
adults = adults.dropna()

#Preview the first rows of the cleaned dataframe
adults.head()

In [None]:
#Displays the (number of rows, number of columns) of the dataframe after the NaN rows were removed
adults.shape

In [None]:
#Displays the first 10 rows of the dataframe
adults.head(10)

## Machine Learning

In [None]:
#Print the version of the installed scikit-learn package
#(useful to know, since the scikit-learn API differs between releases)
import sklearn
print(sklearn.__version__)

In [None]:
#Import modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics

In [None]:
#Before moving forward, lets drop potentially irrelevant features of the data
#Feel free to go back here, drop or add features
dropped_features = [
    'workclass', 'fnlwgt', 'education',
    'capital.gain', 'capital.loss', 'native.country',
]

#axis=1 means the labels above are column names; "adults" itself is left untouched
adults2 = adults.drop(dropped_features, axis=1)

In [None]:
#Collect (as a numpy array) the names of the columns whose dtype is "category"
catFeatures = adults2.select_dtypes(include=['category']).columns.values

#Integer-encode every categorical column into a new "<name>_enc" column;
#pd.factorize returns (codes, uniques) and only the codes are kept
for feature in catFeatures:
    codes = pd.factorize(adults2[feature].values)[0]
    adults2[feature + '_enc'] = codes

#Only the encoded versions are kept: remove the initial categorical columns
adults2 = adults2.drop(catFeatures, axis=1)

In [None]:
#Check the data types of adults2 after the categorical columns were replaced by their "_enc" versions
adults2.dtypes

In [None]:
#Summary statistics for every column of adults2
adults2.describe(include='all')

In [None]:
##### PRIMARY APPROACH #####

#Apply OneHot encoding to the categorical features
from sklearn.preprocessing import OneHotEncoder

#The columns at positions 3 to 7 of adults2 are one-hot encoded; sparse=False requests a dense array.
#NOTE(review): the positions are hard-coded - presumably they are the five "_enc" predictor columns,
#with the encoded income column left out; confirm against adults2.dtypes if the dropped features change.
#NOTE(review): categorical_features, sparse, n_values_, feature_indices_ and active_features_ look like
#an older OneHotEncoder API - check them against the sklearn version printed earlier in the notebook.
enc = OneHotEncoder(categorical_features=np.arange(3,8), sparse=False)
adults3 = enc.fit_transform(adults2.values)
print(enc.n_values_)
print(enc.feature_indices_)
print(enc.active_features_)
#Shape of the encoded array: (number of rows, number of columns after encoding)
print(adults3.shape)

In [None]:
##### PRIMARY APPROACH #####

#Define covariates in X and dependent variable in y:
#every column of adults3 except the last one goes to X, the last column goes to y.
#Plain slices select the same values as an explicit index array, without copying the data.
X = adults3[:, :-1]
y = adults3[:, -1]

print(X.shape)
print(y.shape)

In [None]:
##### ALTERNATIVE APPROACH #####

#Define covariates in X and dependent variable in y
predictor_columns = [
    'age', 'education.num', 'hours.per.week',
    'marital.status_enc', 'occupation_enc',
    'relationship_enc', 'race_enc', 'sex_enc',
]

#X keeps the predictors as a dataframe, y is the integer-encoded income column
X = adults2[predictor_columns]
y = adults2['income_enc']

In [None]:
#Obtain the data for the fitting (random_state fixes the shuffling, so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

#Size of the data set that was actually split; the fractions below are relative to it
n_records = len(X)
print('Total number of records: ', n_records)

#Summary of the training part
print('Type of X_train: ', type(X_train))
print('Number of records in X_train: ', len(X_train))
print('Fraction on X_train: ', len(X_train)/n_records)
print('Number of records in y_train: ', len(y_train))
print('Type of y_train: ', type(y_train))

#Blank lines between the two summaries (previously they were emitted in the middle
#of the y_train line, separating its label from its value)
print('\n')

#Summary of the test part
print('Type of X_test: ', type(X_test))
print('Number of records in X_test: ', len(X_test))
print('Fraction on X_test: ', len(X_test)/n_records)
print('Number of records in y_test: ', len(y_test))
print('Type of y_test: ', type(y_test))

In [None]:
#Creates a RF classification model: 10 trees, Gini impurity as the split criterion.
#random_state is fixed (same seed as the train/test split) so that re-running the
#notebook grows the same forest and therefore reproduces the same scores and plots.
RF_model = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=13)

#Fit to the data
RF_model.fit(X_train, y_train)

In [None]:
#Obtain probability predictions for the test records
y_pred_RF_prob = RF_model.predict_proba(X_test)
print('Predicted probabilities: \n', y_pred_RF_prob)

#Obtain class predictions for the test records
y_pred_RF_class = RF_model.predict(X_test)
print('Predicted classes: \n', y_pred_RF_class)

In [None]:
#Accuracy score of the predicted classes against the true classes of the test set
RF_accuracy = metrics.accuracy_score(y_test, y_pred_RF_class)
print('RF Score: ', RF_accuracy)

In [None]:
#Obtains the confusion matrix (first argument: true classes, second argument: predicted classes)
RF_cm=metrics.confusion_matrix(y_test,y_pred_RF_class)
#Display the matrix as the cell output
RF_cm

In [None]:
##### PRIMARY APPROACH #####

#Labels for the three continuous features, in plotting order
continuous_labels = ('Age','Education','Hours per Week')

#Keep only the importances at positions 34, 35 and 36 (the continuous values of the data)
feature_imp = RF_model.feature_importances_[34:37]
positions = np.arange(len(feature_imp))

#Horizontal bar chart with one bar per feature
plt.barh(positions, feature_imp, align='center')
plt.yticks(positions, continuous_labels)
plt.ylabel("Features")
plt.xlabel("Feature Importances")
plt.grid(True)

In [None]:
##### ALTERNATIVE APPROACH #####

#Tick labels, one per importance value, in plotting order
importance_labels = (
    'age', 'education.num', 'hours.per.week',
    'marital.status_enc', 'occupation_enc',
    'relationship_enc', 'race_enc', 'sex_enc',
)

#All feature importances of the fitted RF model
feature_imp = RF_model.feature_importances_
positions = np.arange(len(feature_imp))

#Horizontal bar chart with one bar per feature
plt.barh(positions, feature_imp, align='center')
plt.yticks(positions, importance_labels)
plt.ylabel("Features")
plt.xlabel("Feature Importances")
plt.grid(True)

In [None]:
#KFolds and Cross_val_scores:
#10-fold cross-validation of the RF model on the full X, y; the mean of the 10 fold scores is printed.
#The folds are shuffled with a fixed random_state (same seed as the train/test split),
#so every run evaluates the model on the same folds.
kf = KFold(n_splits=10, shuffle=True, random_state=13)
print('Cross validation score: ', cross_val_score(RF_model, X, y, cv=kf).mean())