# Random Forest

## Preparación de datos (explicada en la Introducción)

In [14]:
import pandas as pd
from sklearn import  metrics, model_selection
from sklearn.ensemble import RandomForestClassifier
from numpy import asarray

# Names of the feature columns, in the order they are passed to read_csv below.
PREDICTORS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
]

# Predictors that are converted to integer codes further down in this cell.
PREDICTORS_STRING = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Predictors listed as integer-valued ("education-num" appears in neither list).
PREDICTORS_INT = [
    "age",
    "fnlwgt",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

# Name given to the last column, which holds the label to predict.
TARGET_VARIABLE = "class"

# Column names are supplied explicitly: all predictors first, the target last.
data = pd.read_csv('adult.data', names=PREDICTORS + [TARGET_VARIABLE])
def from_string_to_int(key, data_set):
    """Replace column ``key`` of ``data_set`` with integer codes, in place.

    The codes come from ``pd.factorize``; the column in ``data_set`` is
    overwritten, so the caller's frame is modified.

    Args:
        key: Name of the column to encode.
        data_set: DataFrame holding that column.

    Returns:
        The unique original values returned by ``pd.factorize``; position ``i``
        is the value that was replaced by code ``i``.
    """
    codes, class_names = pd.factorize(data_set[key])
    data_set[key] = codes
    return class_names

# Encode the target first, then every text-valued predictor, as integer codes.
# The returned class names are not kept; `data` is modified in place.
for column in [TARGET_VARIABLE, *PREDICTORS_STRING]:
    from_string_to_int(column, data)

# Every column except the target is a feature; the target is the last column.
features = data.drop(columns=[TARGET_VARIABLE])
target = data[TARGET_VARIABLE]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
(
    features_train,
    features_test,
    target_train,
    target_test,
) = model_selection.train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=777,
)

## Entrenar

In [20]:
# Random forests draw bootstrap samples and random feature subsets. Without a
# fixed seed every run trains different trees and the metrics reported below
# change. 777 is the same seed the train/test split uses.
RANDOM_STATE = 777

# Heavily constrained forest: few, shallow trees, each seeing 20% of the rows.
model1 = RandomForestClassifier(
    max_samples=0.2,
    max_features=2,
    n_estimators=10,
    max_depth=2,
    random_state=RANDOM_STATE,
).fit(features_train, target_train)

# Intermediate forest: more and deeper trees, each seeing 50% of the rows.
model2 = RandomForestClassifier(
    max_samples=0.5,
    max_features=6,
    n_estimators=50,
    max_depth=4,
    random_state=RANDOM_STATE,
).fit(features_train, target_train)

# Library defaults for every hyper-parameter; only the seed is set.
default_model = RandomForestClassifier(
    random_state=RANDOM_STATE,
).fit(features_train, target_train)

## Precisión

In [25]:
# Predict the held-out test rows with each of the three fitted models,
# in the order model1, model2, default_model.
prediction1, prediction2, prediction_default = (
    fitted_model.predict(features_test)
    for fitted_model in (model1, model2, default_model)
)

def print_numbers(name, prediction, target=None):
    """Print the misclassification count and the accuracy of a prediction.

    Args:
        name: Label printed on the line before the metrics.
        prediction: Predicted class labels, compared element-wise with
            ``target``.
        target: True class labels. When omitted, the notebook-level
            ``target_test`` is used, so existing two-argument calls behave
            exactly as before.
    """
    if target is None:
        # Backward-compatible fallback to the notebook's held-out labels.
        target = target_test
    count_misclassified = (target != prediction).sum()
    print(name)
    print('Misclassified samples: {}'.format(count_misclassified))
    accuracy = metrics.accuracy_score(target, prediction)
    print('Accuracy: {:.2f}'.format(accuracy))
    
# Report misclassification count and accuracy for each model, in the same order
# in which the predictions were computed.
for label, predicted in (
    ("Prediction 1", prediction1),
    ("Prediction 2", prediction2),
    ("Prediction default", prediction_default),
):
    print_numbers(label, predicted)

Prediction 1
Misclassified samples: 2331
Accuracy: 0.76
Prediction 2
Misclassified samples: 1494
Accuracy: 0.85
Prediction default
Misclassified samples: 1388
Accuracy: 0.86


## Predicciones

In [26]:
# Put the test features next to the true label and the default model's
# prediction so individual rows can be inspected.
test_prediction = features_test[PREDICTORS].assign(
    Target=target_test,
    Prediction=prediction_default,
)
test_prediction.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Target,Prediction
29305,32,2,130040,1,9,2,2,0,0,0,0,0,40,0,0,0
4110,25,2,186294,1,9,0,0,0,0,1,0,0,50,0,0,0
12044,39,2,237943,0,13,1,5,1,0,0,0,0,30,0,1,0
17027,78,6,237294,1,9,6,5,0,0,0,0,0,45,0,1,0
5821,53,2,157069,6,12,1,9,1,0,0,0,0,40,0,1,0


In [27]:
def get_predicted_class_name(class_number):
    """Translate a predicted class code into its income-bracket label.

    Args:
        class_number: Class code produced by the model.

    Returns:
        ``"<=50K"`` when the code equals 0, ``">50K"`` for any other value.
    """
    return "<=50K" if class_number == 0 else ">50K"

# The model was fitted on a DataFrame, so it remembers the training column
# names. Each hand-built row is wrapped in a one-row DataFrame with the same
# columns; a bare NumPy array makes recent scikit-learn versions warn that the
# input has no valid feature names, and skips the column-name check.
# The integer codes below are the ones produced when the text columns were
# encoded earlier in the notebook.

#Greater data: 37, Private, 280464, Some-college, 10, Married-civ-spouse,Exec-managerial, Husband, Black, Male, 0, 0, 80, United-States, >50K
custom_row_greater = [37,2,280464,5, 10, 1, 1,1,1, 0, 0,0, 80, 0]
custom_row_greater = pd.DataFrame([custom_row_greater], columns=PREDICTORS)
yhat_greater = default_model.predict(custom_row_greater)
print('Prediction greater: %s' % get_predicted_class_name(yhat_greater[0]))

#Less data: 49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K
custom_row_less = [49,2,160187,4,5, 3, 4, 0, 1, 1, 0, 0, 16, 2]
custom_row_less = pd.DataFrame([custom_row_less], columns=PREDICTORS)
yhat_less = default_model.predict(custom_row_less)
print('Prediction less: %s' % get_predicted_class_name(yhat_less[0]))

Prediction greater: >50K
Prediction less: <=50K
