## Import required libraries

In [1]:
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Read data into a variable
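1. Though subtle, every value in the raw file starts with an extra space, e.g. `" 39"`.
2. To strip that space, read the data with the argument `delimiter=", "` (a comma followed by a space).
3. Pass **engine="python"** to prevent the warning pandas emits when a multi-character (regex) delimiter forces it to fall back from the default C engine.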

In [2]:
# `sep` is the same option as `delimiter`. A separator longer than one
# character is interpreted by pandas as a regular expression, which only the
# Python parsing engine supports -- hence engine="python".
income_data = pd.read_csv(
    "income.csv",
    sep=", ",
    engine="python",
)
income_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Column descriptions

1. **_age_** :  Age of participant.  
2. **_workclass_** :  Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
3. **_fnlwgt_** :  continuous.
4. **_education_** :  Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
5. **_education-num_** :  education in years.
6. **_marital-status_** :  Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
7. **_occupation_** :  Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
8. **_relationship_** :  Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
9. **_race_** :  White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
10. **_sex_** : Female, Male.
11. **_capital-gain_** :  income gained from other investments.
12. **_capital-loss_** :  income lost from other investments.
13. **_hours-per-week_** :  number of hours worked per week.
14. **_native-country_** :  country participant hails from

## Assign the target column: "income" to a variable "labels"

In [3]:
# Select the target as a one-column DataFrame. Direct selection raises a
# KeyError if "income" is absent, whereas pd.DataFrame(income_data,
# columns=["income"]) would silently produce an all-NaN column.
labels = income_data[["income"]]

## These are the best columns found so far for predicting income; assign them to the "data" variable
1. age
2. capital-gain
3. capital-loss
4. hours-per-week
5. sex

In [4]:
# Candidate predictor columns; "sex" still holds the raw string values here.
feature_columns = ["age", "capital-gain", "capital-loss", "hours-per-week", "sex"]
data = income_data[feature_columns]

### Split data into training and test set
#### Set the random_state so the rows in the train and test groups are the same every time the script is run

In [5]:
# Hold out a test set (scikit-learn's default is 25% of the rows when no size
# is given); the fixed seed makes the split identical on every run.
(train_data, test_data,
 train_labels, test_labels) = train_test_split(data, labels, random_state=1)

### Create a RandomForestClassifier

In [6]:
# Seed the forest so repeated runs build the same trees.
forest = RandomForestClassifier(
    random_state=1,
)

### Train our model with the train_data and train_labels

![Train model with sex column](model2.PNG)


### There is an error since RandomForestClassifier cannot use columns containing strings
1. Currently the values in the "sex" column are the strings "Male" and "Female"
2. For now we remove this column, then:  
    2.1. Split the data into the training and test groups  
    2.2. Create the model  
    2.3. Train the model with the data  

In [7]:
# Same pipeline as before, minus the string-valued "sex" column.
feature_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"]
data = income_data[feature_columns]

(train_data, test_data,
 train_labels, test_labels) = train_test_split(data, labels, random_state=1)

forest = RandomForestClassifier(random_state=1)
# Flatten the one-column label frame into a 1-D array before fitting.
y_train = train_labels.values.ravel()
forest.fit(train_data, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

### Test the accuracy of the model using the test set
#### The model correctly predicted the income class for 82.23% of the test set

In [8]:
# Mean accuracy of the fitted forest on the held-out test rows.
test_accuracy = forest.score(test_data, test_labels)
print(test_accuracy)

0.8222577078982926


### Convert the sex value to integer values so the model can work with it
#### Map Male:0 and Female:1

In [9]:
# Encode sex numerically: "Male" -> 0, every other value -> 1.
is_not_male = income_data["sex"].ne("Male")
income_data["sex-int"] = is_not_male.astype("int64")
income_data["sex-int"].value_counts()

0    21790
1    10771
Name: sex-int, dtype: int64

### Now build and train the model with the "sex-int" data included
#### The score is slightly higher, 82.73%, compared with about 82.2% when the column was not included

In [10]:
# Add the numeric "sex-int" encoding to the predictors and refit.
feature_columns = ["age", "capital-gain", "capital-loss", "hours-per-week", "sex-int"]
data = income_data[feature_columns]

(train_data, test_data,
 train_labels, test_labels) = train_test_split(data, labels, random_state=1)

model = RandomForestClassifier(random_state=1)
# Flatten the one-column label frame into a 1-D array before fitting.
y_train = train_labels.values.ravel()
model.fit(train_data, y_train)

# Mean accuracy on the held-out test rows.
print(model.score(test_data, test_labels))

0.8272939442328953


### Would like to view the number of participants per country

In [11]:
# Number of rows for each distinct native-country value.
country_counts = income_data["native-country"].value_counts()
print(country_counts)

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
Greece                      

### Using apply and a lambda, we set the value to 0 where native-country is United-States and to 1 for any other country

In [12]:
# Binary flag: 0 for "United-States", 1 for anything else.
# NOTE: rows whose native-country is "?" (unknown) also end up as 1.
country_flags = income_data["native-country"].apply(
    lambda country: 1 if country != "United-States" else 0
)
income_data["country-int"] = country_flags
income_data["country-int"].value_counts()

0    29170
1     3391
Name: country-int, dtype: int64

### Retrain the model with country-int included in the data variable
#### The score drops to 82.25% (from 82.73% without country-int), hence the native-country column does not improve the model

In [13]:
# Add the binary "country-int" flag to the predictors and refit.
feature_columns = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "sex-int",
    "country-int",
]
data = income_data[feature_columns]

(train_data, test_data,
 train_labels, test_labels) = train_test_split(data, labels, random_state=1)

forest = RandomForestClassifier(random_state=1)
# Flatten the one-column label frame into a 1-D array before fitting.
y_train = train_labels.values.ravel()
forest.fit(train_data, y_train)

# Mean accuracy on the held-out test rows.
print(forest.score(test_data, test_labels))

0.8225033779633951
