<a href="https://colab.research.google.com/github/lovelyoyrmia/machine-learning-notebook/blob/main/random_forest_adult.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Random Forest Adult**

## Import dependencies

In [1]:
import pandas as pd
import numpy as np
import sklearn.tree as tree
import sklearn.metrics as met
import sklearn.model_selection as ms
import sklearn.ensemble as ens

## Load dataset

In [2]:
rf = ens.RandomForestClassifier(n_estimators=100)
df1 = pd.read_csv('adult_data.csv')

In [3]:
df1.head(10)

Unnamed: 0,age,workclass,fnlwft,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours-per-week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


## Data Cleaning

In [4]:
df1.isnull().sum()

age               0
workclass         0
fnlwft            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours-per-week    0
native_country    0
income            0
dtype: int64

In [5]:
df1.dropna(inplace=True)

## Encode Label

In [6]:
import sklearn.preprocessing as prep

In [10]:
gender = prep.LabelEncoder()
gender.fit(df1['sex'])
df1['sex_code'] = gender.transform(df1['sex'])
df1[['sex', 'sex_code']].head()

Unnamed: 0,sex,sex_code
0,Male,1
1,Male,1
2,Male,1
3,Male,1
4,Female,0


In [11]:
education = prep.LabelEncoder()
education.fit(df1['education'])
df1['education_code'] = education.transform(df1['education'])
df1[['education', 'education_code']].head()

Unnamed: 0,education,education_code
0,Bachelors,9
1,Bachelors,9
2,HS-grad,11
3,11th,1
4,Bachelors,9


In [12]:
race = prep.LabelEncoder()
race.fit(df1['race'])
df1['race_code'] = race.transform(df1['race'])
df1[['race', 'race_code']].head()

Unnamed: 0,race,race_code
0,White,4
1,White,4
2,White,4
3,Black,2
4,Black,2


In [13]:
workclass = prep.LabelEncoder()
workclass.fit(df1['workclass'])
df1['workclass_code'] = workclass.transform(df1['workclass'])
df1[['workclass', 'workclass_code']].head()

Unnamed: 0,workclass,workclass_code
0,State-gov,7
1,Self-emp-not-inc,6
2,Private,4
3,Private,4
4,Private,4


In [14]:
occupation = prep.LabelEncoder()
occupation.fit(df1['occupation'])
df1['occupation_code'] = occupation.transform(df1['occupation'])
df1[['occupation', 'occupation_code']].head()

Unnamed: 0,occupation,occupation_code
0,Adm-clerical,1
1,Exec-managerial,4
2,Handlers-cleaners,6
3,Handlers-cleaners,6
4,Prof-specialty,10


In [15]:
relationship = prep.LabelEncoder()
relationship.fit(df1['relationship'])
df1['relationship_code'] = relationship.transform(df1['relationship'])
df1[['relationship', 'relationship_code']].head()

Unnamed: 0,relationship,relationship_code
0,Not-in-family,1
1,Husband,0
2,Not-in-family,1
3,Husband,0
4,Wife,5


In [20]:
native_country = prep.LabelEncoder()
native_country.fit(df1['native_country'])
df1['native_country_code'] = native_country.transform(df1['native_country'])
df1[['native_country', 'native_country_code']].head()

Unnamed: 0,native_country,native_country_code
0,United-States,39
1,United-States,39
2,United-States,39
3,United-States,39
4,Cuba,5


In [21]:
df1.columns

Index(['age', 'workclass', 'fnlwft', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours-per-week', 'native_country',
       'income', 'sex_code', 'education_code', 'race_code', 'workclass_code',
       'occupation_code', 'relationship_code', 'native_country_code'],
      dtype='object')

## Split data train and data test

In [25]:
X = df1.drop(['fnlwft', 'workclass', 'education', 'marital_status', 
              'occupation', 'relationship', 'race', 'sex', 'native_country', 'income'], axis=1)
y = df1['income']

In [26]:
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2)

In [27]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [28]:
y_predict = rf.predict(X_test)
y_predict

array([' <=50K', ' >50K.', ' >50K', ..., ' >50K', ' <=50K', ' >50K'],
      dtype=object)

## Report score accuracy data train

In [29]:
print('Accuracy = ', met.accuracy_score(y_test, y_predict))

Accuracy =  0.5167366158255707


In [30]:
met.classification_report(y_test, y_predict)

'              precision    recall  f1-score   support\n\n       <=50K       0.60      0.75      0.66      4915\n      <=50K.       0.31      0.19      0.24      2466\n        >50K       0.49      0.50      0.49      1609\n       >50K.       0.21      0.13      0.16       779\n\n    accuracy                           0.52      9769\n   macro avg       0.40      0.39      0.39      9769\nweighted avg       0.48      0.52      0.49      9769\n'

In [31]:
rf.feature_importances_

array([0.2952974 , 0.06230392, 0.06121843, 0.02627635, 0.15971406,
       0.01632478, 0.03977019, 0.03250675, 0.06724846, 0.12202505,
       0.08175861, 0.035556  ])