In [22]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [5]:
random.seed(321)
n = 1000
age = random.choices(range(25,45), k=n)
gender = random.choices(["M","F"], k=n)
education = random.choices(["BS","MS","PhD"], k=n, weights=[0.6,0.3,0.1])
location = random.choices(["US","EU"], k=n)

owner = [True if education[i] in ["MS","PhD"] and age[i]>35 else False for i in range(n)]
income = [random.choice(range(120000,150000,1000)) if loc=="US" else random.choice(range(50000,90000,1000)) for loc in location]
price = [random.choice(range(500000,600000,10000)) if 12000<=inc<=15000 else random.choice(range(300000,400000,10000)) for inc in income]
home_value = [price[i] if owner[i]==True else np.nan for i in range(n)]

df = pd.DataFrame({"age":age,"gender":gender,"education":education,"location":location,"income":income,"owner":owner,"home_value":home_value})
df

Unnamed: 0,age,gender,education,location,income,owner,home_value
0,30,F,BS,EU,55000,False,
1,27,M,BS,EU,71000,False,
2,36,M,MS,EU,75000,True,380000.0
3,33,F,BS,US,142000,False,
4,44,F,MS,EU,80000,True,360000.0
...,...,...,...,...,...,...,...
995,43,F,PhD,US,143000,True,350000.0
996,39,F,BS,EU,74000,False,
997,41,M,BS,US,144000,False,
998,40,M,MS,US,148000,True,360000.0


In [6]:
df_gender = pd.get_dummies(df.gender, prefix='gender')
df_education = pd.get_dummies(df.education, prefix='education')
df_location = pd.get_dummies(df.location, prefix='location')
df = pd.concat([df, df_gender, df_education, df_location], axis=1) # df.reset_index(drop=True) if not matching index
# better way of categorical convert in 4_data_wrangling

le = LabelEncoder()
le.fit(df.owner.unique())
df['owner'] = le.transform(df['owner'])
df

Unnamed: 0,age,gender,education,location,income,owner,home_value,gender_F,gender_M,education_BS,education_MS,education_PhD,location_EU,location_US
0,30,F,BS,EU,55000,0,,1,0,1,0,0,1,0
1,27,M,BS,EU,71000,0,,0,1,1,0,0,1,0
2,36,M,MS,EU,75000,1,380000.0,0,1,0,1,0,1,0
3,33,F,BS,US,142000,0,,1,0,1,0,0,0,1
4,44,F,MS,EU,80000,1,360000.0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,43,F,PhD,US,143000,1,350000.0,1,0,0,0,1,0,1
996,39,F,BS,EU,74000,0,,1,0,1,0,0,1,0
997,41,M,BS,US,144000,0,,0,1,1,0,0,0,1
998,40,M,MS,US,148000,1,360000.0,0,1,0,1,0,0,1


In [13]:
cols = df_gender.columns.append(df_education.columns).append(df_location.columns).tolist()
cols.extend(['age', 'income'])

#this training data is not random predictor
#cols = df_gender.columns.append(df_location.columns).tolist()
#cols.extend(['income'])

X = df[cols]
Y = df[['owner']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
clf = RandomForestClassifier()
clf.fit(X_train, Y_train)

  del sys.path[0]


RandomForestClassifier()

In [14]:
train_error = (clf.predict(X_train).reshape([len(X_train), 1]) - Y_train).sum()
test_error = (clf.predict(X_test).reshape([len(X_test), 1]) - Y_test).sum()
train_error, test_error

(owner    0
 dtype: int64,
 owner    0
 dtype: int64)

In [15]:
clf.score(X_train, Y_train), clf.score(X_test, Y_test)

(1.0, 1.0)

In [16]:
clf.feature_importances_

array([0.0016365 , 0.0015219 , 0.31135114, 0.12782316, 0.0335934 ,
       0.00115398, 0.00227525, 0.49396234, 0.02668235])

makes sense. age and education should be predictors.

In [27]:
confusion_matrix(Y_train, clf.predict(X_train))

array([[567,   0],
       [  0, 133]])

In [28]:
confusion_matrix(Y_test, clf.predict(X_test))

array([[248,   0],
       [  0,  52]])

In [30]:
print(classification_report(Y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       567
           1       1.00      1.00      1.00       133

    accuracy                           1.00       700
   macro avg       1.00      1.00      1.00       700
weighted avg       1.00      1.00      1.00       700



In [31]:
print(classification_report(Y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       248
           1       1.00      1.00      1.00        52

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



In [32]:
clf.oob_score_

AttributeError: 'RandomForestClassifier' object has no attribute 'oob_score_'