In [92]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [93]:
from pydataset import data
from sklearn.model_selection import train_test_split

In [75]:
# Acquire: load the iris dataset through pydataset's data() helper into df
df = data('iris')

In [76]:
# peek at the first three rows to see the column names and value formats
df.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


In [77]:
#Prepare our data

In [78]:
# check the dtype and non-null count of every column before splitting
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


In [83]:
# The data looks OK at first glance -- we will proceed (with caution).
# Split in two stages, stratifying on Species each time so every subset keeps
# the same class balance:
#   1. hold out 20% of all rows as `test`
#   2. divide the remaining 80% into 70% `train` / 30% `validate`
# Both stages live in one cell and the intermediate frame has its own name, so
# re-running the cell always starts from `df` and never re-splits an already
# shrunken `train`.
train_validate, test = train_test_split(df, train_size=0.8,
                                        random_state=1349,
                                        stratify=df.Species)
train, validate = train_test_split(train_validate, train_size=0.7,
                                   random_state=1349,
                                   stratify=train_validate.Species)

In [85]:
# sanity-check the (rows, columns) of each split
train.shape, validate.shape, test.shape

((84, 5), (36, 5), (30, 5))

In [86]:
# Features (X, the independent variables): every column of train except the
# Species target
X_train = train.drop('Species', axis=1)

In [87]:
# Target (y, the dependent variable): selected with a list so it stays a
# one-column DataFrame and can still be read as y_train.Species
y_train = train.loc[:, ['Species']]

In [88]:
# confirm that only the four measurement columns remain in X_train
X_train.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
139,6.0,3.0,4.8,1.8
8,5.0,3.4,1.5,0.2
80,5.7,2.6,3.5,1.0
75,6.4,2.9,4.3,1.3
98,6.2,2.9,4.3,1.3


In [35]:
# Let's make our model!

In [94]:
# make our thing (clf = classifier)
# A random forest is stochastic; fixing random_state makes the fitted model,
# and therefore every score reported below, reproducible from run to run.
clf = RandomForestClassifier(random_state=1349)

In [95]:
# fit the thing (ONLY ON TRAIN!)
# y_train is a one-column DataFrame; pass the Species column itself so the
# target is 1-D, which is what a single-output classifier expects. Passing the
# DataFrame makes scikit-learn emit a DataConversionWarning (hidden here by the
# global warnings filter).
clf.fit(X_train, y_train.Species)

RandomForestClassifier()

In [96]:
# use the thing: predict a species for every in-sample (train) row
y_pred = clf.predict(X_train)

In [97]:
# inspect the predicted labels (one string per train row)
y_pred

array(['virginica', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'versicolor', 'setosa',
       'setosa', 'setosa', 'virginica', 'setosa', 'virginica',
       'virginica', 'versicolor', 'virginica', 'versicolor', 'versicolor',
       'setosa', 'versicolor', 'setosa', 'virginica', 'versicolor',
       'setosa', 'virginica', 'setosa', 'virginica', 'setosa',
       'versicolor', 'virginica', 'setosa', 'setosa', 'virginica',
       'setosa', 'virginica', 'versicolor', 'versicolor', 'virginica',
       'versicolor', 'virginica', 'versicolor', 'setosa', 'virginica',
       'virginica', 'setosa', 'virginica', 'virginica', 'virginica',
       'versicolor', 'setosa', 'setosa', 'setosa', 'virginica',
       'versicolor', 'versicolor', 'setosa', 'versicolor', 'versicolor',
       'setosa', 'virginica', 'versicolor', 'virginica', 'setosa',
       'virginica', 'virginica', 'setosa', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'setosa

In [98]:
# Precision / recall / f1 / support per class on train; output_dict=True gives
# a nested dict, which renders more readably as a DataFrame
train_report = classification_report(y_train.Species, y_pred, output_dict=True)
pd.DataFrame(train_report)

Unnamed: 0,setosa,versicolor,virginica,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0
support,28.0,28.0,28.0,1.0,84.0,84.0


In [99]:
# accuracy on the in-sample (train) data
clf.score(X_train, y_train)

1.0

In [100]:
# predict on the out-of-sample validate split (features only, target removed)
X_validate = validate.drop(columns='Species')
y_pred_val = clf.predict(X_validate)

In [101]:
# Accuracy on validate: score against the TRUE labels. Scoring against
# y_pred_val would compare the model's predictions with themselves and always
# give 1.0, regardless of how good the model is.
clf.score(validate.drop(columns='Species'), validate.Species)

1.0

In [103]:
# another look at the iris training features before moving to a new dataset
X_train.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
139,6.0,3.0,4.8,1.8
8,5.0,3.4,1.5,0.2
80,5.7,2.6,3.5,1.0
75,6.4,2.9,4.3,1.3
98,6.2,2.9,4.3,1.3


In [104]:
# New example: load the mpg dataset. This rebinds df, replacing the iris frame.
df = data('mpg')

In [105]:
# peek at the first rows of mpg to see the available columns
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [108]:
# Two-stage split stratified on drv: 80/20 into (train + validate) / test,
# then 70/30 into train / validate.
train_validate, test = train_test_split(
    df,
    train_size=0.8,
    random_state=1349,
    stratify=df.drv,
)
train, validate = train_test_split(
    train_validate,
    train_size=0.7,
    random_state=1349,
    stratify=train_validate.drv,
)

In [111]:
# make the thing: trees capped at depth 2, seeded so the fitted forest and the
# scores below are reproducible from run to run
clf = RandomForestClassifier(max_depth=2, random_state=1349)

In [112]:
# fit the thing:
# Don't fit on the entire dataset -- only on train.
features = ['displ', 'cty', 'hwy']
clf.fit(train[features], train['drv'])

RandomForestClassifier(max_depth=2)

In [113]:
# predict drv for the train rows from displ, cty and hwy (in-sample)

y_pred = clf.predict(train[['displ', 'cty','hwy']])

In [114]:
# accuracy on train (in-sample)
clf.score(train[['displ', 'cty','hwy']], train.drv)

0.7692307692307693

In [116]:
# accuracy on validate (out-of-sample), scored against the true drv labels
clf.score(validate[['displ', 'cty', 'hwy']], validate.drv)

0.8771929824561403

In [117]:
# predicted class probabilities for each train row: one column per drv class,
# ordered as in clf.classes_
y_pred_proba = clf.predict_proba(train[['displ', 'cty','hwy']])

In [118]:
# count how many train rows fall in each drv class
train.drv.value_counts()

f    59
4    57
r    14
Name: drv, dtype: int64

In [119]:
# inspect the probability array (one row per train observation)
y_pred_proba

array([[0.8490005 , 0.01477312, 0.13622637],
       [0.28353032, 0.66384042, 0.05262926],
       [0.85659091, 0.01779207, 0.12561702],
       [0.22633299, 0.74283363, 0.03083338],
       [0.20200992, 0.76228801, 0.03570207],
       [0.45558661, 0.18973538, 0.35467801],
       [0.22627198, 0.73935629, 0.03437173],
       [0.44163543, 0.19749729, 0.36086728],
       [0.27781435, 0.66833687, 0.05384877],
       [0.35782267, 0.56632922, 0.0758481 ],
       [0.63141528, 0.04194506, 0.32663965],
       [0.33794919, 0.48115543, 0.18089538],
       [0.80609247, 0.10604883, 0.08785871],
       [0.11205487, 0.87707079, 0.01087433],
       [0.27119812, 0.59089914, 0.13790274],
       [0.34128531, 0.59046606, 0.06824863],
       [0.44969974, 0.30178333, 0.24851693],
       [0.8761418 , 0.01526093, 0.10859727],
       [0.37211931, 0.37214015, 0.25574054],
       [0.29077842, 0.60653167, 0.10268991],
       [0.75687011, 0.01299314, 0.23013675],
       [0.47196695, 0.34659734, 0.18143571],
       [0.