In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### <font color="red"> X = features, feature variables, data</font> 
### <font color="red"> y = labels, targets, target variables, ground truth</font> 

In [33]:
# Estimator and data-splitting helper from scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Read the heart-disease dataset from the working directory
heart_disease = pd.read_csv("heart-disease.csv")

# Separate the label column (y) from the remaining feature columns (X)
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

# Seed NumPy's global RNG. Neither the split nor the forest below is given an
# explicit random_state, so both draw from this global stream.
np.random.seed(42)

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the classifier with default hyperparameters and train it
# (fit() returns the fitted estimator itself)
model = RandomForestClassifier().fit(X_train, y_train)

# Mean accuracy on the held-out rows (displayed as the cell output)
model.score(X_test, y_test)

0.8524590163934426

### Making predictions with our model
#### There are 2 ways to make predictions:
#### 1 -> predict()
#### 2 -> predict_proba()

In [34]:
# Way 1: predict() returns one predicted class label per input row.

# predict() expects a 2-D input laid out like the training features
# (rows = samples, columns = features), so a bare 1-D array is rejected:
# model.predict(np.array([1,2,3,4,5,6,7])) # this does not work
model.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [35]:
# True labels of the test rows as a plain array, for eyeballing against the predictions
np.array(y_test)

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [36]:
# Accuracy by hand: the fraction of test rows whose predicted label
# equals the true label
y_preds = model.predict(X_test)
(y_preds == y_test).mean()


0.8524590163934426

In [37]:
from sklearn.metrics import accuracy_score

# Same accuracy figure, computed with scikit-learn's metric helper
accuracy_score(y_true=y_test, y_pred=y_preds)

0.8524590163934426

# Make predictions with `predict_proba()`


In [38]:
# Per-class probabilities for the first five test rows: one row per sample,
# one column per class (columns follow the order of model.classes_)
model.predict_proba(X_test.head())

array([[0.89, 0.11],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.84, 0.16],
       [0.18, 0.82]])

In [39]:
# Hard class labels for the same first five test rows
model.predict(X_test.head())

array([0, 1, 1, 0, 1])

# `predict()` can also be used for regression algorithms

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset (dict-like: "data", "feature_names", "target", ...)
housing = fetch_california_housing()

# Build a single DataFrame holding the feature columns plus the regression
# target as a final "target" column. Done in one expression so the frame is
# never left in a half-built state.
housing_df = pd.DataFrame(
    housing["data"], columns=housing["feature_names"]
).assign(target=housing["target"])

# Only the last expression of a cell is displayed, so show the frame here
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [41]:
from sklearn.model_selection import train_test_split

# Target is the "target" column; features are every other column
y = housing_df["target"]
X = housing_df.drop(columns="target")

# Reseed NumPy's global RNG so the shuffle performed by the split is repeatable
np.random.seed(42)

# 80% of the rows for training, 20% held out for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [42]:
# Train a random-forest regressor with default hyperparameters.
# Note: this rebinds `model`, replacing the classifier fitted earlier.
model = RandomForestRegressor()
# fit() is left as the last expression so the fitted estimator is echoed as the cell output
model.fit(X_train, y_train)

RandomForestRegressor()

In [45]:
# Predict a target value for every row of the held-out test set
y_preds = model.predict(X_test)

In [47]:
# Peek at the first five predicted values
y_preds[:5]

array([0.4943   , 0.7642   , 4.9346864, 2.56266  , 2.29764  ])

In [49]:
# First five true target values as a plain array, to compare against the predictions
np.array(y_test[:5])

array([0.477  , 0.458  , 5.00001, 2.186  , 2.78   ])

In [50]:
# Sanity check: there should be exactly one prediction per test label
len(y_preds), len(y_test)

(4128, 4128)

In [52]:
# Compare the predictions to the truth using mean absolute error:
# the average of |true value - predicted value| over the test rows.
from sklearn.metrics import mean_absolute_error

# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric,
# so the value is the same either way, but following the documented order
# keeps this call safe to copy for metrics that are not symmetric.
mean_absolute_error(y_test, y_preds)

0.32670527078488387