## Using RF For Regression

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('petrol_consumption.csv')

In [3]:
dataset.head()

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


In [4]:
# Prepare the data for training
x = dataset.iloc[:, 0:4].values
y = dataset.iloc[:, 4].values

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [6]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [9]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=200, random_state=0)
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

In [10]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 48.33899999999999
Mean Squared Error: 3494.2330150000003
Root Mean Squared Error: 59.112037818028234


#### Might have to add more trees to decrease the error. Changed n_estimators from 20 to 200

## Using RF For Classification

In [11]:
dataset_class = pd.read_csv('bill_authentication.csv')

In [12]:
dataset_class.head()

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [13]:
# Prepare the data for training
x = dataset_class.iloc[:, 0:4].values
y = dataset_class.iloc[:, 4].values

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [15]:
# Feature Scaling
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [16]:
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

# Data63.com Tutorial

In [19]:
possum_df = pd.read_csv('possum.csv')
possum_df.sample(5, random_state=44)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
88,89,7,other,m,6.0,97.7,58.4,84.5,35.0,64.4,46.2,14.4,29.0,30.5
39,40,2,Vic,f,3.0,91.0,55.0,84.5,36.0,72.8,51.4,13.6,27.0,30.0
92,93,7,other,m,3.0,89.2,54.0,82.0,38.0,63.8,44.9,12.8,24.0,31.0
7,8,1,Vic,f,6.0,94.8,57.6,91.0,37.0,72.7,53.9,14.5,29.0,34.0
98,99,7,other,f,3.0,93.3,56.2,86.5,38.5,64.8,43.8,14.0,28.0,35.0


In [20]:
possum_df = possum_df.dropna()

In [21]:
possum_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      101 non-null    int64  
 1   site      101 non-null    int64  
 2   Pop       101 non-null    object 
 3   sex       101 non-null    object 
 4   age       101 non-null    float64
 5   hdlngth   101 non-null    float64
 6   skullw    101 non-null    float64
 7   totlngth  101 non-null    float64
 8   taill     101 non-null    float64
 9   footlgth  101 non-null    float64
 10  earconch  101 non-null    float64
 11  eye       101 non-null    float64
 12  chest     101 non-null    float64
 13  belly     101 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.8+ KB


In [23]:
# Separating the features (x) and the labels (y)
x = possum_df.drop(["case", "site", "Pop", "sex"], axis=1)
y = possum_df['sex']

In [24]:
# Create the random forest
x_train, x_test, y_train, y_test = train_test_split(x , y, test_size=0.3, random_state=44)
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=50, max_features="auto", random_state=44)
rf_model.fit(x_train, y_train)

  warn(


In [25]:
# Make predictions with the model
predictions = rf_model.predict(x_test)
predictions

array(['f', 'm', 'm', 'm', 'f', 'f', 'm', 'm', 'm', 'm', 'm', 'f', 'm',
       'm', 'm', 'm', 'm', 'f', 'm', 'm', 'm', 'f', 'f', 'm', 'm', 'm',
       'm', 'm', 'm', 'm', 'f'], dtype=object)

In [27]:
rf_model.predict_proba(x_test)

array([[0.5 , 0.5 ],
       [0.26, 0.74],
       [0.36, 0.64],
       [0.3 , 0.7 ],
       [0.66, 0.34],
       [0.54, 0.46],
       [0.48, 0.52],
       [0.42, 0.58],
       [0.18, 0.82],
       [0.34, 0.66],
       [0.48, 0.52],
       [0.54, 0.46],
       [0.4 , 0.6 ],
       [0.28, 0.72],
       [0.48, 0.52],
       [0.36, 0.64],
       [0.4 , 0.6 ],
       [0.54, 0.46],
       [0.44, 0.56],
       [0.06, 0.94],
       [0.32, 0.68],
       [0.56, 0.44],
       [0.66, 0.34],
       [0.46, 0.54],
       [0.18, 0.82],
       [0.46, 0.54],
       [0.34, 0.66],
       [0.26, 0.74],
       [0.2 , 0.8 ],
       [0.1 , 0.9 ],
       [0.5 , 0.5 ]])

#### Each array contains two probabilities since it has two categories to predict: male or female

In [28]:
# Show the order of the classes
rf_model.classes_

array(['f', 'm'], dtype=object)

In [30]:
rf_model.feature_importances_

array([0.05237143, 0.1461762 , 0.08878178, 0.11322144, 0.07150935,
       0.16099193, 0.11990223, 0.11082213, 0.05286153, 0.08336197])

In [None]:
importances = rf_model.feature_importances_
columns = x.columns
i = 0

while i < len(columns):
    print(f" The importance of feature '{columns[i]}' is {round(importances[i] * 100, 2)}%.")
    i += 1

In [None]:
new_possum = [[7.0, 83.2, 54.3, 81.0, 37.0, 70.0, 46.3, 14.7, 25.0, 32.0]]