<font color='blue'>Cell 1 Loading necessary packages

In [None]:
import numpy as np
import pandas as pd
import math

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

<font color='blue'>Cell 2 
Reading data and defining the dependent variable for regression analysis

In [None]:
data = pd.read_csv('data1.csv')

#select a continuous variable (float64) to be the dependent variable for regression analysis
print(data.dtypes) #View column names and types
dep_var = data.columns[0] #Edit the column number (choosing a value between 0 and 3) to select different continuous variables as your predicted variable (ie dependent variable)

sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object


<font color='blue'>Cell 3
Data set verification

In [None]:
#Check the dataset to make sure no data is missing and check the class labels
def verify_dataset(dataset):
  #if any of the rows have missing value return datas missing
  data_found = 1
  for each_column in dataset.columns:
    if dataset[each_column].isnull().any():
            print("Data missing in Column " + each_column)
            #if any rows are not missing return Dataset is complete. No missing value
            quit()
    if data_found == 1:
        print("Dataset is complete. No missing values")
    return

verify_dataset(data)

Dataset is complete. No missing values


<font color='blue'>Cell 4
Create dummy variables for any categorical columns (ie one-hot encode)

In [None]:
#One-hot encoding for any categorical independent variables (predictors)
def get_dummy_var(dataset):
  binarized = pd.get_dummies(dataset)
  return binarized

binarized = get_dummy_var(data)
print(binarized)

     sepal.length  sepal.width  ...  variety_Versicolor  variety_Virginica
0             5.1          3.5  ...                   0                  0
1             4.9          3.0  ...                   0                  0
2             4.7          3.2  ...                   0                  0
3             4.6          3.1  ...                   0                  0
4             5.0          3.6  ...                   0                  0
..            ...          ...  ...                 ...                ...
145           6.7          3.0  ...                   0                  1
146           6.3          2.5  ...                   0                  1
147           6.5          3.0  ...                   0                  1
148           6.2          3.4  ...                   0                  1
149           5.9          3.0  ...                   0                  1

[150 rows x 7 columns]


<font color='blue'>Cell 5
Data set splitting function

In [None]:
##Splitting The Database in training and testing
def split_dataset_test_train(binarized):
    training_data = binarized.iloc[:int(0.7 * len(binarized))].reset_index(drop=True)
##Determine the integer location (iloc) from beginning of array (:) to 0.7*150 and do a ”cleanup” with a reset call
    testing_data = binarized.iloc[int(0.7 * len(binarized)):].reset_index(drop=True)
##Determine the integer location (iloc) from 0.7*150 to end of array (: ) and do a ”cleanup” with a reset call
    return [training_data, testing_data]

testtrain = split_dataset_test_train(binarized)
print(testtrain)

[     sepal.length  sepal.width  ...  variety_Versicolor  variety_Virginica
0             5.1          3.5  ...                   0                  0
1             4.9          3.0  ...                   0                  0
2             4.7          3.2  ...                   0                  0
3             4.6          3.1  ...                   0                  0
4             5.0          3.6  ...                   0                  0
..            ...          ...  ...                 ...                ...
100           6.3          3.3  ...                   0                  1
101           5.8          2.7  ...                   0                  1
102           7.1          3.0  ...                   0                  1
103           6.3          2.9  ...                   0                  1
104           6.5          3.0  ...                   0                  1

[105 rows x 7 columns],     sepal.length  sepal.width  ...  variety_Versicolor  variety_Virginica


<font color='blue'>Cell 6
Random forest regression model: build, fit and predict 

In [None]:
rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=0) #Define the model

def rfReg(train_data, test_data, dep_var):
  train_dep_var = training_data[dep_var]
  train_ind_var = training_data[training_data.columns.difference([dep_var])]
  test_ind_var = testing_data[testing_data.columns.difference([dep_var])]
  rf.fit(train_ind_var, train_dep_var) #Train the model
  predicted_test = rf.predict(test_ind_var) #Predict with the test data
  return predicted_test

<font color='blue'>Cell 7
Testing the model

In [None]:
#Let's put the model to the test! 
testing_data = testtrain[1]
training_data = testtrain[0]

predicted_test = rfReg(training_data, testing_data, dep_var) #Function above

test_dep_var = testing_data[dep_var]

#Calculate accuracy statistics
test_r2_score = r2_score(test_dep_var, predicted_test) #R squared statistic
test_r_score = math.sqrt(test_r2_score) #Pearson's R statistic
model_error = abs(predicted_test - test_dep_var) #Mean absolute error
mape = 100 * (model_error / test_dep_var) # Mean absolute percentage error (MAPE)
accuracy = 100 - np.mean(mape) #Accuracy

print('Accuracy:', round(accuracy, 2), '%')
print('Mean Absolute Error:', round(np.mean(model_error), 2), 'degrees')
print(f'Out-of-bag R-2 score estimate: {rf.oob_score_:>5.3}') #Produced when model was built
print(f'Test data R-2 score: {test_r2_score:>5.3}')
print(f'Test data R score: {test_r_score:>5.3}')

#Note that the Iris measurement variables are not well predicted by any combination of the other measurements. However, you can easily apply this script to your dataset.

Accuracy: 93.52 %
Mean Absolute Error: 0.44 degrees
Out-of-bag R-2 score estimate: 0.737
Test data R-2 score: 0.197
Test data R score: 0.444


<font color='blue'>Cell 8
Calculate relative importance of test data independent variables (predictors)

In [None]:
#Extract relative importance of each independent variable (ie predictor variables)
#The values produced are the mean decrease in the accuracy for each independent variable divided by the total mean decrease in accuracy of all independent variables
#You may want to rerun the program with the most important independent variables to improve accuracy
test_ind_var = testing_data[testing_data.columns.difference([dep_var])]
importances = list(rf.feature_importances_)
feature_list = list(test_ind_var.columns)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
for pair in feature_importances:
  print('Variable: {:20} Importance: {}'.format(*pair))

Variable: petal.length         Importance: 0.79
Variable: sepal.width          Importance: 0.13
Variable: petal.width          Importance: 0.07
Variable: variety_Setosa       Importance: 0.0
Variable: variety_Versicolor   Importance: 0.0
Variable: variety_Virginica    Importance: 0.0
