# Random Forest

## Read Dataset "crop_yield.csv"

In [4]:
import numpy as np
import pandas as pd

# Load the crop-yield table from the working directory and display it.
df = pd.read_csv(filepath_or_buffer='crop_yield.csv')
df

Unnamed: 0,Region,Soil_Type,Crop,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest,Yield_tons_per_hectare
0,West,Sandy,Cotton,897.077239,27.676966,False,True,Cloudy,122,6.555816
1,South,Clay,Rice,992.673282,18.026142,True,True,Rainy,140,8.527341
2,North,Loam,Barley,147.998025,29.794042,False,False,Sunny,106,1.127443
3,North,Sandy,Soybean,986.866331,16.644190,False,True,Rainy,146,6.517573
4,South,Silt,Wheat,730.379174,31.620687,True,True,Cloudy,110,7.248251
...,...,...,...,...,...,...,...,...,...,...
999995,West,Silt,Rice,302.805345,27.987428,False,False,Sunny,76,1.347586
999996,South,Chalky,Barley,932.991383,39.661039,True,False,Rainy,93,7.311594
999997,North,Peaty,Cotton,867.362046,24.370042,True,False,Cloudy,108,5.763182
999998,West,Silt,Wheat,492.812857,33.045505,False,False,Sunny,102,2.070159


## Feature Matrix and Label Vector

In [7]:
# NOTE(review): OneHotEncoder is imported but not used in the cells shown here;
# the encoding below is done with pd.get_dummies.
from sklearn.preprocessing import OneHotEncoder

# Input columns for the model; the target column is selected separately as `y`.
feature_cols = [
    "Region",
    "Soil_Type",
    "Crop",
    "Rainfall_mm",
    "Temperature_Celsius",
    "Fertilizer_Used",
    "Irrigation_Used",
    "Weather_Condition",
    "Days_to_Harvest",
]
# Categorical columns that get expanded into indicator (dummy) columns.
dummies = ["Region", "Soil_Type", "Crop", "Weather_Condition"]

X = df[feature_cols]
y = df["Yield_tons_per_hectare"]

X_cat = pd.get_dummies(df[dummies])
# Keep the non-categorical features first, then append the indicator columns.
X_new = pd.concat([X.drop(columns=dummies), X_cat], axis=1)
X_new

Unnamed: 0,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Days_to_Harvest,Region_East,Region_North,Region_South,Region_West,Soil_Type_Chalky,...,Soil_Type_Silt,Crop_Barley,Crop_Cotton,Crop_Maize,Crop_Rice,Crop_Soybean,Crop_Wheat,Weather_Condition_Cloudy,Weather_Condition_Rainy,Weather_Condition_Sunny
0,897.077239,27.676966,False,True,122,False,False,False,True,False,...,False,False,True,False,False,False,False,True,False,False
1,992.673282,18.026142,True,True,140,False,False,True,False,False,...,False,False,False,False,True,False,False,False,True,False
2,147.998025,29.794042,False,False,106,False,True,False,False,False,...,False,True,False,False,False,False,False,False,False,True
3,986.866331,16.644190,False,True,146,False,True,False,False,False,...,False,False,False,False,False,True,False,False,True,False
4,730.379174,31.620687,True,True,110,False,False,True,False,False,...,True,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,302.805345,27.987428,False,False,76,False,False,False,True,False,...,True,False,False,False,True,False,False,False,False,True
999996,932.991383,39.661039,True,False,93,False,False,True,False,True,...,False,True,False,False,False,False,False,False,True,False
999997,867.362046,24.370042,True,False,108,False,True,False,False,False,...,False,False,True,False,False,False,False,True,False,False
999998,492.812857,33.045505,False,False,102,False,False,False,True,False,...,True,False,False,False,False,False,True,False,False,True


## RF Regressor / n_estimators = 20 / random_state = 2

In [10]:
from sklearn.ensemble import RandomForestRegressor

# 20 bootstrapped trees with a fixed seed so repeated runs build the same forest.
my_RandomForest = RandomForestRegressor(
    n_estimators=20,
    bootstrap=True,
    random_state=2,
)

## Importances

In [16]:
# NOTE(review): `feature_importances_` only exists on a fitted estimator, and the
# `.fit(...)` call appears in a later cell of this notebook. On a fresh
# "Restart & Run All" this cell would run before the model is trained.
importances = my_RandomForest.feature_importances_

# Pair each encoded feature name with its importance, largest first.
importance_df = pd.DataFrame(
    dict(Feature=X_new.columns, Importance=importances)
).sort_values('Importance', ascending=False)

print(importance_df)

                     Feature  Importance
0                Rainfall_mm    0.606006
2            Fertilizer_Used    0.195932
3            Irrigation_Used    0.124729
1        Temperature_Celsius    0.028019
4            Days_to_Harvest    0.016048
6               Region_North    0.001598
5                Region_East    0.001595
8                Region_West    0.001586
7               Region_South    0.001578
23   Weather_Condition_Sunny    0.001562
21  Weather_Condition_Cloudy    0.001561
22   Weather_Condition_Rainy    0.001559
18                 Crop_Rice    0.001530
10            Soil_Type_Clay    0.001529
9           Soil_Type_Chalky    0.001527
13           Soil_Type_Sandy    0.001521
16               Crop_Cotton    0.001520
14            Soil_Type_Silt    0.001520
19              Crop_Soybean    0.001517
12           Soil_Type_Peaty    0.001515
15               Crop_Barley    0.001513
20                Crop_Wheat    0.001513
11            Soil_Type_Loam    0.001513
17              

## Training/Testing Split @ 0.2 & Training

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.2,
    random_state=2,
)
# Train the forest on the training portion (the fitted estimator is the cell output).
my_RandomForest.fit(X=X_train, y=y_train)

### Predict and Accuracy (R² Score)

In [19]:
# Predict the yield for every held-out row and print the resulting array.
y_predict_RF = my_RandomForest.predict(X=X_test)
print(y_predict_RF)

[2.97796209 6.63060058 7.43604143 ... 3.15028883 4.02680909 4.64113096]


In [21]:
# NOTE(review): accuracy_score is a classification metric and is not used in the
# cells shown here; the import is kept in case other cells rely on it.
from sklearn.metrics import accuracy_score

# For a regressor, .score() reports the coefficient of determination (R^2)
# on the given data, not a classification accuracy.
score_RF = my_RandomForest.score(X=X_test, y=y_test)
print(score_RF)

0.9038346039241637


In [46]:
from sklearn import metrics

# Root-mean-squared error of the first model's predictions on the test split.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_predict_RF)
rmse = np.sqrt(mse)
print(rmse)

0.5262149511909312


## Training/Testing Split @ 0.15 / 100 estimators & Training

In [50]:
from sklearn.ensemble import RandomForestRegressor

# Second experiment: 100 trees, evaluated on a 15% hold-out.
# This rebinds my_RandomForest and the split variables used by the cells below.
my_RandomForest = RandomForestRegressor(
    n_estimators=100,
    bootstrap=True,
    random_state=2,
)
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.15,
    random_state=2,
)
my_RandomForest.fit(X=X_train, y=y_train)

In [52]:
# Predictions and R^2 score of the currently bound model on the current test split.
y_predict_RF_2 = my_RandomForest.predict(X=X_test)
score_RF_2 = my_RandomForest.score(X=X_test, y=y_test)
print(score_RF_2)

0.9073469103456607


In [53]:
# Root-mean-squared error for the second set of predictions.
# Note: this overwrites the `mse` / `rmse` values from the earlier run.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_predict_RF_2)
rmse = np.sqrt(mse)
print(rmse)

0.5169534133313272


## Training/Testing Split @ 0.15 / 50 estimators & Training

In [57]:

# Third experiment: 50 trees, same 15% hold-out and seed as the previous run.
my_RandomForest = RandomForestRegressor(
    n_estimators=50,
    bootstrap=True,
    random_state=2,
)
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.15,
    random_state=2,
)
my_RandomForest.fit(X=X_train, y=y_train)

In [39]:
# NOTE(review): this cell's execution count is lower than that of the fit cell
# directly above it, so the saved output may come from an earlier model state.
# Re-run after fitting to confirm the printed score.
y_predict_RF_3 = my_RandomForest.predict(X=X_test)
# R^2 score of the currently bound model on the current test split.
score_RF_3 = my_RandomForest.score(X=X_test, y=y_test)
print(score_RF_3)

0.9064624295608745


In [59]:
# Root-mean-squared error for the third set of predictions.
# Note: this overwrites the `mse` / `rmse` values from the earlier runs.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_predict_RF_3)
rmse = np.sqrt(mse)
print(rmse)

0.5194150113743107
