In [1]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
raw_data = pd.read_csv(os.path.join("Resources", "Data", "growth_designation.csv"))
# raw_data

In [3]:
target = raw_data['GrowthOutcome']
target_names = ['Growth', 'No-Growth']

In [4]:
X = raw_data.drop(columns=['GrowthOutcome'])
y = raw_data["GrowthOutcome"]
feature_names = X.columns
print(X.shape, y.shape)

(895, 12) (895,)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# X_train

In [6]:
# X_test

In [7]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.49107142857142855

In [8]:
rf = RandomForestClassifier(n_estimators=800)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.6428571428571429

In [9]:
feature_importance = sorted(zip(rf.feature_importances_, feature_names), reverse=True)
feature_importance

[(0.11186537245449633, 'ProductionValue'),
 (0.10726656990452271, 'TotalProduction'),
 (0.10541548239764326, 'PricePerLB'),
 (0.10355820277117402, 'Stocks'),
 (0.10293043565762663, 'Imidacloprid'),
 (0.09878921787654292, 'CombinedNeonic'),
 (0.09200817734046858, 'YieldPerColony'),
 (0.08847388895534404, 'ColonyCount'),
 (0.06902956463959636, 'Thiamethoxam'),
 (0.050857160123523495, 'Acetamiprid'),
 (0.04910454890976588, 'Clothianidin'),
 (0.020701378969295685, 'Thiacloprid')]

In [10]:
from sklearn.datasets import make_classification
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

X2_train, X2_test, y2_train, y2_test = train_test_split(X, y, random_state=42)

# Create Grid Search

param_grid = {'max_depth': [3, 5, 10],
              'min_samples_split': [2, 5, 10]}
base_estimator = RandomForestClassifier(random_state=0)
X, y = make_classification(n_samples=1000, random_state=0)
sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5,
                         factor=2, resource='n_estimators',
                        max_resources=30).fit(X, y)
sh.best_estimator_
RandomForestClassifier(max_depth=5, n_estimators=800, random_state=0)

RandomForestClassifier(max_depth=5, n_estimators=800, random_state=0)

In [11]:
# Create RandomForest Classifier

kepler_186f_RFClassifier = RandomForestClassifier(max_depth=5, n_estimators=24, random_state=0)
# Fit the RandomForestClassifier to train data
kepler_186f_RFClassifier = kepler_186f_RFClassifier.fit(X2_train, y2_train)
# Score the RandomForestClassifier with test data
print(kepler_186f_RFClassifier.score(X2_test, y2_test))
feature_importance2 = sorted(zip(kepler_186f_RFClassifier.feature_importances_, feature_names), reverse=True)
feature_importance2

0.6428571428571429


[(0.1303758133915868, 'TotalProduction'),
 (0.12387689836702231, 'ProductionValue'),
 (0.11319546561805559, 'ColonyCount'),
 (0.11176609905722622, 'Thiamethoxam'),
 (0.11066149855042025, 'PricePerLB'),
 (0.09408924009400775, 'Imidacloprid'),
 (0.09179574678692493, 'Stocks'),
 (0.09062767512653663, 'YieldPerColony'),
 (0.06142337023373653, 'CombinedNeonic'),
 (0.026612863064090154, 'Clothianidin'),
 (0.02452014957498339, 'Acetamiprid'),
 (0.021055180135409506, 'Thiacloprid')]

<h5> Create CSV files of the feature importances from both random forest models to be used in the tableau visuals</h5>

In [12]:
# Standard Random Forest Classifier
rf1_features = {}
for importance, feature in feature_importance:
    rf1_features[feature] = importance
rf1_importance_df = pd.DataFrame.from_dict(rf1_features, orient='index')


# Random Forest Classifier with Grid Search
rf2_features = {}
for importance, feature in feature_importance2:
    rf2_features[feature] = importance
rf2_importance_df = pd.DataFrame.from_dict(rf2_features, orient='index')

# rf1_importance_df
rf2_importance_df

Unnamed: 0,0
TotalProduction,0.130376
ProductionValue,0.123877
ColonyCount,0.113195
Thiamethoxam,0.111766
PricePerLB,0.110661
Imidacloprid,0.094089
Stocks,0.091796
YieldPerColony,0.090628
CombinedNeonic,0.061423
Clothianidin,0.026613


In [21]:
joined_features = rf1_importance_df.merge(rf2_importance_df,left_index=True, right_index=True).reset_index()
joined_features = joined_features.rename(columns={"index":"Feature", "0_x":"RF", "0_y":"RF & GS"})
joined_features.to_csv("Resources/Data/rf_feature_importance.csv", index=False)
joined_features

Unnamed: 0,Feature,RF,RF & GS
0,ProductionValue,0.111865,0.123877
1,TotalProduction,0.107267,0.130376
2,PricePerLB,0.105415,0.110661
3,Stocks,0.103558,0.091796
4,Imidacloprid,0.10293,0.094089
5,CombinedNeonic,0.098789,0.061423
6,YieldPerColony,0.092008,0.090628
7,ColonyCount,0.088474,0.113195
8,Thiamethoxam,0.06903,0.111766
9,Acetamiprid,0.050857,0.02452


In [14]:
from sklearn.metrics import classification_report
honeybee_prediction = rf.predict(X_test)
print(classification_report(y_test, honeybee_prediction))

              precision    recall  f1-score   support

           0       0.66      0.88      0.75       139
           1       0.57      0.25      0.34        85

    accuracy                           0.64       224
   macro avg       0.61      0.57      0.55       224
weighted avg       0.62      0.64      0.60       224

