In [1]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
raw_data = pd.read_csv(os.path.join("Resources", "Data", "growth_designation.csv"))
# raw_data

In [3]:
target = raw_data['GrowthOutcome']
target_names = ['Growth', 'No-Growth']

In [4]:
X = raw_data.drop(columns=['GrowthOutcome'])
y = raw_data["GrowthOutcome"]
feature_names = X.columns
print(X.shape, y.shape)

(895, 12) (895,)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# X_train

In [6]:
# X_test

In [7]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5044642857142857

In [8]:
rf = RandomForestClassifier(n_estimators=800)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.625

In [9]:
feature_importance = sorted(zip(rf.feature_importances_, feature_names), reverse=True)
feature_importance

[(0.1149461156140583, 'ProductionValue'),
 (0.10571708939374842, 'TotalProduction'),
 (0.10355266695243368, 'Imidacloprid'),
 (0.10321059865971068, 'PricePerLB'),
 (0.1024884553938062, 'Stocks'),
 (0.10028041392245982, 'CombinedNeonic'),
 (0.09372191711015326, 'YieldPerColony'),
 (0.08727195354152362, 'ColonyCount'),
 (0.0683597781051893, 'Thiamethoxam'),
 (0.050306533737189324, 'Clothianidin'),
 (0.049328940295159635, 'Acetamiprid'),
 (0.02081553727456775, 'Thiacloprid')]

In [10]:
from sklearn.datasets import make_classification
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

X2_train, X2_test, y2_train, y2_test = train_test_split(X, y, random_state=42)

# Create Grid Search

param_grid = {'max_depth': [3, 5, 10],
              'min_samples_split': [2, 5, 10]}
base_estimator = RandomForestClassifier(random_state=0)
X, y = make_classification(n_samples=1000, random_state=0)
sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5,
                         factor=2, resource='n_estimators',
                        max_resources=30).fit(X, y)
sh.best_estimator_
RandomForestClassifier(max_depth=5, n_estimators=800, random_state=0)

ImportError: cannot import name 'enable_halving_search_cv' from 'sklearn.experimental' (C:\Users\Lauren\anaconda3\lib\site-packages\sklearn\experimental\__init__.py)

In [None]:
# Create RandomForest Classifier

kepler_186f_RFClassifier = RandomForestClassifier(max_depth=5, n_estimators=24, random_state=0)
# Fit the RandomForestClassifier to train data
kepler_186f_RFClassifier = kepler_186f_RFClassifier.fit(X2_train, y2_train)
# Score the RandomForestClassifier with test data
print(kepler_186f_RFClassifier.score(X2_test, y2_test))
feature_importance2 = sorted(zip(kepler_186f_RFClassifier.feature_importances_, feature_names), reverse=True)
feature_importance2

<h5> Create CSV files of the feature importances from both random forest models to be used in the tableau visuals</h5>

In [None]:
# Standard Random Forest Classifier
rf1_features = {}
for importance, feature in feature_importance:
    rf1_features[feature] = importance
rf1_importance_df = pd.DataFrame.from_dict(rf1_features, orient='index')


# Random Forest Classifier with Grid Search
rf2_features = {}
for importance, feature in feature_importance2:
    rf2_features[feature] = importance
rf2_importance_df = pd.DataFrame.from_dict(rf2_features, orient='index')

# rf1_importance_df
rf2_importance_df

In [None]:
joined_features = rf1_importance_df.merge(rf2_importance_df,left_index=True, right_index=True).reset_index()
joined_features = joined_features.rename(columns={"index":"Feature", "0_x":"RF", "0_y":"RF & GS"})
joined_features.to_csv("Resources/Data/rf_feature_importance.csv", index=False)
joined_features

In [None]:
from sklearn.metrics import classification_report
honeybee_prediction = rf.predict(X_test)
print(classification_report(y_test, honeybee_prediction))