## Mathews notebook for Machine Learning SVC modeling

### Load All Necessary Model Modules

In [1]:
# Load Starter Modules

# Python package for dataframe creation and manipulation
import pandas as pd
# Python package for low level math functions
import numpy as np
import os

In [2]:
# Load RandomForest Modules

# Python package for statistical modeling: RandomForests with Grid Search
from sklearn.datasets import make_classification
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load SVC Modules

# Python package for statistical modeling: support vector classifier
from sklearn.svm import SVC 
# Python package for statistical modeling: Scale dataset
from sklearn.preprocessing import StandardScaler
# Python package for statistical modeling: split dataset
from sklearn.model_selection import train_test_split
# Python package for statistical modeling: print report
from sklearn.metrics import classification_report

### Create Dataset

In [4]:
# Read File

honeybee_csv = pd.read_csv(os.path.join('Resources', 'Data', 'growth_designation.csv'))
honeybee_csv.head()

Unnamed: 0,ColonyCount,TotalProduction,YieldPerColony,PricePerLB,ProductionValue,Stocks,Clothianidin,Imidacloprid,Thiamethoxam,Acetamiprid,Thiacloprid,CombinedNeonic,GrowthOutcome
0,16000.0,928000.0,58,0.69,640000.0,28000.0,0.0,716.5,0.0,0.0,0.0,716.5,0
1,15000.0,960000.0,64,0.87,835000.0,96000.0,0.0,371.6,0.0,0.0,0.0,371.6,0
2,14000.0,924000.0,66,0.81,748000.0,92000.0,0.0,6704.8,0.0,0.0,0.0,6704.8,0
3,16000.0,1136000.0,71,0.72,818000.0,159000.0,0.0,1836.3,0.0,0.0,0.0,1836.3,1
4,17000.0,1156000.0,68,0.56,647000.0,185000.0,0.0,1251.2,0.0,0.0,0.0,1251.2,1


In [5]:
# Target Outcome Column

target = honeybee_csv['GrowthOutcome']
target_names = ['Growth', 'No-Growth']

In [6]:
# Drop Column

honeybee_data = honeybee_csv.drop(['GrowthOutcome'], axis=1)
honeybee_names = honeybee_data.columns
honeybee_data.head()

Unnamed: 0,ColonyCount,TotalProduction,YieldPerColony,PricePerLB,ProductionValue,Stocks,Clothianidin,Imidacloprid,Thiamethoxam,Acetamiprid,Thiacloprid,CombinedNeonic
0,16000.0,928000.0,58,0.69,640000.0,28000.0,0.0,716.5,0.0,0.0,0.0,716.5
1,15000.0,960000.0,64,0.87,835000.0,96000.0,0.0,371.6,0.0,0.0,0.0,371.6
2,14000.0,924000.0,66,0.81,748000.0,92000.0,0.0,6704.8,0.0,0.0,0.0,6704.8
3,16000.0,1136000.0,71,0.72,818000.0,159000.0,0.0,1836.3,0.0,0.0,0.0,1836.3
4,17000.0,1156000.0,68,0.56,647000.0,185000.0,0.0,1251.2,0.0,0.0,0.0,1251.2


In [7]:
# Split Data

x_train, x_test, y_train, y_test = train_test_split(honeybee_data, target, random_state=42)

### Perform Grid Search

In [8]:
# Create Grid Search

param_grid = {'max_depth': [3, 5, 10],
              'min_samples_split': [2, 5, 10]}
base_estimator = RandomForestClassifier(random_state=0)
x, y = make_classification(n_samples=1000, random_state=0)
sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5,
                         factor=2, resource='n_estimators',
                        max_resources=30).fit(x, y)
sh.best_estimator_
RandomForestClassifier(max_depth=5, n_estimators=800, random_state=0)

RandomForestClassifier(max_depth=5, n_estimators=800, random_state=0)

### Create Random Forest

In [9]:
# Create RandomForest Classifier

kepler_186f_RFClassifier = RandomForestClassifier(max_depth=5, n_estimators=24, random_state=0)
# Fit the RandomForestClassifier to train data
kepler_186f_RFClassifier = kepler_186f_RFClassifier.fit(x_train, y_train)
# Score the RandomForestClassifier with test data
print(kepler_186f_RFClassifier.score(x_test, y_test))
sorted(zip(kepler_186f_RFClassifier.feature_importances_, honeybee_names), reverse=True)

0.6428571428571429


[(0.1303758133915868, 'TotalProduction'),
 (0.12387689836702231, 'ProductionValue'),
 (0.11319546561805559, 'ColonyCount'),
 (0.11176609905722622, 'Thiamethoxam'),
 (0.11066149855042025, 'PricePerLB'),
 (0.09408924009400775, 'Imidacloprid'),
 (0.09179574678692493, 'Stocks'),
 (0.09062767512653663, 'YieldPerColony'),
 (0.06142337023373653, 'CombinedNeonic'),
 (0.026612863064090154, 'Clothianidin'),
 (0.02452014957498339, 'Acetamiprid'),
 (0.021055180135409506, 'Thiacloprid')]

### SVC Model 1 Build

In [10]:
# Create SVC Classifier

honeybee_svc_model = SVC(kernel='rbf', cache_size=5000)
#.fit(x_train, y_train)

In [11]:
# Fit Model

honeybee_svc_model.fit(x_train, y_train)

SVC(cache_size=5000)

In [12]:
# Scale Data

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [13]:
# SVC Model Accuracy

print(f'Test accuracy: %.3f' % honeybee_svc_model.score(x_train, y_train))
print(f'Actual accuracy: %.3f' % honeybee_svc_model.score(x_test, y_test))

Test accuracy: 0.671
Actual accuracy: 0.661


In [14]:
# Calculate Report

honeybee_prediction = honeybee_svc_model.predict(x_test)
print(classification_report(y_test, honeybee_prediction,
                            target_names = target_names))

              precision    recall  f1-score   support

      Growth       0.68      0.92      0.78       149
   No-Growth       0.48      0.15      0.22        75

    accuracy                           0.66       224
   macro avg       0.58      0.53      0.50       224
weighted avg       0.61      0.66      0.60       224



In [15]:
predictions = honeybee_svc_model.predict(x_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 1 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 1, 0, 1, 0, 0, 0, 0, 1, 0]


In [16]:
pd.DataFrame({"PredictedGrowth": predictions, "ActualGrowth": y_test}).reset_index(drop=True)

Unnamed: 0,PredictedGrowth,ActualGrowth
0,0,0
1,1,1
2,0,0
3,0,1
4,0,0
...,...,...
219,0,1
220,0,1
221,0,0
222,0,0


### SVC Model 2 Build 

In [17]:
# Keep Only

honeybee_data2 = honeybee_csv[['TotalProduction', 'ProductionValue', 'ColonyCount','Thiamethoxam','PricePerLB']]
honeybee_names = honeybee_data.columns
honeybee_data2.head()

Unnamed: 0,TotalProduction,ProductionValue,ColonyCount,Thiamethoxam,PricePerLB
0,928000.0,640000.0,16000.0,0.0,0.69
1,960000.0,835000.0,15000.0,0.0,0.87
2,924000.0,748000.0,14000.0,0.0,0.81
3,1136000.0,818000.0,16000.0,0.0,0.72
4,1156000.0,647000.0,17000.0,0.0,0.56


In [18]:
# Split Data

x_train, x_test, y_train, y_test = train_test_split(honeybee_data2, target, random_state=42)

In [19]:
# Create SVC Classifier

honeybee_svc_model2 = SVC(kernel='rbf', cache_size=5000).fit(x_train, y_train)

In [20]:
# Scale Data

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [21]:
# SVC Model Accuracy

print(f'Test accuracy: %.3f' % honeybee_svc_model2.score(x_train, y_train))
print(f'Actual accuracy: %.3f' % honeybee_svc_model2.score(x_test, y_test))

Test accuracy: 0.671
Actual accuracy: 0.656


In [22]:
# Calculate Report

honeybee_prediction = honeybee_svc_model2.predict(x_test)
print(classification_report(y_test, honeybee_prediction,
                            target_names = target_names))

              precision    recall  f1-score   support

      Growth       0.68      0.90      0.78       149
   No-Growth       0.46      0.17      0.25        75

    accuracy                           0.66       224
   macro avg       0.57      0.54      0.51       224
weighted avg       0.61      0.66      0.60       224



In [23]:
predictions2 = honeybee_svc_model2.predict(x_test)
print(f"First 10 Predictions:   {predictions2[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 1 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 1, 0, 1, 0, 0, 0, 0, 1, 0]


In [24]:
pd.DataFrame({"Predicted Growth": predictions2, "Actual_Growth": y_test}).reset_index(drop=True)

Unnamed: 0,Predicted Growth,Actual_Growth
0,0,0
1,1,1
2,0,0
3,0,1
4,0,0
...,...,...
219,0,1
220,0,1
221,0,0
222,0,0


### SVC Model 3 Build 

In [25]:
# Keep Only

honeybee_data3 = honeybee_csv[['Clothianidin', 'Imidacloprid', 'Thiamethoxam','Acetamiprid','Thiacloprid']]
honeybee_names = honeybee_data.columns
honeybee_data3.head()

Unnamed: 0,Clothianidin,Imidacloprid,Thiamethoxam,Acetamiprid,Thiacloprid
0,0.0,716.5,0.0,0.0,0.0
1,0.0,371.6,0.0,0.0,0.0
2,0.0,6704.8,0.0,0.0,0.0
3,0.0,1836.3,0.0,0.0,0.0
4,0.0,1251.2,0.0,0.0,0.0


In [26]:
# Split Data

x_train, x_test, y_train, y_test = train_test_split(honeybee_data3, target, random_state=1)

In [27]:
# Create SVC Classifier

honeybee_svc_model3 = SVC(kernel='rbf', cache_size=5000).fit(x_train, y_train)

In [28]:
# Scale Data

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [29]:
# SVC Model Accuracy

print(f'Test accuracy: %.3f' % honeybee_svc_model3.score(x_train, y_train))
print(f'Actual accuracy: %.3f' % honeybee_svc_model3.score(x_test, y_test))

Test accuracy: 0.668
Actual accuracy: 0.625


In [30]:
# Calculate Report

honeybee_prediction = honeybee_svc_model3.predict(x_test)
print(classification_report(y_test, honeybee_prediction,
                            target_names = target_names))

              precision    recall  f1-score   support

      Growth       0.62      0.99      0.77       139
   No-Growth       0.67      0.02      0.05        85

    accuracy                           0.62       224
   macro avg       0.65      0.51      0.41       224
weighted avg       0.64      0.62      0.49       224



In [31]:
predictions3 = honeybee_svc_model3.predict(x_test)
print(f"First 10 Predictions:   {predictions3[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


In [32]:
pd.DataFrame({"Predicted Growth": predictions3, "Actual_Growth": y_test}).reset_index(drop=True)

Unnamed: 0,Predicted Growth,Actual_Growth
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
...,...,...
219,0,0
220,1,1
221,0,0
222,0,0


### SVC Model 4 Build 

In [33]:
# Keep Only

honeybee_data4 = honeybee_csv[["CombinedNeonic"]]
honeybee_names = honeybee_data.columns
honeybee_data4.head()

Unnamed: 0,CombinedNeonic
0,716.5
1,371.6
2,6704.8
3,1836.3
4,1251.2


In [34]:
# Split Data

x_train, x_test, y_train, y_test = train_test_split(honeybee_data4, target, random_state=1)

In [35]:
# Create SVC Classifier

honeybee_svc_model4 = SVC(kernel='rbf', cache_size=5000).fit(x_train, y_train)

In [36]:
# Scale Data

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [37]:
# SVC Model Accuracy

print(f'Test accuracy: %.3f' % honeybee_svc_model4.score(x_train, y_train))
print(f'Actual accuracy: %.3f' % honeybee_svc_model4.score(x_test, y_test))

Test accuracy: 0.657
Actual accuracy: 0.616


In [38]:
# Calculate Report

honeybee_prediction = honeybee_svc_model4.predict(x_test)
print(classification_report(y_test, honeybee_prediction,
                            target_names = target_names))

              precision    recall  f1-score   support

      Growth       0.62      0.99      0.76       139
   No-Growth       0.33      0.01      0.02        85

    accuracy                           0.62       224
   macro avg       0.48      0.50      0.39       224
weighted avg       0.51      0.62      0.48       224



In [39]:
predictions4 = honeybee_svc_model4.predict(x_test)
print(f"First 10 Predictions:   {predictions4[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


In [40]:
pd.DataFrame({"Predicted Growth": predictions4, "Actual_Growth": y_test}).reset_index(drop=True)

Unnamed: 0,Predicted Growth,Actual_Growth
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
...,...,...
219,0,0
220,1,1
221,0,0
222,0,0


### SVC Model 5 Build 

In [41]:
# Keep Only

honeybee_data5 = honeybee_csv[['ColonyCount', 'TotalProduction', 'YieldPerColony', 'PricePerLB', 'ProductionValue', 'Stocks']]
honeybee_names = honeybee_data.columns
honeybee_data5.head()

Unnamed: 0,ColonyCount,TotalProduction,YieldPerColony,PricePerLB,ProductionValue,Stocks
0,16000.0,928000.0,58,0.69,640000.0,28000.0
1,15000.0,960000.0,64,0.87,835000.0,96000.0
2,14000.0,924000.0,66,0.81,748000.0,92000.0
3,16000.0,1136000.0,71,0.72,818000.0,159000.0
4,17000.0,1156000.0,68,0.56,647000.0,185000.0


In [42]:
# Split Data

x_train, x_test, y_train, y_test = train_test_split(honeybee_data5, target, random_state=1)

In [43]:
# Create SVC Classifier

honeybee_svc_model5 = SVC(kernel='rbf', cache_size=5000).fit(x_train, y_train)

In [44]:
# Scale Data

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [45]:
# SVC Model Accuracy

print(f'Test accuracy: %.3f' % honeybee_svc_model5.score(x_train, y_train))
print(f'Actual accuracy: %.3f' % honeybee_svc_model5.score(x_test, y_test))

Test accuracy: 0.677
Actual accuracy: 0.625


In [46]:
# Calculate Report

honeybee_prediction = honeybee_svc_model5.predict(x_test)
print(classification_report(y_test, honeybee_prediction,
                            target_names = target_names))

              precision    recall  f1-score   support

      Growth       0.63      0.97      0.76       139
   No-Growth       0.56      0.06      0.11        85

    accuracy                           0.62       224
   macro avg       0.59      0.52      0.43       224
weighted avg       0.60      0.62      0.51       224



In [47]:
predictions5 = honeybee_svc_model5.predict(x_test)
print(f"First 10 Predictions:   {predictions5[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 1 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


In [48]:
pd.DataFrame({"Predicted Growth": predictions5, "Actual_Growth": y_test}).reset_index(drop=True)

Unnamed: 0,Predicted Growth,Actual_Growth
0,0,0
1,0,0
2,0,1
3,1,0
4,0,0
...,...,...
219,0,0
220,0,1
221,0,0
222,0,0
