In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import os

In [2]:
raw_data = pd.read_csv(os.path.join("Resources", "Data", "growth_designation.csv"))
raw_data

Unnamed: 0,ColonyCount,TotalProduction,YieldPerColony,PricePerLB,ProductionValue,Stocks,Clothianidin,Imidacloprid,Thiamethoxam,Acetamiprid,Thiacloprid,CombinedNeonic,GrowthOutcome
0,16000.0,928000.0,58,0.69,640000.0,28000.0,0.0,716.5,0.0,0.0,0.0,716.5,0
1,15000.0,960000.0,64,0.87,835000.0,96000.0,0.0,371.6,0.0,0.0,0.0,371.6,0
2,14000.0,924000.0,66,0.81,748000.0,92000.0,0.0,6704.8,0.0,0.0,0.0,6704.8,0
3,16000.0,1136000.0,71,0.72,818000.0,159000.0,0.0,1836.3,0.0,0.0,0.0,1836.3,1
4,17000.0,1156000.0,68,0.56,647000.0,185000.0,0.0,1251.2,0.0,0.0,0.0,1251.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
890,50000.0,2550000.0,51,1.87,4769000.0,459000.0,1174.9,277.0,423.3,0.0,0.0,1875.2,1
891,47000.0,3102000.0,66,2.11,6545000.0,558000.0,840.9,155.9,526.2,0.0,0.0,1523.0,0
892,38000.0,2318000.0,61,2.08,4821000.0,255000.0,1262.0,258.2,566.1,0.0,0.0,2086.3,0
893,38000.0,2926000.0,77,1.90,5559000.0,146000.0,0.0,114.4,19.0,0.0,0.0,133.4,0


In [3]:
classifier = LogisticRegression(max_iter=500)
classifier

LogisticRegression(max_iter=500)

<h3 style="color:green">Model 1</h3>
<strong>Features: </strong>All columns

In [4]:
X1 = raw_data.drop(columns=['GrowthOutcome'])
y1 = raw_data["GrowthOutcome"]
print(X1.shape, y1.shape)

(895, 12) (895,)


In [5]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=1)

In [6]:
classifier.fit(X1_train, y1_train)

LogisticRegression(max_iter=500)

In [7]:
print(f"Training Data Score: {classifier.score(X1_train, y1_train)}")
print(f"Testing Data Score: {classifier.score(X1_test, y1_test)}")

Training Data Score: 0.6065573770491803
Testing Data Score: 0.5892857142857143


In [8]:
predictions1 = classifier.predict(X1_test)
print(f"First 10 Predictions:   {predictions1[:10]}")
print(f"First 10 Actual labels: {y1_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 1 0 0 0]
First 10 Actual labels: [0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


In [9]:
pd.DataFrame({"Prediction": predictions1, "Actual": y1_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
...,...,...
219,1,0
220,0,1
221,0,0
222,0,0


<h3 style="color:green">Model 2</h3>
<strong>Features: </strong>ProductionValue, TotalProduction, ImidaclopridLB, Stocks, PricePerLB
<br>(top five from random forest model)

In [10]:
X2 = raw_data[['ProductionValue', 'TotalProduction', 'Imidacloprid','Stocks','PricePerLB']]
y2 = raw_data["GrowthOutcome"]
# X2.head()
# y2.head()
print(X2.shape, y2.shape)

(895, 5) (895,)


In [11]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=1)

In [12]:
classifier.fit(X2_train, y2_train)
print(f"Training Data Score: {classifier.score(X2_train, y2_train)}")
print(f"Testing Data Score: {classifier.score(X2_test, y2_test)}")

Training Data Score: 0.6229508196721312
Testing Data Score: 0.5803571428571429


In [13]:
predictions2 = classifier.predict(X2_test)
print(f"First 10 Predictions:   {predictions2[:10]}")
print(f"First 10 Actual labels: {y2_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


In [14]:
pd.DataFrame({"Prediction": predictions2, "Actual": y2_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
...,...,...
219,0,0
220,0,1
221,0,0
222,1,0


<h3 style="color:green">Model 3</h3>
<strong>Features: </strong> ClothianidinLB, ImidaclopridLB, ThiamethoxamLB, AcetamipridLB, ThiaclopridLB
<br>(each neonic pesticide)

In [15]:
X3 = raw_data[['Clothianidin', 'Imidacloprid', 'Thiamethoxam','Acetamiprid','Thiacloprid']]
y3 = raw_data["GrowthOutcome"]
# X3.head()
# y3.head()
print(X3.shape, y3.shape)

(895, 5) (895,)


In [16]:
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state=1)
classifier.fit(X3_train, y3_train)
print(f"Training Data Score: {classifier.score(X3_train, y3_train)}")
print(f"Testing Data Score: {classifier.score(X3_test, y3_test)}")

Training Data Score: 0.6602086438152012
Testing Data Score: 0.6116071428571429


In [17]:
predictions3 = classifier.predict(X3_test)
print(f"First 10 Predictions:   {predictions3[:10]}")
print(f"First 10 Actual labels: {y3_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 1 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


In [18]:
pd.DataFrame({"Prediction": predictions3, "Actual": y3_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,1,0
4,0,0
...,...,...
219,0,0
220,1,1
221,0,0
222,0,0


<h3 style="color:green">Model 4</h3>
<strong>Features: </strong> CombinedNeonic

In [19]:
X4 = raw_data[["CombinedNeonic"]]
y4 = raw_data["GrowthOutcome"]
# X4.head()
# y4.head()
print(X4.shape, y4.shape)

(895, 1) (895,)


In [20]:
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, random_state=1)
classifier.fit(X4_train, y4_train)
print(f"Training Data Score: {classifier.score(X4_train, y4_train)}")
print(f"Testing Data Score: {classifier.score(X4_test, y4_test)}")

Training Data Score: 0.6497764530551415
Testing Data Score: 0.6205357142857143


In [21]:
predictions4 = classifier.predict(X4_test)
print(f"First 10 Predictions:   {predictions4[:10]}")
print(f"First 10 Actual labels: {y4_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


In [22]:
pd.DataFrame({"Prediction": predictions4, "Actual": y4_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
...,...,...
219,0,0
220,0,1
221,0,0
222,0,0


<h3 style="color:green">Model 5</h3>
<strong>Features: </strong> ColonyCount, TotalProduction, YieldPerColony, PricePerLB, ProductionValue, Stocks
<br> (excludes all neonic pesticide data)

In [23]:
X5 = raw_data.drop(columns=['Clothianidin', 'Imidacloprid', 'Thiamethoxam','Acetamiprid','Thiacloprid', 'CombinedNeonic'])
y5 = raw_data["GrowthOutcome"]
# X5.head()
# y5.head()
print(X5.shape, y5.shape)

(895, 7) (895,)


In [24]:
X5_train, X5_test, y5_train, y5_test = train_test_split(X5, y5, random_state=1)
classifier.fit(X5_train, y5_train)
print(f"Training Data Score: {classifier.score(X5_train, y5_train)}")
print(f"Testing Data Score: {classifier.score(X5_test, y5_test)}")

Training Data Score: 0.6080476900149031
Testing Data Score: 0.6160714285714286


In [25]:
predictions5 = classifier.predict(X5_test)
print(f"First 10 Predictions:   {predictions5[:10]}")
print(f"First 10 Actual labels: {y5_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 1 0 0 0]
First 10 Actual labels: [0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


In [26]:
pd.DataFrame({"Prediction": predictions5, "Actual": y5_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
...,...,...
219,0,0
220,1,1
221,0,0
222,0,0


<h2 style= "color:green">Model Review</h2>
<strong>Model 1:</strong><br>
<ul> Training Data Score: 0.6104477611940299</ul>
<ul>Testing Data Score: 0.6116071428571429</ul>
<strong>Model 2</strong>
<ul> Training Data Score: 0.5970149253731343</ul>
<ul>Testing Data Score: 0.5580357142857143</ul>
<strong>Model 3</strong>
<ul> Training Data Score: 0.6447761194029851</ul>
<ul>Testing Data Score: 0.6517857142857143</ul>
<strong>Model 4</strong>
<ul> Training Data Score: 0.6417910447761194</ul>
<ul>Testing Data Score: 0.6651785714285714</ul>
<strong>Model 5</strong>
<ul> Training Data Score: 0.5850746268656717</ul>
<ul>Testing Data Score: 0.5758928571428571</ul>