In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the avocado dataset. The CSV path is relative, so the file is expected
# in the notebook's working directory.
data = 'avocado-updated-2020.csv'
df = pd.read_csv(filepath_or_buffer=data)
# Preview the first five rows to sanity-check the parse.
df.head(5)


Unnamed: 0,date,average_price,total_volume,4046,4225,4770,total_bags,small_bags,large_bags,xlarge_bags,type,year,geography
0,1/4/2015,1.22,40873.28,2819.5,28287.42,49.9,9716.46,9186.93,529.53,0.0,conventional,2015,Albany
1,1/4/2015,1.79,1373.95,57.42,153.88,0.0,1162.65,1162.65,0.0,0.0,organic,2015,Albany
2,1/4/2015,1.0,435021.49,364302.39,23821.16,82.15,46815.79,16707.15,30108.64,0.0,conventional,2015,Atlanta
3,1/4/2015,1.76,3846.69,1500.15,938.35,0.0,1408.19,1071.35,336.84,0.0,organic,2015,Atlanta
4,1/4/2015,1.08,788025.06,53987.31,552906.04,39995.03,141136.68,137146.07,3990.61,0.0,conventional,2015,Baltimore/Washington


In [3]:
# Keep only the columns the model uses: three numeric features plus the
# 'type' label. Every other column of df (date, year, geography, the
# '4046'/'4225'/'4770' columns and the small/large/xlarge bag counts) is
# left out.
model_columns = ['average_price', 'total_volume', 'total_bags', 'type']
ml_df = df[model_columns]
ml_df.head()

Unnamed: 0,average_price,total_volume,total_bags,type
0,1.22,40873.28,9716.46,conventional
1,1.79,1373.95,1162.65,organic
2,1.0,435021.49,46815.79,conventional
3,1.76,3846.69,1408.19,organic
4,1.08,788025.06,141136.68,conventional


In [4]:
# Feature matrix: every column of ml_df except the 'type' label.
# drop() returns a new DataFrame, so ml_df itself is left untouched.
X = ml_df.drop(columns="type")
X.head()

Unnamed: 0,average_price,total_volume,total_bags
0,1.22,40873.28,9716.46
1,1.79,1373.95,1162.65
2,1.0,435021.49,46815.79
3,1.76,3846.69,1408.19
4,1.08,788025.06,141136.68


In [5]:
# Target vector: the 'type' column of ml_df as a NumPy array.
y = ml_df.loc[:, "type"].values
# Peek at the first five labels.
y[0:5]

array(['conventional', 'organic', 'conventional', 'organic',
       'conventional'], dtype=object)

In [6]:
# Re-derive features and target straight from ml_df: X is ml_df without the
# label column, and y is kept as a pandas Series (not an array).
X = ml_df.drop("type", axis=1)
y = ml_df["type"]

In [7]:
# Split the data into training and testing sets.
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here. No size is passed, so scikit-learn's default 75/25
# train/test proportions apply; random_state pins the shuffle so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
X_train.shape

(22515, 3)

In [8]:
# Report the dimensions of each piece of the train/test split, in the order
# X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(22515, 3)
(7506, 3)
(22515,)
(7506,)


In [9]:
# Second split with the training fraction stated explicitly: train_size=0.75
# gives a 75/25 train/test split (not 80/20). It uses the same random_state as
# the first split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, train_size=0.75, random_state=78
)

In [18]:
# Report the dimensions of each piece of the second split, in the order
# X_train2, X_test2, y_train2, y_test2.
for split_part in (X_train2, X_test2, y_train2, y_test2):
    print(split_part.shape)

(22515, 3)
(7506, 3)
(22515,)
(7506,)


In [19]:
# Disabled alternative: logistic-regression classifier (kept for reference, not run).
#from sklearn.linear_model import LogisticRegression
#classifier = LogisticRegression(solver='lbfgs',
#                                max_iter=200,
#                                random_state=1)

In [20]:
#classifier.fit(X_train, y_train)

In [21]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so test-set statistics never influence it.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler itself

# Apply the training-set scaling to the training and testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [22]:
# Decision tree classifier trained on the scaled training data.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore every metric computed from it, is the same on each
# Restart & Run All. Without it scikit-learn permutes the candidate features
# randomly at each split, so equally good splits can be chosen differently
# from run to run.
model = tree.DecisionTreeClassifier(random_state=78)
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [23]:
# Predict a 'type' label for every row of the scaled test set.
predictions = model.predict(X=X_test_scaled)

In [24]:
# Confusion matrix of actual vs. predicted labels on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap it in a labelled DataFrame for display. Rows are actual classes and
# columns are predicted classes; classes appear in sorted label order, so
# 0 is 'conventional' and 1 is 'organic'.
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=col_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3659,170
Actual 1,172,3505


In [25]:
# Fraction of test rows whose predicted label matches the actual label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [26]:
# Summarise the evaluation: confusion matrix, overall accuracy, and the
# per-class precision/recall/F1 report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3659,170
Actual 1,172,3505


Accuracy Score : 0.9544364508393285
Classification Report
              precision    recall  f1-score   support

conventional       0.96      0.96      0.96      3829
     organic       0.95      0.95      0.95      3677

    accuracy                           0.95      7506
   macro avg       0.95      0.95      0.95      7506
weighted avg       0.95      0.95      0.95      7506

