# ☕ How Special is Your Specialty Coffee? - Model Development

In [1]:
# Imports
import pandas as pd
import numpy as np

# sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

from joblib import dump, load

In [2]:
df = pd.read_csv("../data/df_combined.csv", index_col=[0])

# Delete duplicates
df=df.drop_duplicates(keep="last")

display(df.head())

print(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns")

Unnamed: 0,score,altitude,region,processing,variety
0,90.58,2075.0,Africa,Washed / Wet,Ethiopian Varieties
1,89.92,2075.0,Africa,Washed / Wet,Ethiopian Varieties
2,89.75,1700.0,Central America,Washed / Wet,Bourbon
3,89.0,2000.0,Africa,Natural / Dry,Ethiopian Varieties
4,88.83,2075.0,Africa,Washed / Wet,Ethiopian Varieties


The dataset has 1232 rows and 5 columns


In [3]:
# Binarizing "score"
df.drop(df[df["score"]<80].index, inplace=True)

for idx in df.index:
    if df.loc[idx,"score"] >=85:
        df.loc[idx, "score_binary"] = 1
    else:
        df.loc[idx, "score_binary"] = 0

# Data is imbalanced
df.loc[:,"score_binary"].value_counts()

0.0    1099
1.0     123
Name: score_binary, dtype: int64

In [4]:
# Separate majority and minority classes
df_majority = df[df["score_binary"]==0]
df_minority = df[df["score_binary"]==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=1099,    # to match majority class
                                 random_state=42) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled["score_binary"].value_counts()

1.0    1099
0.0    1099
Name: score_binary, dtype: int64

In [5]:
# Scaling numeric features
numeric_features = ["altitude"]
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Onehot encoding for categorical features
categorical_features = ["region", "processing", "variety"]
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False, handle_unknown="ignore"))])
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features),
                  ('cat', categorical_transformer, categorical_features)])

# Pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state = 42))])

In [6]:
# Create feature/target matrix
features_upsampled = df_upsampled[["altitude", "region", "processing", "variety"]]
target_upsampled = df_upsampled["score_binary"]

# Split into training and test sets
features_train, features_test, target_train, target_test = train_test_split(features_upsampled, 
                                                                            target_upsampled, 
                                                                            test_size = 0.3, 
                                                                            random_state = 42)
clf.fit(features_train, target_train)

# Cross validation score
cv_score = cross_val_score(clf, features_train, target_train, cv=10, scoring="accuracy")
print(f"Mean cross validation accuracy: {np.mean(cv_score)}")

# Test score
target_pred = clf.predict(features_test)
print(f"Test accuracy: {accuracy_score(target_test, target_pred)}")

Mean cross validation accuracy: 0.8634623546388251
Test accuracy: 0.8848484848484849


In [7]:
# Save model
dump(clf, '../model/coffee_clf.joblib') 

['../model/coffee_clf.joblib']