**Load Review Data**

In [2]:
import pandas as pd

df = pd.read_parquet('data/wine/wine_review.parquet.gzip')
df['review_len'] = df.description.str.len()
print(df.shape)
df[['title', 'description', 'variety', 'points', 'country', 'taster_name', 'review_len']].head()

(100538, 17)


Unnamed: 0,title,description,variety,points,country,taster_name,review_len
0,Nicosia 2013 Vulkà Bianco (Etna),"Aromas include tropical fruit, broom, brimston...",White Blend,87,Italy,Kerin O’Keefe,172
1,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...",Portuguese Red,87,Portugal,Roger Voss,227
2,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...",Pinot Gris,87,US,Paul Gregutt,186
3,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",Riesling,87,US,Alexander Peartree,199
4,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",Pinot Noir,87,US,Paul Gregutt,249


**Load Embeddings**

In [3]:
mpnet = pd.read_parquet('data/wine/mpnet_embeddings.parquet.gzip')
mpnet.columns = mpnet.columns.astype(str)
mpnet.shape

(100538, 768)

**Define Feature Matrix & Target Vector**

In [5]:
X, y = pd.concat([df[['variety', 'year', 'country', 'location']], mpnet], axis=1), df.points
X.head()

Unnamed: 0,variety,year,country,location,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
0,White Blend,2013,Italy,Etna,0.036825,0.040504,-0.049127,0.01206,-0.017809,0.04519,...,-0.018225,-0.022451,-0.030674,0.030395,-0.036411,0.07518,-0.02167,0.017032,-0.024503,-0.021728
1,Portuguese Red,2011,Portugal,Douro,0.0154,0.07619,-0.017931,0.004965,-0.076234,0.021419,...,-0.071278,0.033845,-0.002066,0.018129,-0.004691,0.028105,-0.04277,0.010317,0.063025,-0.026926
2,Pinot Gris,2013,US,Oregon,0.032447,0.012291,-0.034564,-0.020621,-0.04829,0.031377,...,-0.126703,0.035472,0.029064,0.006745,-0.012355,0.030802,-0.012589,0.024286,0.023172,-0.0172
3,Riesling,2013,US,Michigan,0.076041,0.044183,-0.017158,-0.041186,-0.030582,0.051051,...,-0.063876,-0.010313,0.004829,0.009389,-0.011263,0.035524,-0.014419,0.030267,-0.036529,-0.043187
4,Pinot Noir,2012,US,Oregon,0.011352,0.057223,-0.023293,0.022438,-0.078027,0.046609,...,-0.058865,0.031108,0.006709,0.020604,-0.017218,0.023554,-0.039825,0.021364,0.025454,-0.0223


**Training/Test Split**

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)
X_train.head()

Unnamed: 0,variety,year,country,location,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
71166,Sangiovese,2004,Italy,Tuscany,-0.015517,-0.028565,-0.026781,0.0537,-0.004936,0.020011,...,-0.097514,-0.021607,0.013403,0.001048,-0.045254,0.019765,-0.018652,-0.001024,0.030262,0.033093
80735,Red Blend,2009,US,Washington,0.04097,-0.026239,-0.024915,-0.00561,-0.063496,-0.00915,...,-0.023481,0.030308,-0.01774,0.000611,0.005001,0.068376,-0.066297,0.01978,0.000633,-0.031555
25052,Sauvignon Blanc,2016,France,Côtes de Gascogne,0.068519,-0.005688,0.005931,0.014898,-0.036191,0.010095,...,-0.052174,-0.017084,0.017666,0.028166,-0.028412,0.006846,-0.05183,0.015403,0.043663,-0.062009
116451,Bordeaux-style Red Blend,2006,US,Colorado,0.04558,0.001349,-0.016296,0.006883,-0.051746,0.011897,...,-0.059892,0.024183,-0.005966,0.022143,-0.005425,0.006573,-0.025165,-0.003998,0.006135,-0.016485
14434,Pinot Noir,2013,US,California,0.013345,0.010441,-0.007903,0.00401,-0.054116,0.021753,...,-0.026164,-0.007857,-0.013207,0.018279,0.022725,0.030198,-0.043576,0.027668,-0.036869,0.010956


**Feature Engineering**

In [44]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression

transformer = ColumnTransformer([('num', StandardScaler(), ['year']),
                                 ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), ['country', 'variety', 'location'])
                                ], remainder='passthrough')

transformer

In [45]:
X_train_encoded = transformer.fit_transform(X_train)
print(transformer.get_feature_names_out()[:5])
X_train_encoded.shape

['num__year' 'cat__country_Australia' 'cat__country_Austria'
 'cat__country_Canada' 'cat__country_Chile']


(80430, 1370)

**Define Models to Train and Evaluate**

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from collections import namedtuple

In [None]:
MODEL = namedtuple('MODEL', ['model', 'params'])

models = [
  MODEL(LogisticRegression(max_iter=500), {'max_iter': 500})
]

# Naive bayes
# XGBoost
# Decision Tree
# KNearest Neighbors
# Centroid
# SVM
# Adaboost
# RandomForestClassifier
# Histogram Gradient Boosting



In [52]:
# pefformance data model
PERF = namedtuple('PERF', ['model','params','duration','accuracy','f1','confusion','hammer_loss','precision','recall','roc_auc'])

def evaluate_model(model_tuple: MODEL, X_train=X_train, y_train=y_train) -> PERF:
  model = model_tuple.model
  model_name = type(model).__name__
  # TODO time this
  pipe = Pipeline([('transform', transformer),('fit', model)]).fit(X_train, y_train).fit(X_train, y_train)
  # evaluate permance
  params = None
  duration = None
  accuracy = pipe.score(X_test, y_test)
  f1 = None
  confusion = None
  hammer_loss = None
  precision = None
  recall = None
  roc_auc = None
  return PERF(model_name,params,duration,accuracy,f1,confusion,hammer_loss,precision,recall,roc_auc)


In [53]:
results = pd.DataFrame([evaluate_model(model) for model in models])
results



Unnamed: 0,model,params,duration,accuracy,f1,confusion,hammer_loss,precision,recall,roc_auc
0,LogisticRegression,,,0.221007,,,,,,
