In [None]:
# Install pinned package versions into the current kernel environment with %pip.
# Skip this cell if the environment was already set up with Conda.
%pip install -q soorgeon==0.0.20
%pip install -q soopervisor==0.9.3
%pip install -q transformers==4.48.3
%pip install -q torch==2.6.0
%pip install -q matplotlib==3.10.0
%pip install -q numpy==2.2.2
%pip install -q pandas==2.2.3
%pip install -q ploomber==0.23.3
%pip install -q scikit-learn==1.6.1
%pip install -q xgboost==2.1.4

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import torch
import xgboost as xgb

## Data preprocessing

In [None]:
# Score columns used as model inputs (read from cqi_df and, after renaming, from rev_df).
FEATURE_COLUMNS = ["Aroma", "Flavor", "Aftertaste", "Acidity", "Body"]
# Score columns that are missing from rev_df. These are the columns that we will be predicting.
MISSING_COLUMNS = ["Balance", "Uniformity", "Clean Cup", "Sweetness"]

# cqi_df: CQI data (from web scraping); rev_df: review data (from database)
cqi_df, rev_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/cqi_5_23.csv', 'data/rev_5_23.csv')
)

In [None]:
# Rename the lower-case review columns 'aroma', 'acid', 'body', 'flavor', 'aftertaste'
# to the capitalized names used for the model features:
# Aroma, Acidity, Body, Flavor, Aftertaste.
# The frame is reassigned rather than mutated with inplace=True, so the cell
# produces its result from its input and can be chained or re-run safely.
rev_df = rev_df.rename(
    columns={
        "aroma": "Aroma",
        "acid": "Acidity",
        "body": "Body",
        "flavor": "Flavor",
        "aftertaste": "Aftertaste",
    }
)

# Display the resulting column index to confirm the rename.
rev_df.columns

## Model fitting

In [None]:
# Fit a multi-output regressor on the CQI data that maps the feature scores
# (FEATURE_COLUMNS) to the scores the review data lacks (MISSING_COLUMNS).
# The shared constants are used instead of repeating the literal lists so that
# the training columns cannot drift from the ones used at prediction time.
X = cqi_df[FEATURE_COLUMNS]
y = cqi_df[MISSING_COLUMNS]

# XGBoost; "one_output_per_tree" builds separate trees for each target column.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    multi_strategy="one_output_per_tree",
)
# No train-test split here: the model is fit on every CQI row.
xgb_model.fit(X, y)

## Missing value prediction

In [None]:
# Predict missing values: fill the MISSING_COLUMNS of rev_df with the outputs of
# the model fitted on the CQI data. FEATURE_COLUMNS and MISSING_COLUMNS are the
# constants declared in the data-loading cell; they are reused here (instead of
# being re-declared) so the column order matches the one the model was fitted with.
rev_df[MISSING_COLUMNS] = xgb_model.predict(rev_df[FEATURE_COLUMNS])

## Text embedding

In [None]:
# Embedding model: identifier passed to AutoTokenizer / AutoModel.from_pretrained below
MODEL_NAME = "TaylorAI/gte-tiny"

def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked positions.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings.
        attention_mask: Tensor that is 1 for tokens to include and 0 for
            tokens to ignore; it is broadcast over the embedding dimension.

    Returns:
        Tensor with the masked mean of the token embeddings taken over
        dimension 1.
    """
    hidden_states = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' shape.
    mask = attention_mask.unsqueeze(-1).expand_as(hidden_states).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_count

# Load the tokenizer and the model registered under MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


In [None]:
description_columns = ['desc_1', 'desc_2', 'desc_3']


def _embed_texts_cpu(texts):
    """Tokenize `texts` as a single padded batch and return mean-pooled embeddings."""
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():  # inference only, no gradients needed
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input['attention_mask'])


embeddings = []
for desc_col in description_columns:
    # Missing descriptions are replaced by empty strings (written back to rev_df).
    rev_df[desc_col] = rev_df[desc_col].fillna('')
    embeddings.append(_embed_texts_cpu(rev_df[desc_col].to_list()))

# Place the per-column embeddings side by side, then keep as many principal
# components as needed to explain 67% of the variance.
stacked_embeddings = torch.hstack(embeddings)
embeddings_reduced = PCA(n_components=0.67).fit_transform(stacked_embeddings)


In [None]:
# Same embedding pipeline as the previous cell, but run on the GPU when one is
# available (falls back to the CPU otherwise).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

description_columns = ['desc_1', 'desc_2', 'desc_3']


def _embed_texts_on_device(texts):
    """Tokenize `texts` as one padded batch, run the model on `device`, mean-pool."""
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    # Move every input tensor to the device the model lives on.
    encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}
    with torch.no_grad():  # inference only, no gradients needed
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input['attention_mask'])


embeddings = []
for desc_col in description_columns:
    # Missing descriptions are replaced by empty strings (written back to rev_df).
    rev_df[desc_col] = rev_df[desc_col].fillna('')
    embeddings.append(_embed_texts_on_device(rev_df[desc_col].to_list()))

# Bring the stacked embeddings back to the CPU before handing them to scikit-learn,
# then keep as many principal components as needed to explain 67% of the variance.
stacked_embeddings = torch.hstack(embeddings).to("cpu")
embeddings_reduced = PCA(n_components=0.67).fit_transform(stacked_embeddings)


## Feature combination

In [None]:
# Combine all features: the score columns of rev_df (FEATURE_COLUMNS followed by
# the predicted MISSING_COLUMNS) next to the PCA-reduced text embeddings.
# The column lists come from the constants declared in the data-loading cell;
# they are not re-declared here so there is a single definition to maintain.
score_matrix = rev_df[FEATURE_COLUMNS + MISSING_COLUMNS].to_numpy()
shared_ls = np.hstack([score_matrix, embeddings_reduced])

## Downstream application: rating prediction

In [None]:
# Predict rating from the combined feature matrix.
X = shared_ls
y = rev_df['rating']

# Train-test split: hold out 20% of the rows. The split is seeded so that the
# held-out set, and therefore the score reported below, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost
# NOTE: this rebinds `xgb_model` (and `X`, `y`), replacing the model fitted
# earlier for missing-value prediction; re-run the notebook top to bottom
# rather than re-executing earlier cells after this one.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out rows (for a regressor, `score` is the R^2 coefficient).
xgb_model.score(X_test, y_test)
