In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%config InlineBackend.figure_format = 'retina'
sns.set_theme()

In [4]:
dtypes = {
    'country': 'category',
    'province': 'category',
    'variety': 'category',
    'region_1': 'category',
    'region_2': 'category',
    
    'winery': 'category',
    'designation': 'category',
    
    'description': 'str',

    'quality': 'float'
}
categorical_columns = [k for k,t in dtypes.items() if t=='category']
df = pd.read_csv('../datasets/competition_dataset/dev.tsv', sep='\t', dtype=dtypes,
                 usecols=dtypes.keys())

df = df.drop_duplicates().dropna(subset=['country', 'province'])

for col in categorical_columns:
    unk_cat = f"UNKNOWN_{col}"
    df[col] = df[col].cat.add_categories(unk_cat).fillna(unk_cat)

assert len(df) == 85025, "we droppet more that we should have"

df.head()

Unnamed: 0,country,description,designation,province,region_1,region_2,variety,winery,quality
0,France,"A creamed pear wine, with an attractive tang o...",Brut Blanc de Blancs,Alsace,Crémant d'Alsace,UNKNOWN_region_2,Pinot Blanc,Lucien Albrecht,45.0
1,US,"Simple and dry, this Cabernet has modest black...",UNKNOWN_designation,California,Paso Robles,Central Coast,Cabernet Sauvignon,Castle Rock,31.0
2,US,"This lovely wine captures the floral, perfumed...",UNKNOWN_designation,Oregon,Willamette Valley,Willamette Valley,Gewürztraminer,Château Bianca,35.0
3,Portugal,"The aromas are the thing here, as so often wit...",UNKNOWN_designation,Alentejano,UNKNOWN_region_1,UNKNOWN_region_2,Touriga Nacional,Herdade do Esporão,41.0
4,Italy,"This is an interesting, outright strange wine ...",Natì,Southern Italy,Pompeiano,UNKNOWN_region_2,Coda di Volpe,Sorrentino,37.0


---

In [5]:
from gensim.sklearn_api import W2VTransformer

In [48]:
from nltk import word_tokenize

In [49]:
%%time
wine_corpus = df.description.append(df_eval.description).map(lambda s: word_tokenize(s.lower())).tolist()

CPU times: user 53.3 s, sys: 423 ms, total: 53.8 s
Wall time: 53.8 s


In [50]:
%%time
w2v_trans = W2VTransformer(size=300, min_count=1, seed=1).fit(wine_corpus)

CPU times: user 1min 9s, sys: 596 ms, total: 1min 10s
Wall time: 30.5 s


In [51]:
%%time
# TODO(Andrea): This is terrible, handle out-of-vocabulary words the correct way    
df['wv'] = df.description.map(lambda s: word_tokenize(s.lower())).map(w2v_trans.transform).map(lambda v: v.sum(axis=0))

CPU times: user 50.7 s, sys: 166 ms, total: 50.9 s
Wall time: 50.9 s


In [52]:
w2v_trans.gensim_model.most_similar('acid')

  """Entry point for launching an IPython kernel.


[('acids', 0.6941390037536621),
 ('acidity', 0.6646442413330078),
 ('acidic', 0.5058441162109375),
 ('proportion', 0.4843388795852661),
 ('grip', 0.48192131519317627),
 ('octane', 0.47412410378456116),
 ('lemony-clean', 0.4724220037460327),
 ('alcohol', 0.4711449146270752),
 ('toned', 0.4631137251853943),
 ('crispness', 0.4410700798034668)]

---

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop('quality', axis=1), df.quality, random_state=42)

In [54]:
from sklearn.preprocessing import OneHotEncoder

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
from nltk.corpus import stopwords

In [57]:
from sklearn.preprocessing import FunctionTransformer

In [58]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
column_trans = ColumnTransformer([
        ('text_trans', TfidfVectorizer(stop_words=stopwords.words('english')), 'description'),
        ('w2v_trans', FunctionTransformer(func=lambda X: np.stack(X.map(lambda x: x / 100)), check_inverse=False), 'wv'),
        ('cat_trans', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='drop'
)

# column_trans.fit_transform(X_train.head()).shape, \
# column_trans.transformers[0][1].fit_transform(X_train.description.head()).shape, \
# column_trans.transformers[1][1].fit_transform(X_train.wv.head()).shape, \
# column_trans.transformers[2][1].fit_transform(X_train[categorical_columns].head()).shape

CPU times: user 749 µs, sys: 79 µs, total: 828 µs
Wall time: 504 µs


In [59]:
from sklearn.compose import TransformedTargetRegressor

In [60]:
from sklearn.linear_model import Ridge

In [61]:
from sklearn.preprocessing import StandardScaler

In [62]:
from sklearn.model_selection import cross_validate

reg = Pipeline(
    steps=[
        ('column_trans', column_trans),
        ('reg', Ridge(random_state=42))
    ]
)

In [63]:
scores = cross_validate(reg, X_train, y_train, return_estimator=True, scoring='r2', n_jobs=-1, verbose=True)
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Mean R² score: 0.740 ± 0.010


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   54.7s finished


---

In [64]:
df_eval = pd.read_csv('../datasets/competition_dataset/eval.tsv', sep='\t', dtype=dtypes, usecols=[k for k in dtypes.keys() if k != 'quality'])

df_eval = df_eval.dropna(subset=['country', 'province'])

df_eval['wv'] = df_eval.description.map(lambda s: word_tokenize(s.lower())).map(w2v_trans.transform).map(lambda v: v.sum(axis=0))

for col in categorical_columns:
    unk_cat = f"UNKNOWN_{col}"
    df_eval[col] = df_eval[col].cat.add_categories(unk_cat).fillna(unk_cat)
df_eval

Unnamed: 0,country,description,designation,province,region_1,region_2,variety,winery,wv
0,US,Strong blueberry and black-cherry aromas mesh ...,Alliage,California,Santa Cruz Mountains,Central Coast,Cabernet Sauvignon,Byington,"[8.146894, 4.923196, -13.609875, -25.412968, -..."
1,Germany,"Lush and decadent, with intensely sweet notes ...",Graacher Himmelreich Vat 69 Eiswein,Mosel,UNKNOWN_region_1,UNKNOWN_region_2,Riesling,S.A. Prüm,"[-2.2142732, 11.474023, -9.733415, -25.079437,..."
2,Spain,"Leather, mint and wet-dog aromas along with an...",Cachito Mio,Northern Spain,Toro,UNKNOWN_region_2,Tinta de Toro,La Casa Maguila,"[9.008783, 5.7527194, -14.74435, -32.200657, 1..."
3,France,"92-94 Barrel sample. Wood dominant, smooth tan...",Barrel sample,Bordeaux,Saint-Julien,UNKNOWN_region_2,Bordeaux-style Red Blend,Château Branaire-Ducru,"[-4.668662, -2.1214767, -1.6580575, -11.93123,..."
4,France,"A lean, austere wine, maybe the result of the ...",Tradition,Southwest France,Cahors,UNKNOWN_region_2,Malbec,Château les Croisille,"[6.66781, -1.803279, 0.09208423, -21.971687, 8..."
...,...,...,...,...,...,...,...,...,...
30181,US,This is very good in a Napa-does-Bordeaux styl...,Blueprint,California,Napa Valley,Napa,Bordeaux-style Red Blend,Lail,"[-0.7932334, -3.2884083, -3.981152, -12.333408..."
30182,US,You might mistake this for a good Dry Creek Zi...,UNKNOWN_designation,California,Dry Creek Valley,Sonoma,Cabernet Franc,Mounts,"[-1.826087, 8.81035, -4.528069, -31.21045, 9.4..."
30183,US,"A Southern Rhône-style blend of Syrah, Grenach...",Cuvee Christie,California,Santa Barbara County,Central Coast,G-S-M,Tercero,"[-11.101473, -0.344439, -8.516296, -30.760334,..."
30184,South Africa,"Fresh grass, vibrant citrus and tart gooseberr...",Polkadraai,Polkadraai Hills,UNKNOWN_region_1,UNKNOWN_region_2,Chenin Blanc-Sauvignon Blanc,Stellenbosch Hills,"[-5.795786, 0.09900263, -6.9640703, -23.07708,..."


In [65]:
reg.fit(X_train, y_train).score(X_test, y_test)

0.7505447771190676

In [66]:
reg_eval = reg.fit(df.drop('quality', axis=1), df.quality)
df_eval['Predicted'] = reg_eval.predict(df_eval)
df_eval['Predicted']

0        56.453624
1        55.724543
2        40.932420
3        61.966654
4        38.261480
           ...    
30181    59.898555
30182    49.528360
30183    49.437818
30184    28.792577
30185    36.115866
Name: Predicted, Length: 30186, dtype: float64

In [67]:
df_eval['Predicted'].to_csv('second.csv',index_label='Id')