In [64]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Ask the inline backend to render figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default theme to subsequent matplotlib/seaborn plots.
sns.set_theme()

In [82]:
# Column -> dtype map. It doubles as the list of columns loaded from the TSV.
dtypes = {
    # categorical descriptors of the wine's origin and grape
    'country': 'category',
    'province': 'category',
    'variety': 'category',
    'region_1': 'category',
    'region_2': 'category',

    # categorical descriptors of the producer / label
    'winery': 'category',
    'designation': 'category',

    # free-text review
    'description': 'str',

    # regression target
    'quality': 'float',
}
categorical_columns = [k for k, t in dtypes.items() if t == 'category']

# Row count expected after de-duplication and dropping rows without
# country/province; used below as a guard against a changed input file.
EXPECTED_DEV_ROWS = 85025

df = pd.read_csv(
    '../datasets/competition_dataset/dev.tsv',
    sep='\t',
    dtype=dtypes,
    usecols=list(dtypes),  # explicit list rather than a dict_keys view
)

# Remove exact duplicate rows and rows that lack a country or province.
df = df.drop_duplicates().dropna(subset=['country', 'province'])

# Replace remaining missing categorical values with an explicit per-column
# "UNKNOWN_<column>" category so no NaN reaches the encoders.
for col in categorical_columns:
    unk_cat = f"UNKNOWN_{col}"
    df[col] = df[col].cat.add_categories(unk_cat).fillna(unk_cat)

# The check is an equality, so report both counts instead of assuming that
# too many rows were dropped.
assert len(df) == EXPECTED_DEV_ROWS, (
    f"unexpected row count after cleaning: got {len(df)}, "
    f"expected {EXPECTED_DEV_ROWS}"
)

df

Unnamed: 0,country,description,designation,province,region_1,region_2,variety,winery,quality
0,France,"A creamed pear wine, with an attractive tang o...",Brut Blanc de Blancs,Alsace,Crémant d'Alsace,UNKNOWN_region_2,Pinot Blanc,Lucien Albrecht,45.0
1,US,"Simple and dry, this Cabernet has modest black...",UNKNOWN_designation,California,Paso Robles,Central Coast,Cabernet Sauvignon,Castle Rock,31.0
2,US,"This lovely wine captures the floral, perfumed...",UNKNOWN_designation,Oregon,Willamette Valley,Willamette Valley,Gewürztraminer,Château Bianca,35.0
3,Portugal,"The aromas are the thing here, as so often wit...",UNKNOWN_designation,Alentejano,UNKNOWN_region_1,UNKNOWN_region_2,Touriga Nacional,Herdade do Esporão,41.0
4,Italy,"This is an interesting, outright strange wine ...",Natì,Southern Italy,Pompeiano,UNKNOWN_region_2,Coda di Volpe,Sorrentino,37.0
...,...,...,...,...,...,...,...,...,...
120730,France,"Moët's style, with its delicious forward fruit...",Brut,Champagne,Champagne,UNKNOWN_region_2,Champagne Blend,Moët & Chandon,59.0
120732,Portugal,"This is soft, young and fruity, with a dominat...",PV Ruby Port Collections,Port,UNKNOWN_region_1,UNKNOWN_region_2,Port,Barão de Vilar,42.0
120733,US,"Showing ripe peach, pineapple and honeysuckle ...",UNKNOWN_designation,California,Yountville,Napa,Chardonnay,Liparita,46.0
120735,US,A first release from this new Walla Walla wine...,UNKNOWN_designation,Washington,Walla Walla Valley (WA),Columbia Valley,Syrah,Delmas,58.0


In [83]:
from sklearn.model_selection import train_test_split

# Hold out part of the dev set for a final check; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='quality'),
    df['quality'],
    random_state=42,
)

In [84]:
from sklearn.preprocessing import OneHotEncoder

In [85]:
from sklearn.feature_extraction.text import CountVectorizer

In [86]:
from nltk.corpus import stopwords

In [87]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Binary bag-of-words over the review text, with English stop words removed.
text_encoder = CountVectorizer(binary=True, stop_words=stopwords.words('english'))
# One-hot encoding of the categorical columns; categories unseen at fit time
# are ignored at transform time instead of raising.
category_encoder = OneHotEncoder(handle_unknown='ignore')

column_trans = ColumnTransformer(
    transformers=[
        ('text_trans', text_encoder, 'description'),
        ('cat_trans', category_encoder, categorical_columns),
    ],
    remainder='drop',
)

# Sanity check on shapes: the combined matrix, then each encoder on its own.
(
    column_trans.fit_transform(X_train),
    text_encoder.fit_transform(X_train.description),
    category_encoder.fit_transform(X_train[categorical_columns]),
)

(<63768x63822 sparse matrix of type '<class 'numpy.float64'>'
 	with 1992232 stored elements in Compressed Sparse Row format>,
 <63768x25990 sparse matrix of type '<class 'numpy.int64'>'
 	with 1545856 stored elements in Compressed Sparse Row format>,
 <63768x37832 sparse matrix of type '<class 'numpy.float64'>'
 	with 446376 stored elements in Compressed Sparse Row format>)

In [88]:
from sklearn.svm import LinearSVR

In [89]:
# Full model: feature encoding followed by a linear support-vector regressor.
reg = Pipeline(
    steps=[
        ('column_trans', column_trans),
        ('reg', LinearSVR(random_state=42)),
    ]
)

In [90]:
%%time
from sklearn.model_selection import cross_validate
# Cross-validate the whole pipeline on the training split with the default CV
# splitter, scoring each fold by R² and keeping the fitted estimators.
scores = cross_validate(
    reg,
    X_train,
    y_train,
    scoring='r2',
    return_estimator=True,
)
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

Mean R² score: 0.684 ± 0.008
CPU times: user 20.1 s, sys: 17.8 ms, total: 20.2 s
Wall time: 20.2 s


---

In [91]:
# Load the evaluation set with the same columns as the dev set, minus the target.
eval_columns = [k for k in dtypes if k != 'quality']
df_eval = pd.read_csv(
    '../datasets/competition_dataset/eval.tsv',
    sep='\t',
    dtype=dtypes,
    usecols=eval_columns,
)

# Rows are deliberately NOT dropped here. The frame's index is later written
# out as the submission 'Id', so dropping a row would leave that Id without a
# prediction. Missing country/province values are handled by the loop below,
# like every other categorical column.
for col in categorical_columns:
    unk_cat = f"UNKNOWN_{col}"
    df_eval[col] = df_eval[col].cat.add_categories(unk_cat).fillna(unk_cat)
df_eval

Unnamed: 0,country,description,designation,province,region_1,region_2,variety,winery
0,US,Strong blueberry and black-cherry aromas mesh ...,Alliage,California,Santa Cruz Mountains,Central Coast,Cabernet Sauvignon,Byington
1,Germany,"Lush and decadent, with intensely sweet notes ...",Graacher Himmelreich Vat 69 Eiswein,Mosel,UNKNOWN_region_1,UNKNOWN_region_2,Riesling,S.A. Prüm
2,Spain,"Leather, mint and wet-dog aromas along with an...",Cachito Mio,Northern Spain,Toro,UNKNOWN_region_2,Tinta de Toro,La Casa Maguila
3,France,"92-94 Barrel sample. Wood dominant, smooth tan...",Barrel sample,Bordeaux,Saint-Julien,UNKNOWN_region_2,Bordeaux-style Red Blend,Château Branaire-Ducru
4,France,"A lean, austere wine, maybe the result of the ...",Tradition,Southwest France,Cahors,UNKNOWN_region_2,Malbec,Château les Croisille
...,...,...,...,...,...,...,...,...
30181,US,This is very good in a Napa-does-Bordeaux styl...,Blueprint,California,Napa Valley,Napa,Bordeaux-style Red Blend,Lail
30182,US,You might mistake this for a good Dry Creek Zi...,UNKNOWN_designation,California,Dry Creek Valley,Sonoma,Cabernet Franc,Mounts
30183,US,"A Southern Rhône-style blend of Syrah, Grenach...",Cuvee Christie,California,Santa Barbara County,Central Coast,G-S-M,Tercero
30184,South Africa,"Fresh grass, vibrant citrus and tart gooseberr...",Polkadraai,Polkadraai Hills,UNKNOWN_region_1,UNKNOWN_region_2,Chenin Blanc-Sauvignon Blanc,Stellenbosch Hills


In [92]:
# Refit the pipeline on the entire cleaned dev set before predicting on eval.
reg_eval = reg.fit(df.drop(columns='quality'), df['quality'])

In [93]:
# Score on the same data the model was just fitted on (in-sample), so this is
# not a generalisation estimate.
reg.score(df.drop(columns='quality'), df['quality'])

0.8252458611717028

In [94]:
# Predict the quality for every evaluation row and store it alongside the features.
eval_predictions = reg_eval.predict(df_eval)
df_eval['Predicted'] = eval_predictions
df_eval['Predicted']

0        54.307863
1        51.019331
2        43.909276
3        62.537668
4        38.004879
           ...    
30181    57.464930
30182    49.589305
30183    49.260283
30184    29.920190
30185    35.530347
Name: Predicted, Length: 30186, dtype: float64

In [95]:
# Write the submission file: the frame index goes out as 'Id', followed by the
# single 'Predicted' column.
df_eval.to_csv('second.csv', columns=['Predicted'], index_label='Id')