In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%config InlineBackend.figure_format = 'retina'
sns.set_theme()

In [2]:
# Column -> dtype spec for the competition TSV; its keys are also the set of
# columns that get read from disk.
dtypes = {
    'country': 'category',
    'province': 'category',
    'variety': 'category',
    'region_1': 'category',
    'region_2': 'category',

    'description': 'str',

    'quality': 'float'
}
categorical_columns = [k for k, t in dtypes.items() if t == 'category']
df = pd.read_csv('../datasets/competition_dataset/dev.tsv', sep='\t', dtype=dtypes,
                 usecols=list(dtypes))

# Remove exact duplicate rows, then rows that have no country or no province.
df = df.drop_duplicates().dropna(subset=['country', 'province'])

# Any remaining missing categorical value becomes an explicit per-column
# placeholder category ("UNKNOWN_<column>") instead of NaN.
for col in categorical_columns:
    unk_cat = f"UNKNOWN_{col}"
    df[col] = df[col].cat.add_categories(unk_cat).fillna(unk_cat)

# Sanity check on the cleaned row count. The comparison is an equality, so the
# message reports both numbers rather than assuming which direction it is off.
assert len(df) == 85020, f"expected 85020 rows after cleaning, got {len(df)}"

df

Unnamed: 0,country,description,province,region_1,region_2,variety,quality
0,France,"A creamed pear wine, with an attractive tang o...",Alsace,Crémant d'Alsace,UNKNOWN_region_2,Pinot Blanc,45.0
1,US,"Simple and dry, this Cabernet has modest black...",California,Paso Robles,Central Coast,Cabernet Sauvignon,31.0
2,US,"This lovely wine captures the floral, perfumed...",Oregon,Willamette Valley,Willamette Valley,Gewürztraminer,35.0
3,Portugal,"The aromas are the thing here, as so often wit...",Alentejano,UNKNOWN_region_1,UNKNOWN_region_2,Touriga Nacional,41.0
4,Italy,"This is an interesting, outright strange wine ...",Southern Italy,Pompeiano,UNKNOWN_region_2,Coda di Volpe,37.0
...,...,...,...,...,...,...,...
120730,France,"Moët's style, with its delicious forward fruit...",Champagne,Champagne,UNKNOWN_region_2,Champagne Blend,59.0
120732,Portugal,"This is soft, young and fruity, with a dominat...",Port,UNKNOWN_region_1,UNKNOWN_region_2,Port,42.0
120733,US,"Showing ripe peach, pineapple and honeysuckle ...",California,Yountville,Napa,Chardonnay,46.0
120735,US,A first release from this new Walla Walla wine...,Washington,Walla Walla Valley (WA),Columbia Valley,Syrah,58.0


In [3]:
from sklearn.model_selection import train_test_split

# Hold out part of the dev set: every column except 'quality' is a feature,
# 'quality' is the target. The seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='quality'),
    df['quality'],
    random_state=42,
)

In [4]:
from sklearn.preprocessing import OneHotEncoder

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Feature extraction: binary token indicators from the free-text description,
# plus one-hot encoding of the categorical columns. Columns not listed here
# are dropped.
column_trans = ColumnTransformer(
    transformers=[
        ('text_trans', CountVectorizer(binary=True), 'description'),
        ('cat_trans', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='drop',
)

# Shape check: the combined matrix, followed by what each transformer produces
# on its own when fitted directly on the training split.
(
    column_trans.fit_transform(X_train),
    column_trans.transformers[0][1].fit_transform(X_train['description']),
    column_trans.transformers[1][1].fit_transform(X_train[categorical_columns]),
)

(<63765x28328 sparse matrix of type '<class 'numpy.float64'>'
 	with 2498922 stored elements in Compressed Sparse Row format>,
 <63765x26126 sparse matrix of type '<class 'numpy.int64'>'
 	with 2180097 stored elements in Compressed Sparse Row format>,
 <63765x2202 sparse matrix of type '<class 'numpy.float64'>'
 	with 318825 stored elements in Compressed Sparse Row format>)

In [7]:
from sklearn.svm import LinearSVR

In [22]:
from lightgbm.sklearn import LGBMRegressor

In [25]:
# Full model: the column transformer feeding a linear support-vector regressor.
# The step name matches the estimator it holds (it previously said 'lgbm_reg'
# although the estimator is a LinearSVR).
reg = Pipeline([
    ('column_trans', column_trans),
    ('svr_reg', LinearSVR(random_state=42))
])

In [26]:
%%time
from sklearn.model_selection import cross_validate
scores = cross_validate(reg, X_train, y_train, return_estimator=True, scoring='r2')
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

Mean R² score: 0.595 ± 0.004
CPU times: user 21.3 s, sys: 47 ms, total: 21.4 s
Wall time: 21.4 s


---

In [27]:
# Load the evaluation set with the same column/dtype spec as the dev set,
# minus the target column, which this file is not expected to contain.
df_eval = pd.read_csv('../datasets/competition_dataset/eval.tsv', sep='\t', dtype=dtypes,
                      usecols=[k for k in dtypes if k != 'quality'])

# No rows are dropped here: every evaluation row needs a prediction, and
# removing one would leave its Id out of the submission file. Missing
# country/province values get the same placeholder as every other categorical
# column; the one-hot encoder is built with handle_unknown='ignore', so
# categories unseen during fit do not raise at predict time.
for col in categorical_columns:
    unk_cat = f"UNKNOWN_{col}"
    df_eval[col] = df_eval[col].cat.add_categories(unk_cat).fillna(unk_cat)
df_eval

Unnamed: 0,country,description,province,region_1,region_2,variety
0,US,Strong blueberry and black-cherry aromas mesh ...,California,Santa Cruz Mountains,Central Coast,Cabernet Sauvignon
1,Germany,"Lush and decadent, with intensely sweet notes ...",Mosel,UNKNOWN_region_1,UNKNOWN_region_2,Riesling
2,Spain,"Leather, mint and wet-dog aromas along with an...",Northern Spain,Toro,UNKNOWN_region_2,Tinta de Toro
3,France,"92-94 Barrel sample. Wood dominant, smooth tan...",Bordeaux,Saint-Julien,UNKNOWN_region_2,Bordeaux-style Red Blend
4,France,"A lean, austere wine, maybe the result of the ...",Southwest France,Cahors,UNKNOWN_region_2,Malbec
...,...,...,...,...,...,...
30181,US,This is very good in a Napa-does-Bordeaux styl...,California,Napa Valley,Napa,Bordeaux-style Red Blend
30182,US,You might mistake this for a good Dry Creek Zi...,California,Dry Creek Valley,Sonoma,Cabernet Franc
30183,US,"A Southern Rhône-style blend of Syrah, Grenach...",California,Santa Barbara County,Central Coast,G-S-M
30184,South Africa,"Fresh grass, vibrant citrus and tart gooseberr...",Polkadraai Hills,UNKNOWN_region_1,UNKNOWN_region_2,Chenin Blanc-Sauvignon Blanc


In [28]:
# Refit the pipeline on the complete cleaned dev set (train and test splits
# together) before predicting on the evaluation file.
reg_eval = reg.fit(
    df.drop(columns='quality'),
    df['quality'],
)

In [29]:
# Score on the same rows the pipeline was just fitted on: an in-sample
# figure, not a held-out estimate.
reg.score(
    df.drop(columns='quality'),
    df['quality'],
)

0.696848456225017

In [30]:
# Predict with the refitted pipeline and store the result as a new column on
# the evaluation frame, then display that column.
eval_predictions = reg_eval.predict(df_eval)
df_eval['Predicted'] = eval_predictions
df_eval['Predicted']

0        55.962763
1        51.551031
2        46.511946
3        61.825167
4        39.619033
           ...    
30181    53.314600
30182    50.369073
30183    50.847469
30184    28.487339
30185    33.996703
Name: Predicted, Length: 30186, dtype: float64

In [31]:
# Write the predictions to the submission file; the frame's index is written
# under the header 'Id'.
df_eval['Predicted'].to_csv(
    'second.csv',
    index_label='Id',
)