In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Ask the inline matplotlib backend to emit figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default plotting theme.
sns.set_theme()

In [2]:
# Load the development set, keeping only the modelling columns, then remove
# exact duplicate rows and any row with a missing value in one chained pass.
df = (
    pd.read_csv(
        '../datasets/competition_dataset/dev.tsv',
        sep='\t',
        usecols=['country', 'province', 'variety', 'description', 'quality'],
        dtype={
            'country': 'category',
            'province': 'category',
            'variety': 'category',
            'description': 'str',
            'quality': 'float',
        },
    )
    .drop_duplicates()
    .dropna()
)

df

Unnamed: 0,country,description,province,variety,quality
0,France,"A creamed pear wine, with an attractive tang o...",Alsace,Pinot Blanc,45.0
1,US,"Simple and dry, this Cabernet has modest black...",California,Cabernet Sauvignon,31.0
2,US,"This lovely wine captures the floral, perfumed...",Oregon,Gewürztraminer,35.0
3,Portugal,"The aromas are the thing here, as so often wit...",Alentejano,Touriga Nacional,41.0
4,Italy,"This is an interesting, outright strange wine ...",Southern Italy,Coda di Volpe,37.0
...,...,...,...,...,...
120730,France,"Moët's style, with its delicious forward fruit...",Champagne,Champagne Blend,59.0
120732,Portugal,"This is soft, young and fruity, with a dominat...",Port,Port,42.0
120733,US,"Showing ripe peach, pineapple and honeysuckle ...",California,Chardonnay,46.0
120735,US,A first release from this new Walla Walla wine...,Washington,Syrah,58.0


In [3]:
from sklearn.model_selection import train_test_split

# Hold out part of the development data (scikit-learn's default proportions)
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[['country', 'province', 'variety', 'description']],
    df.quality,
    random_state=42,
)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVR

# Binary bag-of-words over the review text; every other column is dropped.
text_trans = ColumnTransformer(
    transformers=[
        ('description_text', CountVectorizer(binary=True), 'description'),
    ],
    remainder='drop',
)

# Linear support-vector regressor fitted on the vectorised descriptions.
text_reg = Pipeline(
    steps=[
        ('text_trans', text_trans),
        ('text_reg', LinearSVR(random_state=42)),
    ]
)

from sklearn.model_selection import cross_validate
# Cross-validate the text-only pipeline on the training split, scoring by R²
# and keeping the fitted estimator of every fold.
scores = cross_validate(
    text_reg,
    X_train,
    y_train,
    scoring='r2',
    return_estimator=True,
)
scores

{'fit_time': array([3.06798673, 2.94971895, 3.01315403, 3.10678291, 3.05091596]),
 'score_time': array([0.38584256, 0.37674284, 0.40413952, 0.40283298, 0.40266943]),
 'estimator': [Pipeline(steps=[('text_trans',
                   ColumnTransformer(transformers=[('description_text',
                                                    CountVectorizer(binary=True),
                                                    'description')])),
                  ('text_reg', LinearSVR(random_state=42))]),
  Pipeline(steps=[('text_trans',
                   ColumnTransformer(transformers=[('description_text',
                                                    CountVectorizer(binary=True),
                                                    'description')])),
                  ('text_reg', LinearSVR(random_state=42))]),
  Pipeline(steps=[('text_trans',
                   ColumnTransformer(transformers=[('description_text',
                                                    CountVectorizer(binary=T

In [5]:
# Summarise the cross-validation run currently stored in `scores`:
# mean and standard deviation of the per-fold R² test scores.
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

Mean R² score: 0.444 ± 0.002


In [6]:
from sklearn.linear_model import LinearRegression

In [8]:
import lightgbm as lgb

from sklearn.preprocessing import FunctionTransformer

def recast_to_dataframe(i, columns=('country', 'province', 'variety')):
    """
    Rebuild a pandas DataFrame with categorical columns from array-like data.

    scikit-learn transformers pass their output on as plain ndarrays, whereas
    we want a DataFrame whose columns have the ``category`` dtype in order to
    take advantage of LightGBM's automatic categorical handling.

    Parameters
    ----------
    i : array-like
        Two-dimensional data with one column per entry in ``columns``
        (for example the output of a ``ColumnTransformer`` passthrough).
    columns : sequence of str, optional
        Names to assign to the columns, in order. Defaults to
        ``('country', 'province', 'variety')``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the given column names, every column cast to
        ``category``. Categories are inferred per column from the values
        present in ``i``.
    """
    # Use a local name that does not shadow the notebook-level `df`.
    frame = pd.DataFrame(i, columns=list(columns))
    # Cast every column in one call instead of repeating the cast per column.
    return frame.astype({name: 'category' for name in frame.columns})

# Select the three categorical columns unchanged; everything else is dropped.
cat_trans = ColumnTransformer(
    transformers=[
        ('lgbm_reg', 'passthrough', ['country', 'province', 'variety']),
    ],
    remainder='drop',
)

cat_reg = Pipeline(
    steps=[
        ('cat_trans', cat_trans),
        # Re-wrap the selected columns as a categorical DataFrame for LightGBM.
        ('ft', FunctionTransformer(recast_to_dataframe)),
        ('cat_reg', lgb.sklearn.LGBMRegressor(random_state=42)),
    ]
)

# Cross-validate the categorical-only pipeline on the training split, scoring
# by R² and keeping the fitted estimator of every fold.
scores = cross_validate(
    cat_reg,
    X_train,
    y_train,
    scoring='r2',
    return_estimator=True,
)
scores

{'fit_time': array([0.20738673, 0.20652461, 0.23117685, 0.19712734, 0.1928699 ]),
 'score_time': array([0.06114721, 0.05780506, 0.06314898, 0.05527067, 0.05864882]),
 'estimator': [Pipeline(steps=[('cat_trans',
                   ColumnTransformer(transformers=[('lgbm_reg', 'passthrough',
                                                    ['country', 'province',
                                                     'variety'])])),
                  ('ft',
                   FunctionTransformer(func=<function recast_to_dataframe at 0x7fd456b51950>)),
                  ('cat_reg', LGBMRegressor(random_state=42))]),
  Pipeline(steps=[('cat_trans',
                   ColumnTransformer(transformers=[('lgbm_reg', 'passthrough',
                                                    ['country', 'province',
                                                     'variety'])])),
                  ('ft',
                   FunctionTransformer(func=<function recast_to_dataframe at 0x7fd456b51950>)),
  

In [10]:
# Summarise the cross-validation run currently stored in `scores`:
# mean and standard deviation of the per-fold R² test scores.
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

Mean R² score: 0.352 ± 0.006


---

In [11]:
from sklearn.ensemble import VotingRegressor

In [16]:
# Combine the text pipeline and the categorical pipeline in one voting ensemble.
ereg = VotingRegressor(
    estimators=[
        ('text', text_reg),
        ('cat', cat_reg),
    ]
)

In [17]:
# Cross-validate the voting ensemble on the training split and report the
# mean ± standard deviation of the per-fold R² test scores.
# (A bare `scores` expression ahead of the print was removed: only the last
# expression of a cell is echoed, so it displayed nothing.)
scores = cross_validate(
    ereg,
    X_train,
    y_train,
    return_estimator=True,
    scoring='r2',
)
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

Mean R² score: 0.506 ± 0.003


In [18]:
# Display the full cross-validation result dictionary currently held in
# `scores` (timings, fitted estimators and test scores).
scores

{'fit_time': array([3.37833261, 3.18913198, 3.13237405, 3.25398684, 3.22822571]),
 'score_time': array([0.46169353, 0.45339704, 0.44248343, 0.45214295, 0.45767045]),
 'estimator': [VotingRegressor(estimators=[('text',
                               Pipeline(steps=[('text_trans',
                                                ColumnTransformer(transformers=[('description_text',
                                                                                 CountVectorizer(binary=True),
                                                                                 'description')])),
                                               ('text_reg',
                                                LinearSVR(random_state=42))])),
                              ('cat',
                               Pipeline(steps=[('cat_trans',
                                                ColumnTransformer(transformers=[('lgbm_reg',
                                                                            

---

In [30]:
# Load the evaluation set with the same feature columns and dtypes as the
# development set (there is no `quality` column to read here).
df_eval = pd.read_csv(
    '../datasets/competition_dataset/eval.tsv',
    sep='\t',
    usecols=['country', 'province', 'variety', 'description'],
    dtype={
        'country': 'category',
        'province': 'category',
        'variety': 'category',
        'description': 'str',
    },
)

# NOTE: unlike the development set, no drop_duplicates()/dropna() is applied
# here, so the row count is left unchanged -- presumably because every
# evaluation row needs a prediction (confirm against the submission format).

df_eval

Unnamed: 0,country,description,province,variety
0,US,Strong blueberry and black-cherry aromas mesh ...,California,Cabernet Sauvignon
1,Germany,"Lush and decadent, with intensely sweet notes ...",Mosel,Riesling
2,Spain,"Leather, mint and wet-dog aromas along with an...",Northern Spain,Tinta de Toro
3,France,"92-94 Barrel sample. Wood dominant, smooth tan...",Bordeaux,Bordeaux-style Red Blend
4,France,"A lean, austere wine, maybe the result of the ...",Southwest France,Malbec
...,...,...,...,...
30181,US,This is very good in a Napa-does-Bordeaux styl...,California,Bordeaux-style Red Blend
30182,US,You might mistake this for a good Dry Creek Zi...,California,Cabernet Franc
30183,US,"A Southern Rhône-style blend of Syrah, Grenach...",California,G-S-M
30184,South Africa,"Fresh grass, vibrant citrus and tart gooseberr...",Polkadraai Hills,Chenin Blanc-Sauvignon Blanc


In [32]:
# Refit the ensemble on the whole cleaned development set (not just the
# training split) before predicting on the evaluation data.
ereg_eval = ereg.fit(
    df[['country', 'province', 'variety', 'description']],
    df.quality,
)

In [55]:
# Score the ensemble on the development data itself. This is the same frame
# that is passed to `ereg.fit`, so treat it as an in-sample figure.
ereg.score(
    df[['country', 'province', 'variety', 'description']],
    df.quality,
)

0.5644612074583147

In [52]:
# Predict quality for every evaluation row. The feature columns are selected
# explicitly so the cell still works once 'Predicted' has been added.
df_eval['Predicted'] = ereg_eval.predict(
    df_eval[['country', 'province', 'variety', 'description']]
)
df_eval['Predicted']

0        54.610976
1        48.784152
2        44.493817
3        56.156314
4        44.939472
           ...    
30181    54.481520
30182    49.985082
30183    50.247771
30184    36.613738
30185    33.635470
Name: Predicted, Length: 30186, dtype: float64

In [54]:
# Write the predictions to disk, labelling the index column 'Id'.
df_eval['Predicted'].to_csv(
    'first.csv',
    index_label='Id',
)