In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Ask the inline backend for 'retina' (high-DPI) figure output.
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set_theme()

In [2]:
# Column name -> dtype for every column read from the competition TSV files.
dtypes = dict(
    # Categorical descriptors
    country='category',
    province='category',
    variety='category',
    region_1='category',
    region_2='category',
    # Free-text description
    description='str',
    # Regression target
    quality='float',
)

# Names of the categorical columns, in declaration order.
categorical_columns = [name for name, kind in dtypes.items() if kind == 'category']

# Read only the declared columns, then discard exact duplicate rows.
df = pd.read_csv(
    '../datasets/competition_dataset/dev.tsv',
    sep='\t',
    dtype=dtypes,
    usecols=dtypes.keys(),
).drop_duplicates()

df

Unnamed: 0,country,description,province,region_1,region_2,variety,quality
0,France,"A creamed pear wine, with an attractive tang o...",Alsace,Crémant d'Alsace,,Pinot Blanc,45.0
1,US,"Simple and dry, this Cabernet has modest black...",California,Paso Robles,Central Coast,Cabernet Sauvignon,31.0
2,US,"This lovely wine captures the floral, perfumed...",Oregon,Willamette Valley,Willamette Valley,Gewürztraminer,35.0
3,Portugal,"The aromas are the thing here, as so often wit...",Alentejano,,,Touriga Nacional,41.0
4,Italy,"This is an interesting, outright strange wine ...",Southern Italy,Pompeiano,,Coda di Volpe,37.0
...,...,...,...,...,...,...,...
120730,France,"Moët's style, with its delicious forward fruit...",Champagne,Champagne,,Champagne Blend,59.0
120732,Portugal,"This is soft, young and fruity, with a dominat...",Port,,,Port,42.0
120733,US,"Showing ripe peach, pineapple and honeysuckle ...",California,Yountville,Napa,Chardonnay,46.0
120735,US,A first release from this new Walla Walla wine...,Washington,Walla Walla Valley (WA),Columbia Valley,Syrah,58.0


In [3]:
from sklearn.model_selection import train_test_split

# Hold out part of the development set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='quality'),
    df['quality'],
    random_state=42,
)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVR

# Bag-of-words features built from `description` only; binary=True records
# token presence/absence instead of counts. The column is passed as a bare
# string (not a list) so the vectorizer receives a 1-D sequence of documents.
text_trans = ColumnTransformer(
    transformers=[
        ('description_text', CountVectorizer(binary=True), 'description'),
    ],
    remainder='drop',
)

# Linear support-vector regression on top of the bag-of-words features.
text_reg = Pipeline(steps=[
    ('text_trans', text_trans),
    ('text_reg', LinearSVR(random_state=42)),
])

In [5]:
from sklearn.model_selection import cross_validate
# Cross-validated R² of the text-only model, computed on the training split.
scores = cross_validate(
    text_reg,
    X_train,
    y_train,
    scoring='r2',
    return_estimator=True,
)
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

Mean R² score: 0.447 ± 0.004


In [6]:
import lightgbm as lgb

from sklearn.preprocessing import FunctionTransformer

def recast_to_dataframe(i):
    """
    Rebuild a pandas DataFrame of categorical columns from array-like input.

    scikit-learn transformers hand their output on as plain ndarrays, whereas
    LightGBM's automatic categorical handling relies on pandas `category`
    columns. The input's columns are labelled with `categorical_columns`
    and every one of them is cast to the `category` dtype.
    """
    frame = pd.DataFrame(i, columns=categorical_columns)
    return frame.astype(dict.fromkeys(categorical_columns, 'category'))

# Forward only the categorical columns, unchanged, and drop everything else.
cat_trans = ColumnTransformer(
    transformers=[
        ('lgbm_reg', 'passthrough', categorical_columns),
    ],
    remainder='drop',
)

# Select categorical columns -> rebuild a categorical DataFrame -> LightGBM regressor.
cat_reg = Pipeline(steps=[
    ('cat_trans', cat_trans),
    ('ft', FunctionTransformer(func=recast_to_dataframe)),
    ('cat_reg', lgb.sklearn.LGBMRegressor(random_state=42)),
])

In [7]:
# Cross-validated R² of the categorical-only model, computed on the training split.
scores = cross_validate(
    cat_reg,
    X_train,
    y_train,
    scoring='r2',
    return_estimator=True,
)
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

Mean R² score: 0.502 ± 0.006


---

In [8]:
from sklearn.ensemble import VotingRegressor

In [9]:
# Ensemble that combines the text-based and the categorical-based regressors.
ereg = VotingRegressor(
    estimators=[
        ('text', text_reg),
        ('cat', cat_reg),
    ],
)

In [10]:
# Cross-validated R² of the combined ensemble, computed on the training split.
scores = cross_validate(
    ereg,
    X_train,
    y_train,
    scoring='r2',
    return_estimator=True,
)
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

Mean R² score: 0.581 ± 0.002


---

In [11]:
# Load every declared column except the target `quality` from the evaluation file.
df_eval = pd.read_csv(
    '../datasets/competition_dataset/eval.tsv',
    sep='\t',
    dtype=dtypes,
    usecols=[name for name in dtypes if name != 'quality'],
)
df_eval

Unnamed: 0,country,description,province,region_1,region_2,variety
0,US,Strong blueberry and black-cherry aromas mesh ...,California,Santa Cruz Mountains,Central Coast,Cabernet Sauvignon
1,Germany,"Lush and decadent, with intensely sweet notes ...",Mosel,,,Riesling
2,Spain,"Leather, mint and wet-dog aromas along with an...",Northern Spain,Toro,,Tinta de Toro
3,France,"92-94 Barrel sample. Wood dominant, smooth tan...",Bordeaux,Saint-Julien,,Bordeaux-style Red Blend
4,France,"A lean, austere wine, maybe the result of the ...",Southwest France,Cahors,,Malbec
...,...,...,...,...,...,...
30181,US,This is very good in a Napa-does-Bordeaux styl...,California,Napa Valley,Napa,Bordeaux-style Red Blend
30182,US,You might mistake this for a good Dry Creek Zi...,California,Dry Creek Valley,Sonoma,Cabernet Franc
30183,US,"A Southern Rhône-style blend of Syrah, Grenach...",California,Santa Barbara County,Central Coast,G-S-M
30184,South Africa,"Fresh grass, vibrant citrus and tart gooseberr...",Polkadraai Hills,,,Chenin Blanc-Sauvignon Blanc


In [12]:
# Refit the ensemble on the whole de-duplicated development set.
# `fit` returns the estimator itself, so `ereg_eval` and `ereg` name the same object.
ereg_eval = ereg.fit(df.drop(columns='quality'), df['quality'])

In [13]:
# R² on the very data the ensemble was just fitted on: an in-sample,
# optimistic figure rather than an estimate of generalisation.
ereg.score(df.drop(columns='quality'), df['quality'])

0.6403832822720104

In [14]:
# Predict from the feature columns only. Dropping any existing `Predicted`
# column (a no-op on the first run) keeps this cell idempotent: when it is
# re-executed, the model still receives exactly the columns it was fitted on
# instead of a frame that already carries the previous predictions.
df_eval['Predicted'] = ereg_eval.predict(
    df_eval.drop(columns='Predicted', errors='ignore')
)
df_eval['Predicted']

0        55.445311
1        48.750638
2        44.458576
3        61.342019
4        45.432356
           ...    
30181    56.146390
30182    50.049736
30183    51.096368
30184    36.790033
30185    35.494700
Name: Predicted, Length: 30186, dtype: float64

In [15]:
# Write the submission file: the row index is exported under the header `Id`,
# next to the `Predicted` column.
df_eval['Predicted'].to_csv(
    'second.csv',
    index_label='Id',
)