In [64]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Render inline figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default theme to the plots created from here on.
sns.set_theme()

In [65]:
# Column schema for the competition TSV files: the pandas dtype to use for every
# column that is read. Columns not listed here are not loaded.
dtypes = {
    'country': 'category',
    'province': 'category',
    'variety': 'category',
    'region_1': 'category',
    'region_2': 'category',

    'winery': 'category',

    'description': 'str',

    'quality': 'float'
}
categorical_columns = [k for k, t in dtypes.items() if t == 'category']

# Row count expected after cleaning the development set. Guards against a changed
# input file or an accidental change to the cleaning steps below.
EXPECTED_DEV_ROWS = 85024

df = pd.read_csv('../datasets/competition_dataset/dev.tsv', sep='\t', dtype=dtypes,
                 usecols=list(dtypes))

# Remove exact duplicate rows, then rows lacking a country or a province.
df = df.drop_duplicates().dropna(subset=['country', 'province'])

# Replace the remaining missing categorical values with an explicit per-column
# "UNKNOWN_<column>" category so that no NaN is left in these columns.
for col in categorical_columns:
    unk_cat = f"UNKNOWN_{col}"
    df[col] = df[col].cat.add_categories(unk_cat).fillna(unk_cat)

# The check is an equality, so it fires whether too many or too few rows were
# removed; the message reports both numbers to make the failure diagnosable.
assert len(df) == EXPECTED_DEV_ROWS, (
    f"expected {EXPECTED_DEV_ROWS} rows after cleaning, got {len(df)}"
)

df

Unnamed: 0,country,description,province,region_1,region_2,variety,winery,quality
0,France,"A creamed pear wine, with an attractive tang o...",Alsace,Crémant d'Alsace,UNKNOWN_region_2,Pinot Blanc,Lucien Albrecht,45.0
1,US,"Simple and dry, this Cabernet has modest black...",California,Paso Robles,Central Coast,Cabernet Sauvignon,Castle Rock,31.0
2,US,"This lovely wine captures the floral, perfumed...",Oregon,Willamette Valley,Willamette Valley,Gewürztraminer,Château Bianca,35.0
3,Portugal,"The aromas are the thing here, as so often wit...",Alentejano,UNKNOWN_region_1,UNKNOWN_region_2,Touriga Nacional,Herdade do Esporão,41.0
4,Italy,"This is an interesting, outright strange wine ...",Southern Italy,Pompeiano,UNKNOWN_region_2,Coda di Volpe,Sorrentino,37.0
...,...,...,...,...,...,...,...,...
120730,France,"Moët's style, with its delicious forward fruit...",Champagne,Champagne,UNKNOWN_region_2,Champagne Blend,Moët & Chandon,59.0
120732,Portugal,"This is soft, young and fruity, with a dominat...",Port,UNKNOWN_region_1,UNKNOWN_region_2,Port,Barão de Vilar,42.0
120733,US,"Showing ripe peach, pineapple and honeysuckle ...",California,Yountville,Napa,Chardonnay,Liparita,46.0
120735,US,A first release from this new Walla Walla wine...,Washington,Walla Walla Valley (WA),Columbia Valley,Syrah,Delmas,58.0


In [66]:
from sklearn.model_selection import train_test_split

# Split features and target into train and test parts (no explicit test size is
# given, so scikit-learn's default applies); the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='quality'),
    df['quality'],
    random_state=42,
)

In [67]:
from sklearn.preprocessing import OneHotEncoder

In [68]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
from nltk.corpus import stopwords

In [70]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Feature extraction:
#   * 'description' -> binary bag-of-words (word present / absent) with NLTK's
#     English stop words removed;
#   * categorical columns -> one-hot vectors; categories unseen during fit are
#     ignored instead of raising (handle_unknown='ignore').
# Every other column is dropped.
column_trans = ColumnTransformer([
        ('text_trans', CountVectorizer(binary=True, stop_words=stopwords.words('english')), 'description'),
        ('cat_trans', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='drop'
)

# Sanity check of the feature-matrix shapes: combined, text only, categorical only.
# Tuple elements are evaluated left to right, so the first element fits the whole
# transformer and the other two reuse its fitted parts (named_transformers_).
# Nothing is fitted a second time, and the unfitted template estimators stored in
# `column_trans.transformers` are left untouched.
(
    column_trans.fit_transform(X_train),
    column_trans.named_transformers_['text_trans'].transform(X_train['description']),
    column_trans.named_transformers_['cat_trans'].transform(X_train[categorical_columns]),
)

(<63768x41069 sparse matrix of type '<class 'numpy.float64'>'
 	with 1928971 stored elements in Compressed Sparse Row format>,
 <63768x26000 sparse matrix of type '<class 'numpy.int64'>'
 	with 1546363 stored elements in Compressed Sparse Row format>,
 <63768x15069 sparse matrix of type '<class 'numpy.float64'>'
 	with 382608 stored elements in Compressed Sparse Row format>)

In [71]:
from sklearn.svm import LinearSVR

In [72]:
# End-to-end model: feature extraction followed by a linear support-vector regressor.
reg = Pipeline(steps=[
    ('column_trans', column_trans),
    ('reg', LinearSVR(random_state=42)),
])

In [73]:
%%time
from sklearn.model_selection import cross_validate
scores = cross_validate(reg, X_train, y_train, return_estimator=True, scoring='r2')
print(f"Mean R² score: {scores['test_score'].mean():.3f} ± {scores['test_score'].std():.3f}")

Mean R² score: 0.655 ± 0.008
CPU times: user 17.9 s, sys: 27.3 ms, total: 17.9 s
Wall time: 17.9 s


---

### Moving to Colab for neural-network (NN) training

In [15]:
from scipy.sparse import save_npz

In [20]:
# Fit the feature extraction on the training split and keep the resulting sparse matrix.
# NOTE(review): the output recorded below (63765x28193) does not match the
# 63768x41069 matrix shown when this transformer was fitted on X_train earlier, and
# this cell's execution count belongs to a different run. Re-run the notebook on a
# fresh kernel before relying on the exported files.
X_train_trans = column_trans.fit_transform(X_train)
X_train_trans

<63765x28193 sparse matrix of type '<class 'numpy.float64'>'
	with 1864101 stored elements in Compressed Sparse Row format>

In [21]:
# Encode the held-out split with the already-fitted transformer (transform only,
# no re-fit), so the matrix has the same columns as the training matrix.
X_test_trans = column_trans.transform(X_test)
X_test_trans

<21255x28193 sparse matrix of type '<class 'numpy.float64'>'
	with 616673 stored elements in Compressed Sparse Row format>

In [22]:
# Write the training feature matrix to disk in SciPy's sparse .npz format
# ('.npz' is appended to the given name).
save_npz('X_train_trans', X_train_trans)

In [26]:
# Write the test feature matrix to disk in SciPy's sparse .npz format
# ('.npz' is appended to the given name).
save_npz('X_test_trans', X_test_trans)

In [24]:
# Write the training targets to disk as a NumPy array file
# ('.npy' is appended to the given name).
np.save('y_train', y_train)

In [25]:
# Write the test targets to disk as a NumPy array file
# ('.npy' is appended to the given name).
np.save('y_test', y_test)

---

In [74]:
# Load the evaluation set with the same schema as the development set, minus the
# 'quality' target, which is not read from the evaluation file.
eval_dtypes = {k: t for k, t in dtypes.items() if k != 'quality'}
df_eval = pd.read_csv('../datasets/competition_dataset/eval.tsv', sep='\t',
                      dtype=eval_dtypes, usecols=list(eval_dtypes))

# Unlike the development set, no row is dropped here: every evaluation row needs a
# prediction in the submission file. Missing values in all categorical columns
# (country and province included) become the explicit "UNKNOWN_<column>" category;
# the one-hot encoder is configured with handle_unknown='ignore', so a category it
# did not see during fit does not raise.
for col in categorical_columns:
    unk_cat = f"UNKNOWN_{col}"
    df_eval[col] = df_eval[col].cat.add_categories(unk_cat).fillna(unk_cat)
df_eval

Unnamed: 0,country,description,province,region_1,region_2,variety,winery
0,US,Strong blueberry and black-cherry aromas mesh ...,California,Santa Cruz Mountains,Central Coast,Cabernet Sauvignon,Byington
1,Germany,"Lush and decadent, with intensely sweet notes ...",Mosel,UNKNOWN_region_1,UNKNOWN_region_2,Riesling,S.A. Prüm
2,Spain,"Leather, mint and wet-dog aromas along with an...",Northern Spain,Toro,UNKNOWN_region_2,Tinta de Toro,La Casa Maguila
3,France,"92-94 Barrel sample. Wood dominant, smooth tan...",Bordeaux,Saint-Julien,UNKNOWN_region_2,Bordeaux-style Red Blend,Château Branaire-Ducru
4,France,"A lean, austere wine, maybe the result of the ...",Southwest France,Cahors,UNKNOWN_region_2,Malbec,Château les Croisille
...,...,...,...,...,...,...,...
30181,US,This is very good in a Napa-does-Bordeaux styl...,California,Napa Valley,Napa,Bordeaux-style Red Blend,Lail
30182,US,You might mistake this for a good Dry Creek Zi...,California,Dry Creek Valley,Sonoma,Cabernet Franc,Mounts
30183,US,"A Southern Rhône-style blend of Syrah, Grenach...",California,Santa Barbara County,Central Coast,G-S-M,Tercero
30184,South Africa,"Fresh grass, vibrant citrus and tart gooseberr...",Polkadraai Hills,UNKNOWN_region_1,UNKNOWN_region_2,Chenin Blanc-Sauvignon Blanc,Stellenbosch Hills


In [75]:
# Refit the pipeline on the entire development set before predicting the evaluation
# set. `fit` returns the pipeline itself, so `reg_eval` and `reg` are the same object.
reg_eval = reg.fit(df.drop(columns='quality'), df['quality'])

In [76]:
# R² on the very rows the pipeline was fitted on: an optimistic figure, not an
# estimate of generalisation (compare with the cross-validated score).
reg.score(df.drop(columns='quality'), df['quality'])

0.7775471683525556

In [77]:
# Predict the quality of every evaluation row and store it next to the features.
# Any existing 'Predicted' column is excluded from the model input, so re-running
# this cell feeds the pipeline exactly the columns it was fitted on.
df_eval['Predicted'] = reg_eval.predict(df_eval.drop(columns='Predicted', errors='ignore'))
df_eval['Predicted']

0        56.532013
1        52.190918
2        43.717091
3        62.423754
4        39.317588
           ...    
30181    56.408184
30182    51.578867
30183    49.008359
30184    29.899439
30185    36.153135
Name: Predicted, Length: 30186, dtype: float64

In [78]:
# Submission file: the frame's index written as the 'Id' column, followed by the
# 'Predicted' column only.
df_eval.to_csv('second.csv', columns=['Predicted'], index_label='Id')