In [1]:
import sys

sys.path.append('..')

%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pickle

from datetime import datetime
from joblib import dump
from shutil import rmtree
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures
from tempfile import mkdtemp

from src.helpers import PorterTokenizer

[nltk_data] Downloading package punkt to /home/nixwill/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nixwill/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Read the cleaned training split; the path is relative to the notebook's
# working directory.
train_csv_path = '../data/cleaned/train.csv'
train_df = pd.read_csv(train_csv_path)

# Bare expression so the notebook renders the frame as the cell output.
train_df

Unnamed: 0,country,description,points,price,taster_name,points_group
0,Argentina,"Slightly gritty, nutty aromas indicate that th...",84,18.0,Michael Schachner,1
1,Portugal,Merlot and Syrah are added to Portuguese varie...,87,12.0,Roger Voss,2
2,France,Ripe mirabelle fruit plays on the nose and lea...,92,35.0,Anne Krebiehl MW,3
3,Italy,"This blend of 70% Sangiovese with Canaiolo, Co...",88,14.0,Kerin O’Keefe,2
4,Italy,"It opens with aromas of forest floor, porcini ...",88,28.0,Kerin O’Keefe,2
...,...,...,...,...,...,...
69907,France,"Light and only gently structured, this is a cr...",87,18.0,Roger Voss,2
69908,Germany,Veins of smoke and wax seep through fresh appl...,87,18.0,Anna Lee C. Iijima,2
69909,US,A combination of American and Hungarian oak le...,88,16.0,Anna Lee C. Iijima,2
69910,France,"Based in the heart of the Haut-Médoc, this pro...",84,20.0,Roger Voss,1


In [4]:
# Columns fed to the model; every other column of train_df is left out.
feature_columns = ['price', 'country', 'taster_name', 'description']
X_train_df = train_df[feature_columns]

# Two targets are extracted from the same frame: the bucketed score for
# classification and the raw score for regression.
y_train_clf, y_train_reg = train_df['points_group'], train_df['points']

In [5]:
def make_custom_transformer():
    """Build a fresh, unfitted column transformer for the training features.

    The transformer has three branches:

    * ``price`` -- degree-2 polynomial expansion without a bias column,
      followed by min-max scaling to the range [0, 1].
    * ``country`` and ``taster_name`` -- one-hot encoding; categories not
      seen during fitting are ignored at transform time.
    * ``description`` -- TF-IDF over unigrams with ``max_df=0.7`` and
      ``min_df=7``, tokenized by ``PorterTokenizer`` (imported from
      ``src.helpers``; presumably Porter stemming -- confirm there).

    Returns:
        The object produced by ``make_column_transformer``. Step names are
        generated by scikit-learn from the lower-cased class names, which is
        what lets the grid address ``transformer__tfidfvectorizer__...``.
    """
    price_branch = make_pipeline(
        PolynomialFeatures(2, include_bias=False),
        MinMaxScaler(feature_range=(0, 1)),
    )
    categorical_branch = OneHotEncoder(handle_unknown='ignore')
    text_branch = TfidfVectorizer(
        tokenizer=PorterTokenizer(),
        max_df=0.7,
        min_df=7,
        ngram_range=(1, 1),
    )

    return make_column_transformer(
        (price_branch, ['price']),
        (categorical_branch, ['country', 'taster_name']),
        # A scalar column name (not a list) hands the vectorizer a 1-D
        # column of strings, which is the input shape it expects.
        (text_branch, 'description'),
    )

In [None]:
# Temporary directory handed to the pipeline as its `memory` location; it is
# deleted with rmtree() in the cell that saves the fitted search.
cachedir = mkdtemp()

# Preprocessing followed by a linear classifier trained with SGD; the seed is
# fixed so repeated runs are comparable.
pipeline_steps = [
    ('transformer', make_custom_transformer()),
    ('estimator', SGDClassifier(random_state=42)),
]
pipe = Pipeline(steps=pipeline_steps, memory=cachedir)

In [None]:
# scikit-learn 1.1 renamed SGDClassifier's logistic loss from 'log' to
# 'log_loss', and 1.3 removed the old spelling. Candidates that fail to fit are
# scored NaN with only a warning (GridSearchCV's default error_score), so a
# stale name would silently drop half of this grid. Pick whichever spelling
# the installed version accepts; fall back to the modern one if unsure.
_sgd_losses = getattr(SGDClassifier, 'loss_functions', {})
if 'log' in _sgd_losses and 'log_loss' not in _sgd_losses:
    logistic_loss = 'log'
else:
    logistic_loss = 'log_loss'

# Keys follow scikit-learn's `<step>__<param>` convention: `transformer__...`
# reaches into the column transformer, `estimator__...` into the SGDClassifier.
param_grid = dict(
    transformer__tfidfvectorizer__max_df=[0.7],
    # An int is an absolute document count, a float a fraction of documents.
    transformer__tfidfvectorizer__min_df=[7, 0.01],
    transformer__tfidfvectorizer__ngram_range=[(1, 1)],
    estimator__loss=[logistic_loss, 'modified_huber'],
    estimator__penalty=['l2', 'l1', 'elasticnet'],
    estimator__alpha=[0.0001, 0.001, 0.05],
    estimator__class_weight=['balanced', None],
    estimator__average=[True, False],
)

# Exhaustive 4-fold search over the grid on all available cores. Three metrics
# are recorded per candidate; the best model by weighted F1 is refitted on the
# full training set.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=4,
    n_jobs=-1,
    verbose=3,
    scoring=['f1_weighted', 'balanced_accuracy', 'accuracy'],
    refit='f1_weighted',
)

grid_search = grid_search.fit(X_train_df, y_train_clf)

In [None]:
# Colon-free timestamp: the ':' separators produced by isoformat() are not
# allowed in Windows file names, so the saved model would not be portable.
timestamp = datetime.now().strftime('%Y-%m-%dT%H-%M-%S')

try:
    # Persist the whole fitted search (best estimator plus cv_results_).
    dump(grid_search, f'../models/SGDC-{timestamp}.joblib')
finally:
    # Remove the pipeline cache even when saving fails, so the temporary
    # directory is not leaked. ignore_errors keeps the cell re-runnable after
    # the directory has already been removed.
    rmtree(cachedir, ignore_errors=True)