In [1]:
__author__ = "Niraj Jayant"
__version__ = "CS224u, Stanford, Spring 2020"

In [11]:
# imports
import os
import pandas as pd
import numpy as np
import string
from operator import itemgetter
from collections import Counter, OrderedDict

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier

import sst
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from torch_rnn_classifier import TorchRNNClassifier, TorchRNNClassifierModel
from torch_rnn_classifier import TorchRNNClassifier
from sklearn.metrics import classification_report
import utils
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from collections import Counter, defaultdict


from matplotlib import pyplot as plt

In [3]:
# constants used in this project
WINE_SRC_FILENAME = os.path.join(
    "data", "wine-reviews", "winemag-data-130k-v2.csv")
MIN_WINE_REVIEWS=100 # potentially change
GLOVE_6B_50D_PATH = os.path.join(
    "data", "glove.6B", "glove.6B.50d.txt")
encoding="utf-8"


In [4]:
# for the initial data frame, we are going to be using just the description and the variety
col_list=['description', 'variety']
baseline_df = pd.read_csv(WINE_SRC_FILENAME, usecols=col_list)

In [5]:
variety_mapping = {'Shiraz': 'Syrah', 
                   'Pinot Gris': 'Pinot Grigio', 'Pinot Blanc': 'Pinot Grigio',  'Pinot Bianco' : 'Pinot Grigio', 
                   'Garnacha': 'Grenache',
                   'Alvarinho' : 'Albariño',
                    'Muscat' : 'Moscato',
                   'Glera' : 'Prosecco',
                   'Sauvignon': 'Sauvignon Blanc',
                   'Blaufränkisch': 'Gamay',
                   'Primitivo': 'Zinfandel',
                   'Pinot Nero': 'Pinot Noir',
                   'Tinta de Toro': 'Tempranillo', 'Tinto Fino': 'Tempranillo',
                    'Monastrell': 'Mourvèdre',
                   'Alvarinho': 'Albariño',
                   'Rosato': 'Rosé',
                  }

def consolidate_varieties(variety_name):
    if variety_name in variety_mapping:
        return variety_mapping[variety_name]
    else:
        return variety_name

# clean data is the entry point to cleaning our data frame. It removes low review wines, blends, consolidates varietals
# note: for now, this wont do anything for descriptions. We will explore various improvements to descriptions
def clean_data(input_df):
    # assume caller does not want modified data - return new_copy
    df = input_df.copy()
    df = df[df.groupby('variety')['variety'].transform('count').ge(MIN_WINE_REVIEWS)]
    filters = ['Blend', 'Cabernet Sauvignon-Merlot', 'Cabernet Sauvignon-Syrah', 'Meritage', 'G-S-M']
    for filtered_variety in filters:
        df = df[~df.variety.str.contains(filtered_variety)]
    df['variety'] = df['variety'].apply(consolidate_varieties)
    return df


In [6]:
cleansed_df = clean_data(baseline_df)
print(cleansed_df.shape)
# expectation that we have 96,433 examples here

(96433, 2)


In [31]:
# Baseline - use the DummyClassifier with no additional cleaning
skf = StratifiedKFold(n_splits=5)
X = cleansed_df['description']
y = cleansed_df['variety']
dummy_clf = DummyClassifier(strategy="uniform")

cross_validate(dummy_clf, X, y, scoring='f1_macro', cv=skf,return_train_score=True)

{'fit_time': array([0.04650998, 0.0445559 , 0.04517794, 0.04417515, 0.04435921]),
 'score_time': array([0.09912896, 0.09586215, 0.10755181, 0.09689999, 0.09530878]),
 'test_score': array([0.00958214, 0.0093852 , 0.01049441, 0.00993733, 0.01040661]),
 'train_score': array([0.00960455, 0.01070873, 0.01004368, 0.00946385, 0.01033728])}

In [8]:
# Okay - that was pretty terrible. Now lets go ahead and do some cleansing of data as well
def cleanse_descriptions(input_df):
    df = input_df.copy()
    df['description'] = df['description'].apply(normalize_text)
    return df

stop_words = set(stopwords.words('english')) 

punctuation_table = str.maketrans({key: None for key in string.punctuation})
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    try:
        word_list = word_tokenize(raw_text)
        normalized_sentence = []
        for w in word_list:
            try:
                w = str(w)
                lower_case_word = str.lower(w)
                stemmed_word = sno.stem(lower_case_word)
                no_punctuation = stemmed_word.translate(punctuation_table)
                if len(no_punctuation) > 1 and no_punctuation not in stop_words:
                    normalized_sentence.append(no_punctuation)
            except:
                continue
        return normalized_sentence
    except:
        return ''



In [9]:
cleansed_df = cleanse_descriptions(cleansed_df)

In [10]:
cleansed_df.description.head(50)

1     [ripe, fruiti, wine, smooth, still, structur, ...
2     [tart, snappi, flavor, lime, flesh, rind, domi...
3     [pineappl, rind, lemon, pith, orang, blossom, ...
4     [much, like, regular, bottl, 2012, come, acros...
7     [dri, restrain, wine, offer, spice, profus, ba...
8     [savori, dri, thyme, note, accent, sunnier, fl...
9     [great, depth, flavor, fresh, appl, pear, frui...
10    [soft, suppl, plum, envelop, oaki, structur, c...
11    [dri, wine, veri, spici, tight, taut, textur, ...
12    [slight, reduc, wine, offer, chalki, tannic, b...
13    [domin, oak, oakdriven, aroma, includ, roast, ...
14    [build, 150, year, six, generat, winemak, trad...
15    [zesti, orang, peel, appl, note, abound, sprig...
16    [bake, plum, molass, balsam, vinegar, cheesi, ...
17    [raw, blackcherri, aroma, direct, simpl, good,...
21    [sleek, mix, tart, berri, stem, herb, along, h...
23    [wine, geneseo, district, offer, aroma, sour, ...
24    [aroma, prune, blackcurr, toast, oak, carr

In [25]:
import struct 

glove_small = {}
all_words = set(w for words in X for w in words)
with open(GLOVE_6B_50D_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if (word in all_words):
            nums=np.array(parts[1:], dtype=np.float32)
            glove_small[word] = nums

In [26]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(glove_small))])
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

    
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(glove_small))])
        else:
            self.dim=0
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [32]:
X = cleansed_df.description

In [19]:
model = Word2Vec(X, size=100, window=5, min_count=5, workers=2)
w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

  


In [21]:
mult_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("multinomial nb", MultinomialNB())])
bern_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("bernoulli nb", BernoulliNB())])

In [22]:
cross_validate(mult_nb, X, y, scoring='f1_macro', cv=skf,return_train_score=True)

{'fit_time': array([1.39923191, 1.35763001, 1.37009978, 1.3482151 , 1.37506628]),
 'score_time': array([0.3747561 , 0.37871504, 0.38309503, 0.37148499, 0.36713576]),
 'test_score': array([0.18856973, 0.19170563, 0.19295868, 0.19213188, 0.19431725]),
 'train_score': array([0.23093638, 0.22985213, 0.23161311, 0.23003482, 0.22856989])}

In [23]:
cross_validate(bern_nb, X, y, scoring='f1_macro', cv=skf,return_train_score=True)

{'fit_time': array([1.37863398, 1.37427616, 1.38343811, 1.36535597, 1.37179708]),
 'score_time': array([0.400985  , 0.39869285, 0.41357803, 0.39695597, 0.3951962 ]),
 'test_score': array([0.14083293, 0.14302528, 0.14079188, 0.14711167, 0.14208191]),
 'train_score': array([0.16245198, 0.16220059, 0.16173146, 0.16142293, 0.16140417])}

In [27]:
etree_w2v = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])

In [28]:
cross_validate(etree_w2v, X, y, scoring='f1_macro', cv=skf,return_train_score=True)

{'fit_time': array([58.02340817, 58.53746319, 57.82559299, 57.74854875, 58.61525416]),
 'score_time': array([8.05409312, 7.59044194, 6.72545886, 6.78239799, 6.73230886]),
 'test_score': array([0.31642043, 0.3344908 , 0.34937745, 0.31815292, 0.33891469]),
 'train_score': array([0.9998726 , 0.99983804, 0.99970773, 0.99972947, 0.99986676])}

In [29]:
cross_validate(etree_w2v_tfidf, X, y, scoring='f1_macro', cv=skf,return_train_score=True)

{'fit_time': array([68.36335588, 67.07578516, 69.75583506, 66.89327598, 66.52365017]),
 'score_time': array([7.55213904, 7.47810292, 7.93639278, 7.52639914, 7.43071198]),
 'test_score': array([0.32478018, 0.34318613, 0.35720333, 0.32587632, 0.35026093]),
 'train_score': array([0.9998726 , 0.99983804, 0.99970773, 0.99972947, 0.99986676])}

In [50]:
class HfBertClassifierModel(nn.Module):
    def __init__(self, n_classes, weights_name='bert-base-cased'):
        super().__init__()
        self.n_classes = n_classes
        self.weights_name = weights_name
        self.bert = BertModel.from_pretrained(self.weights_name)
        self.hidden_dim = self.bert.embeddings.word_embeddings.embedding_dim
        # The only new parameters -- the classifier layer:
        self.W = nn.Linear(self.hidden_dim, self.n_classes)
        
    def forward(self, X):
        """Here, `X` is an np.array in which each element is a pair 
        consisting of an index into the BERT embedding and a 1 or 0
        indicating whether the token is masked. The `fit` method will 
        train all these parameters against a softmax objective.
        
        """
        indices = X[: , 0, : ]
        # Type conversion, since the base class insists on
        # casting this as a FloatTensor, but we ned Long
        # for `bert`.
        indices = indices.long()
        mask = X[: , 1, : ]      
        (final_hidden_states, cls_output) = self.bert(
            indices, attention_mask=mask)       
        return self.W(cls_output)

In [51]:
class HfBertClassifier(TorchShallowNeuralClassifier):
    def __init__(self, weights_name='bert-base-cased', *args, **kwargs):
        self.weights_name = weights_name
        self.tokenizer = BertTokenizer.from_pretrained(self.weights_name)
        super().__init__(*args, **kwargs)
        
    def define_graph(self):
        """This method is used by `fit`. We override it here to use our
        new BERT-based graph.
        
        """
        bert = HfBertClassifierModel(
            self.n_classes_, weights_name=self.weights_name)
        bert.train()
        return bert
    
    def encode(self, X, max_length=None):
        """The `X` is a list of strings. We use the model's tokenizer
        to get the indices and mask information.
        
        Returns
        -------
        list of [index, mask] pairs, where index is an int and mask
        is 0 or 1.
        
        """
        data = self.tokenizer.batch_encode_plus(
            X, 
            max_length=max_length,
            add_special_tokens=True, 
            pad_to_max_length=True,
            return_attention_mask=True)
        indices = data['input_ids']
        mask = data['attention_mask']
        return [[i, m] for i, m in zip(indices, mask)]

In [53]:
hf_fine_tune_mod = HfBertClassifier(
    'bert-base-cased', 
    batch_size=16, # Crucial; large batches will eat up all your memory!
    max_iter=4, 
    eta=0.00002)


In [77]:
cross_validate(hf_fine_tune_mod, X, y, scoring='f1_macro')

{'fit_time': array([0.0058949 , 0.00813484, 0.00777006, 0.00820398, 0.00589013]),
 'score_time': array([0., 0., 0., 0., 0.]),
 'test_score': array([nan, nan, nan, nan, nan])}

In [None]:
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    break
    

X_hf_str_train = [" ".join(x) for x in X_train]
X_hf_str_dev = [" ".join(x) for x in X_test]
X_hf_indices_train = hf_fine_tune_mod.encode(X_hf_str_train)

X_hf_indices_dev = hf_fine_tune_mod.encode(X_hf_str_dev)
%time _ = hf_fine_tune_mod.fit(X_hf_indices_train, y_train)


In [None]:
hf_fine_tune_preds = hf_fine_tune_mod.predict(X_hf_indices_dev)

In [None]:
print(classification_report(hf_fine_tune_preds, y_hf_dev, digits=3))