# Analysis on Wine Enthusiast Reviews

In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.manifold import MDS

from gensim import corpora, models, similarities 


import re
import os
import codecs
import dill

import multiprocess as multiprocessing

%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import mpld3

sns.set_context('notebook')
sns.set_palette('dark')
sns.set_style('white')

In [2]:
data = pd.read_pickle('../pkl/07_wine_enthusiast_data_cleaned.pkl')

In [3]:
data.shape

(131824, 15)

In [4]:
# shorten to 10,000 randomly selected reviews to speed up initial analysis
data = data.loc[np.random.choice(data.index, 2000)]
data.shape

(2000, 15)

In [5]:
data.head(2)

Unnamed: 0,alcohol,appellation,bottle_size,category,date_published,designation,importer,list_url_no,price,rating,review,title,url,variety,winery
174834,,"Tierra Manchuela, Central Spain, Spain",,Red,2004-03-01,Protocolo,Jorge Ordoñez Selections,5841,6.0,86.0,"Subdued and earthy, with cool, compact red-fru...",Dominio de Eguren 2001 Protocolo Tempranillo (...,http://www.winemag.com/buying-guide/dominio-eg...,Tempranillo,Dominio de Eguren
155689,12.0,"The Hamptons, Long Island, Long Island, New Yo...",,White,2006-08-01,Reserve,,5202,20.0,84.0,"Still young-looking, with a touch of green to ...","Wölffer 2002 Reserve Chardonnay (The Hamptons,...",http://www.winemag.com/buying-guide/wolffer-20...,Chardonnay,WÃ¶lffer


## Stemming and tokenizing

In [14]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['s',"'s"])
stopwords.extend([x.lower() for x in data.variety.unique()])
stemmer = SnowballStemmer('english')

In [15]:
wordnet_lemmatizer = WordNetLemmatizer()

def tokenize_with_lemmatizing(text):
    text = text.lower()
    # Tokenize sentences and words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    # Remove non-alphanumeric terms, convert to lower case
    tokens = [word for word in tokens if re.search('[a-zA-Z]', word)]
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    # Lemmatize the words
    tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

def tokenize_with_stemming(text):
    text = text.lower()
    
    # Tokenize sentences and words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    # Remove non-alphanumeric terms, convert to lower case
    tokens = [word for word in tokens if re.search('[a-zA-Z]', word)]
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    # Stemm the words
    tokens = [stemmer.stem(word) for word in tokens]
    
    return tokens
    
def tokenize(text):
    text = text.lower()
    
    # Tokenize sentences and words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    # Remove non-alphanumeric terms, convert to lower case
    tokens = [word for word in tokens if re.search('[a-zA-Z]', word)]
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    return tokens

In [16]:
data['word_token'] = data.review.apply(tokenize)
data['word_stem'] = data.review.apply(tokenize_with_stemming)
data['word_lemm'] = data.review.apply(tokenize_with_lemmatizing)

In [17]:
data[['word_token','word_stem']].head(2)

Unnamed: 0,word_token,word_stem
174834,"[subdued, earthy, cool, compact, red-fruit, ar...","[subdu, earthi, cool, compact, red-fruit, arom..."
155689,"[still, young-looking, touch, green, pale, yel...","[still, young-look, touch, green, pale, yellow..."


In [18]:
stem_df = pd.DataFrame({'stem':sum(data.word_stem.tolist(),[]), 
                        'lemm':sum(data.word_lemm.tolist(),[]), 
                        'word':sum(data.word_token.tolist(),[])}).drop_duplicates().reset_index(drop=True)

In [19]:
stem_df.shape

(6205, 3)

In [20]:
stem_df.head()

Unnamed: 0,lemm,stem,word
0,subdued,subdu,subdued
1,earthy,earthi,earthy
2,cool,cool,cool
3,compact,compact,compact
4,red-fruit,red-fruit,red-fruit


## Tf-idf

# TODO: select optimized TF-IDF parameters

In [13]:
# TF-IDF for words, stemmed words, lemmatized words with a range of min/max values
tfidf_list = list()
corpus = data.review.tolist()

for max_df in [0.8, 0.85, 0.9, 0.95]:
    for min_df in [0.2, 0.15, 0.1, 0.05]:
        for nam,tokenizer in zip(['word','stem','lemm'], 
                                 [tokenize, tokenize_with_stemming, tokenize_with_lemmatizing]):
            
                label = 'tfidf_{}_{}_{}'.format(nam, min_df, max_df)
                
                tfidf = TfidfVectorizer(max_df = max_df, max_features=200000,
                                       min_df=min_df, stop_words='english', use_idf=True,
                                       tokenizer=tokenizer, ngram_range=(1,3))
                
                tfidf_matrix = tfidf.fit_transform(corpus)
                tfidf_list.append(pd.Series({'label':label, 'tfidf':tfidf, 'tfidf_matrix':tfidf_matrix}))

In [14]:
tfidf_df = pd.concat(tfidf_list, axis=1).T

In [15]:
tfidf_df['feature_names'] = tfidf_df.tfidf.apply(lambda x: x.get_feature_names())

In [16]:
tfidf_df['dist'] = tfidf_df.tfidf_matrix.apply(lambda x: 1 - cosine_similarity(x))

## K-means clustering

# TODO: select optimized K-means clusters

In [17]:
# Run kmeans clustering on entire range of tfidf matrices
kmeans_list = list()

# for num_clusters in range(3,16):
for num_clusters in range(3,11):
    
    for idx,dat in tfidf_df.iterrows():
        km = KMeans(n_clusters=num_clusters, n_jobs=-1)
        km.fit(dat.tfidf_matrix)
        kmeans_list.append(pd.Series({'num_clusters':num_clusters, 
                                      'label':dat['label']+'_{}'.format(num_clusters), 
                                      'km':km}))
                           
kmeans_df = pd.concat(kmeans_list, axis=1).T
                           
with open('../pkl/08_wine_enthusiast_kmeans_models.pkl','wb') as fh:
    dill.dump(kmeans_df, fh)
    
with open('../pkl/08_wine_enthusiast_kmeans_models.pkl','rb') as fh:
    kmeans_df = dill.load(fh)

NameError: name 'dill' is not defined

In [28]:
# Create a matrix of cluster values

cluster_df = pd.DataFrame([], index=data.index)

for idx,dat in kmeans_df.iterrows():
    km = dat.km
    label = dat.label
    num_clusters = dat.num_clusters
    cluster = km.labels_
    
    if (num_clusters >= 3)&(cluster.max()>0):
        cluster_df[label] = cluster
        
cluster_df[['category','variety','price','rating','title','appellation']] = data[['category','variety','price','rating','title','appellation']]

In [77]:
# Create a table of category counts per cluster
total_list = list()

tfidf_cols = pd.Series([x for x in cluster_df.columns if 'tfidf' in x],
                       index=pd.Index([int(re.search(r"""_([0-9]+)$""", x).group(1)) 
                       for x in cluster_df.columns if re.search(r"""_([0-9]+)$""", x)]))

std_cols = ['category', 'variety', 'price', 'rating', 'title', 'appellation']

# fig, axList = plt.subplots(nrows=8,ncols=6)
# fig.set_size_inches(16,20)
# color_list = sns.color_palette()
        
for nclusters in range(3,11):
    print(nclusters)
    col_list = tfidf_cols.loc[tfidf_cols.index == nclusters].values.tolist()

    sum_list = list()
    for col in col_list:

        # Create a table of number of categories in each cluster
        table = (pd.concat([data[['category']], cluster_df[[col]]], axis=1)
                 .groupby([col,'category'])
                 .size()
                 .to_frame()
                 .reset_index()
                 .rename(columns={0:'count',col:'cluster'}))
        table['col'] = col
        table = table.set_index(['col','cluster','category'])
        table_norm = table.div(table.groupby(level=[0,1]).transform(sum))
        table_norm = table_norm.rename(columns={'count':'norm'})

        # Calcualate the normalized difference between red and white wines
        table_diff = (table_norm
                 .reset_index(level=-1)
                 .groupby(level=1, group_keys=False)
                 .apply(lambda x: np.abs(x.loc[x.category=='Red','norm'] 
                                    - x.loc[x.category=='White','norm'])))

        # collect the tables
        df = pd.concat([table, table_norm], axis=1)
        sum_list.append(df)

#     # Save the figure
#     for ax,df in zip(axList.flatten(),sum_list):
#         ax.cla()

#         sns.barplot(x='cluster',
#                     y='norm',
#                     hue='category',
#                     data=df.reset_index(level=0,drop=True).reset_index(),
#                     palette={'Red':color_list[2],'Rose':'pink','White':'white'}, ax=ax)
#         ax.set(title=df.index.get_level_values(0).unique()[0],xlabel='',ylabel='')
#         ax.set_xticklabels([])
#         ax.legend().set_visible(False)

#     plt.tight_layout()
#     fig.savefig('clusters_{}.pdf'.format(nclusters))

    total_list.extend(sum_list)

3
4
5
6
7
8
9
10


In [117]:
km = kmeans_df.km.iloc[0]

# order_centroids
# order_centroids.shape,  len(terms)

for idx,dat in kmeans_df.iterrows():
    nclusters = dat.num_clusters
    km = dat.km
    label = dat.label
    order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
    print("TF-IDF %s:" % label)
    
    for ind in range(nclusters):
        label_short = re.sub('_{}'.format(nclusters),'',label)
        terms = tfidf_df.loc[tfidf_df.label==label_short].feature_names.values
        for col in order_centroids[ind]:
            print(terms[0][col],end=',')
            
        print()
    print()
        #print(' %s' % stem_df.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
#         print(i, ind)#, terms[ind])

TF-IDF tfidf_word_0.2_0.8_3:
flavors,finish,palate,aromas,cherry,tannins,acidity,wine,fruit,
fruit,wine,flavors,finish,aromas,cherry,tannins,acidity,palate,
wine,acidity,flavors,tannins,cherry,finish,aromas,palate,fruit,

TF-IDF tfidf_stem_0.2_0.8_3:
finish,palat,aroma,flavor,cherri,acid,tannin,fruit,wine,
fruit,wine,flavor,acid,tannin,finish,cherri,aroma,palat,
wine,flavor,cherri,acid,tannin,finish,aroma,palat,fruit,

TF-IDF tfidf_lemm_0.2_0.8_3:
flavor,finish,aroma,palate,fruit,wine,acidity,cherry,tannin,
cherry,tannin,flavor,wine,finish,fruit,aroma,acidity,palate,
wine,fruit,acidity,flavor,tannin,finish,cherry,aroma,palate,

TF-IDF tfidf_word_0.15_0.8_3:
flavors,finish,palate,aromas,cherry,tannins,black,oak,acidity,ripe,wine,fruit,
wine,acidity,flavors,ripe,tannins,cherry,oak,finish,black,aromas,palate,fruit,
fruit,wine,flavors,finish,oak,aromas,cherry,tannins,black,ripe,acidity,palate,

TF-IDF tfidf_stem_0.15_0.8_3:
wine,fruit,acid,flavor,ripe,rich,drink,tannin,dri,spice,oak,finish

## Multidimensional scaling 

# NOTE: also try PCA here.

In [30]:
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1, n_jobs=-1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]

## Visualizing review clusters

In [136]:
# #set up colors per clusters using a dict
# cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

# #set up cluster names using a dict
# cluster_names = {0: '0', 
#                  1: '1', 
#                  2: '2'}

In [137]:
# df = pd.DataFrame(dict(x=xs, y=ys, label=clusters)) 

# #group by cluster
# groups = df.groupby('label')


# # set up plot
# fig, ax = plt.subplots(figsize=(17, 9)) # set size
# ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

# #iterate through groups to layer the plot
# #note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
# for name, group in groups:
#     ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
#             label=cluster_names[name], color=cluster_colors[name], 
#             mec='none')
#     ax.set_aspect('auto')
#     ax.tick_params(\
#         axis= 'x',          # changes apply to the x-axis
#         which='both',      # both major and minor ticks are affected
#         bottom='off',      # ticks along the bottom edge are off
#         top='off',         # ticks along the top edge are off
#         labelbottom='off')
#     ax.tick_params(\
#         axis= 'y',         # changes apply to the y-axis
#         which='both',      # both major and minor ticks are affected
#         left='off',      # ticks along the bottom edge are off
#         top='off',         # ticks along the top edge are off
#         labelleft='off')
    
# ax.legend(numpoints=1)  #show legend with only 1 point

# #add label in x,y position with the label as the film title
# for i in range(len(df)):
#     ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  


In [138]:
# from scipy.cluster.hierarchy import ward, dendrogram

# linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances

# fig, ax = plt.subplots(figsize=(15, 20)) # set size
# ax = dendrogram(linkage_matrix, orientation="right")

## Gensim

In [38]:
texts = data.word_stem.tolist()
dictionary = corpora.Dictionary(texts)

In [39]:
dictionary.filter_extremes(no_below=1, no_above=0.8)

#convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]

In [60]:
models.LdaModel?

In [122]:
totalvocab_stemmed = [['a', 'b', 'cow'],['ernie','is','my','cat']]

In [124]:
[[' '.join(x)] for x in totalvocab_stemmed]

[['a b cow'], ['ernie is my cat']]

In [125]:
[' '.join(x) for x in totalvocab_stemmed]

['a b cow', 'ernie is my cat']

In [43]:
%time lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

CPU times: user 11min 19s, sys: 1.61 s, total: 11min 20s
Wall time: 11min 16s


In [44]:
! echo "pushover 'scrape finished'" | /bin/zsh

In [45]:
lda.show_topics()

[(0,
  '0.027*flavor + 0.025*wine + 0.019*acid + 0.017*fruit + 0.015*finish + 0.014*appl + 0.011*palat + 0.011*note + 0.010*crisp + 0.010*citrus'),
 (1,
  '0.025*flavor + 0.020*finish + 0.012*aroma + 0.011*fruit + 0.009*palat + 0.009*like + 0.008*feel + 0.007*littl + 0.006*oak + 0.005*chardonnay'),
 (2,
  '0.027*flavor + 0.024*wine + 0.016*fruit + 0.015*black + 0.015*tannin + 0.014*aroma + 0.013*finish + 0.011*cherri + 0.011*blackberri + 0.010*oak'),
 (3,
  '0.033*wine + 0.028*fruit + 0.014*flavor + 0.014*tannin + 0.012*cherri + 0.012*spice + 0.011*aroma + 0.009*acid + 0.009*drink + 0.009*red'),
 (4,
  '0.029*cherri + 0.021*flavor + 0.014*finish + 0.014*pinot + 0.011*drink + 0.010*wine + 0.009*soft + 0.009*noir + 0.008*spice + 0.008*cola')]

In [52]:
import dill

In [59]:
with open('lda','wb') as fh:
    dill.dump(lda, fh)
    
with open('corpus','wb') as fh:
    dill.dump(corpus, fh)
    
with open('dictionary','wb') as fh:
    dill.dump(dictionary, fh)

In [232]:
import funcy as fp
# import pyLDAvis.gensim

ImportError: No module named 'funcy'

In [48]:
python --version

NameError: name 'python' is not defined

In [49]:
!which python

/home/ubuntu/miniconda/envs/scienv3/bin/python
