# Beer Review

In [1]:
# import packages
import numpy as np
import pandas as pd
import sqlite3

# natural language processing
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
# NOTE(review): PATH is defined but not used by the connection below -- the
# database is opened relative to the current working directory. Confirm whether
# PATH + 'beer.sqlite' was intended.
PATH = 'C:\\data\\'
conn = sqlite3.connect('beer.sqlite')

# the reviews are split in 4 tables / dfs due to memory issues
try:
    df1 = pd.read_sql_query('select * from df1', conn)
    df2 = pd.read_sql_query('select * from df2', conn)
    df3 = pd.read_sql_query('select * from df3', conn)
    df4 = pd.read_sql_query('select * from df4', conn)
finally:
    # close the connection even when one of the reads fails
    # (e.g. missing table), so the database handle is never leaked
    conn.close()

Find the most common 3-grams for each beer

In [3]:
# Extra tokens to ignore in the n-gram analysis, in addition to the
# English stopwords that basic_clean() already drops.
ADDITIONAL_STOPWORDS = [
    'n',
    'ratebeer',
    'iphone',
    'buddy',
    'via',
    'rated',
    'trader',
]

def basic_clean(text):
    """
    Normalize a text and return its lemmatized, non-stopword tokens.

    The text is NFKD-normalized, stripped of every non-ASCII character,
    lower-cased and cleared of everything that is neither a word character
    nor whitespace. The remaining whitespace-separated words are dropped when
    they are English stopwords or listed in ADDITIONAL_STOPWORDS; all other
    words are lemmatized with WordNet.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens, in the order they appear in the text.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership tests; with a list every single
    # token triggered a linear scan over all stopwords.
    ignored = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)
    ascii_text = (unicodedata.normalize('NFKD', text)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore')
                  .lower())
    # punctuation is removed (not replaced by a space) before splitting
    words = re.sub(r'[^\w\s]', '', ascii_text).split()
    return [lemmatizer.lemmatize(word) for word in words if word not in ignored]


def ngram(df):
    """
    Find the most common 3-grams for each beer.

    Only beers with more than 50 non-null ratings are analysed. For each of
    them the review texts are concatenated, cleaned with basic_clean() and
    the 3-grams that occur at least 10 times are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews with at least the columns 'beer_id', 'rating' and 'text'.

    Returns
    -------
    pandas.DataFrame
        Columns 'index' (the 3-gram as a tuple of words), 'count' and
        'beer_id'. The row label is the frequency rank of the 3-gram within
        its beer (0 = most frequent). An empty frame with these columns is
        returned when no beer has enough reviews.
    """
    # keep only beers with more than 50 reviews
    df_filter = df.groupby(['beer_id']).agg(count=('rating', 'count')).reset_index()
    df_filter = df_filter[df_filter['count'] > 50]
    df_filter = pd.merge(df_filter, df, on='beer_id')

    per_beer = []
    # one pass over the groups instead of a full boolean scan per beer
    for beer, beer_reviews in df_filter.groupby('beer_id'):
        # Join the raw review strings. Converting the list of reviews with
        # str() would hand its repr to basic_clean(), where escape sequences
        # such as \n degrade into stray letters glued to neighbouring words.
        texts = beer_reviews['text'].dropna().astype(str)
        words = basic_clean(' '.join(texts))

        counts = (pd.Series(list(nltk.ngrams(words, 3)))
                  .value_counts()
                  .to_frame(name='count')
                  .reset_index())

        # keep only ngrams that appear at least 10 times; the row labels are
        # left untouched because they encode the frequency rank
        frequent = counts[counts['count'] >= 10].copy()
        frequent['beer_id'] = beer
        per_beer.append(frequent)

    if not per_beer:
        # pd.concat raises on an empty list
        return pd.DataFrame(columns=['index', 'count', 'beer_id'])
    return pd.concat(per_beer)

In [4]:
# run the 3-gram extraction on each of the four data chunks, in order
ngrams1, ngrams2, ngrams3, ngrams4 = [ngram(chunk) for chunk in (df1, df2, df3, df4)]

In [5]:
# join data
ngrams = pd.concat([ngrams1, ngrams2, ngrams3, ngrams4])

# keep only top 20 ngram for each beer: the row labels returned by ngram() are
# the frequency rank within the beer (0 = most frequent). Because a column
# called 'index' already exists, reset_index() stores them in 'level_0'.
ngrams = ngrams.reset_index()
ngrams = ngrams[ngrams['level_0'] < 20].copy()  # copy: columns are added below

# clean and prepare final data: turn the printed tuple "('a', 'b', 'c')" into
# "a b c". regex=False is explicit because the default differs between pandas
# versions and '(' / ')' are not valid regular expressions on their own.
ngrams['word'] = (ngrams['index'].astype(str)
                  .str.replace(',', '', regex=False)
                  .str.replace('(', '', regex=False)
                  .str.replace(')', '', regex=False)
                  .str.replace("'", '', regex=False))
ngrams['word'] = (ngrams['word']
                  .str.replace(' head aroma', ' head', regex=False)
                  .str.replace('head aroma ', 'aroma ', regex=False))
ngrams = ngrams[['beer_id', 'count', 'word']]

Find adjectives (very time consuming)

In [8]:
# Words ignored in the adjective analysis.
# - Kept under its own name: rebinding ADDITIONAL_STOPWORDS here would silently
#   change the stopwords used by basic_clean() whenever the n-gram cells are
#   re-run after this one.
# - Every entry is lower case, because adj() lower-cases the review text before
#   filtering; capitalised entries could never match.
ADJ_ADDITIONAL_STOPWORDS = ['clear','white','text','golden','yellow','pale','mexican','much',
                            'dark','bottle','brown','black','green','indian','deep','tap','colour',
                            'reddish','canadian','offwhite','red','many','first','overall','large','clean',
                            'aroma','american','ale','low','amber','cold','japanese','high','lime','ligh','full',
                            'crystal','brownish','know','smal','name','gold','head','beige','available','second',
                            'isnt','malt','cant','tall','thi','une','several','middle','lemon','bottled','true',
                            'lager','initial','macro','drank','finish','brew','caramel','single','english','dont','close',
                            'dtype']

stop_words = list(set(stopwords.words('english'))) + ADJ_ADDITIONAL_STOPWORDS
# part-of-speech tags accepted as adjectives: JJ (adjective), JJS (superlative)
allowed_word_types = ["JJS", 'JJ']

def adj(df):
    """
    Extract the adjectives used in the reviews of each beer.

    Only beers with more than 50 non-null ratings are analysed. Every review
    is lower-cased, reduced to letters and whitespace, tokenized, filtered
    against `stop_words` and POS-tagged; tokens whose tag is listed in
    `allowed_word_types` are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews with at least the columns 'beer_id', 'rating' and 'text'.

    Returns
    -------
    pandas.DataFrame
        One row per adjective occurrence with the columns 'word', 'type'
        (the POS tag) and 'beer_id'. The index, named 'pos', is the position
        of the originating review among the reviews that were kept. The frame
        is empty when nothing qualifies.
    """
    # keep only beers with more than 50 reviews
    df_filter = df.groupby(['beer_id']).agg(count=('rating', 'count')).reset_index()
    df_filter = df_filter[df_filter['count'] > 50]
    df_filter = pd.merge(df_filter, df, on='beer_id').reset_index(drop=True)

    ignored = set(stop_words)  # constant-time lookups inside the loop
    rows = []

    # Single pass over the reviews. Looking every row up with a boolean mask
    # and joining it back to the whole frame made the loop quadratic.
    for pos, (beer_id, text) in enumerate(zip(df_filter['beer_id'], df_filter['text'])):
        if pd.isna(text):
            continue

        # Use the review text itself: str() of a one-row Series is its printed
        # form, which truncates long texts and appends "Name: text, dtype: object".
        # remove punctuations and digits (parentheses included)
        cleaned = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())

        # tokenize and remove stopwords
        tokens = [w for w in word_tokenize(cleaned) if w not in ignored]

        # parts of speech tagging; keep only the adjectives
        for word, tag in nltk.pos_tag(tokens):
            if tag in allowed_word_types:
                rows.append((pos, word, tag, beer_id))

    result = pd.DataFrame(rows, columns=['pos', 'word', 'type', 'beer_id'])
    return result.set_index('pos')

In [9]:
# run the adjective extraction on each of the four data chunks, in order
adj1, adj2, adj3, adj4 = [adj(chunk) for chunk in (df1, df2, df3, df4)]

In [10]:
# join data (stored as adj_all: reusing the name `adj` would overwrite the
# adj() function and break the previous cell when it is run again)
adj_all = pd.concat([adj1, adj2, adj3, adj4])

# group by beer: number of occurrences of each word per beer
adj_agg = adj_all.groupby(['beer_id', 'word']).size().reset_index(name='count')

# remove words that are not common (fewer than 5 occurrences)
adj_agg = adj_agg[adj_agg['count'] >= 5]

# remove words with less than 3 letters
adj_agg = adj_agg[adj_agg['word'].str.len() > 2]

# remove words that do not have a good meaning alone
remove = ['small','thin','object','big','pour','nose','little','name','huge','poured','belgian','ive','ipa',
          'imperial','total','double','last','lambic','russian','tiny','massive','german','southern','triple',
          'trappist','scottish','danish','irish']

adj_agg = adj_agg[~adj_agg['word'].isin(remove)].copy()  # copy: a column is added below

# remove words outside top 30 for each beer. method='first' gives every word a
# distinct rank (ties broken by row order), so exactly the 30 most frequent
# words are kept; `rank < 30` with averaged ties kept 29 or an erratic number.
adj_agg['rank'] = adj_agg.groupby('beer_id')['count'].rank(method='first', ascending=False)
adj_agg = adj_agg[adj_agg['rank'] <= 30]

adj_agg = adj_agg[['beer_id', 'count', 'word']]

Join adjectives and ngrams and save

In [None]:
# stack the 3-gram table and the adjective table, then export the result as a
# tab-separated, UTF-8 encoded file without the row index
review = pd.concat([ngrams, adj_agg])
review.to_csv(
    'beer review.csv',
    sep='\t',
    decimal=',',
    encoding='utf-8',
    index=False,
)