# NLP on reviews

In [1]:
#From nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import stop_words
import string
from tqdm import tqdm, tqdm_notebook

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from collections import Counter


In [2]:
# Load the beer table (one row per beer, with its review text in `reviews`).
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index — confirm before relying on it.
df_rev = pd.read_csv('./data_ratebeer_withReview.csv')
df_rev.head(3)

Unnamed: 0,name,score,beer style,abv,cal,city,country,link,id,reviews
0,Ivanych Zhivoe Svetloe,2.84,Zwickel/Keller/Landbier,4.5,135.0,Sukhum,Abkhazia,/beer/ivanych-zhivoe-svetloe/301247/,301247,Bottle. Clear deep golden colour. Moderate whi...
1,Stara Praha Oldenburg Tiemnoye,2.52,Dunkel/Tmavý,4.7,141.0,Stara Gagra,Abkhazia,/beer/stara-praha-oldenburg-tiemnoye/334787/,334787,Bottle thanks to zwr. Cloudy amber beer with a...
2,Stara Praha Oldenburg Svetloe,2.51,Pale Lager,4.7,141.0,Stara Gagra,Abkhazia,/beer/stara-praha-oldenburg-svetloe/334785/,334785,Вполне обычный лагер. Пробова& пиво когда заез...


In [3]:
# (rows, columns) of the freshly loaded review table.
df_rev.shape

(4999, 10)

In [4]:
# Load the descriptive vocabulary: each row holds a space-separated list of
# words in `description`, labelled by `sens` and `types`.
df_des = pd.read_csv('./beer_desc_vocab_clean.csv')
df_des.head(3)

Unnamed: 0,sens,types,description
0,taste,finish,dry fruity sweet alcoholic warming bitter acid...
1,taste,intensity,assertive mild bold balanced robust intense me...
2,taste,notes,roasted bready bitter sweet spicy fruity choco...


## Get frequent words in text

In [5]:
# Flatten every description into one lower-cased list of vocabulary words.
# `split()` without an argument splits on any run of whitespace, so doubled,
# leading or trailing spaces (or tabs/newlines) inside a description can no
# longer produce empty-string or merged tokens in the vocabulary.
wordlist = " ".join([txt.lower() for txt in df_des.description]).split()

In [6]:
# De-duplicate the words (keeping first-seen order) and sort them
# case-insensitively to obtain the fixed vocabulary.
vocab_list = sorted(dict.fromkeys(wordlist), key=str.lower)

In [7]:
def get_freq_words(txt, top_n=10):
    """
    Counts how often each word of the module-level `vocab_list` occurs
    in a text and returns the most frequent ones.
    txt: (str) text to analyse
    top_n: (int) maximum number of words to return (default 10)
    output: list((str, int)) (word, count) pairs, highest counts first,
            restricted to words that occur at least once
    """
    # A fixed vocabulary restricts the counting to the descriptive words only.
    bow = CountVectorizer(vocabulary=vocab_list)
    doc_term = bow.fit_transform([txt])
    # `get_feature_names` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older installations.
    if hasattr(bow, "get_feature_names_out"):
        vocab_used = list(bow.get_feature_names_out())
    else:
        vocab_used = list(bow.get_feature_names())
    # Only one document is vectorized, so the 1 x n_words matrix is flattened.
    # `tolist()` yields plain Python ints, so the pairs print and serialise
    # as e.g. ('beer', 1) regardless of the installed numpy version.
    counts = doc_term.toarray().ravel().tolist()
    freq_dist = Counter(dict(zip(vocab_used, counts)))
    return [(word, count) for word, count in freq_dist.most_common(top_n) if count > 0]

In [8]:
# Smoke-test the counter on a hand-written sentence.
fr = get_freq_words("this beer is and dry, with a orange color and robust intensity")

In [9]:
# Show the (word, count) pairs found in the sample sentence.
fr

[('orange', 1), ('robust', 1), ('beer', 1), ('dry', 1)]

## Text Pre-processing

In [10]:
stop_words_ = stop_words.get_stop_words("en")
punctuation_filter = str.maketrans({key: None for key in string.punctuation})
# Set copy of the stop words: membership is tested once per token, and a set
# lookup is O(1) instead of a scan over the whole list.
_stop_word_set = frozenset(stop_words_)

def nlp_pre_process(text):
    """
    Normalizes an input text: lower-casing, punctuation removal,
    tokenization and stop word filtering. The remaining tokens are
    joined back into a single space-separated string.
    text: (str) raw text; a missing value (None / NaN) gives ""
    output: (str) cleaned text
    """
    # pandas represents a missing review as NaN (a float), which has no
    # `.lower()`; treat missing values as an empty text instead of crashing.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = text.lower()  # normalize
    text = text.translate(punctuation_filter)  # remove punctuation
    tokens = word_tokenize(text)  # tokenize
    # NOTE(review): punctuation is removed before this filter, so any stop
    # word containing an apostrophe would no longer match its stripped form
    # (e.g. "don't" -> "dont") — confirm whether that matters here.
    tokens = [t for t in tokens if t not in _stop_word_set]  # stop words filtering
    return " ".join(tokens)

In [11]:
# Register `progress_apply` on pandas objects. `tqdm.pandas` takes keyword
# options for the bar it creates, not an already-built bar instance, so it is
# called without arguments here.
tqdm.pandas()
# Replace each raw review with its cleaned, stop-word-free version.
df_rev.reviews = df_rev.reviews.progress_apply(nlp_pre_process)

100%|██████████| 4999/4999 [00:17<00:00, 293.11it/s]


In [13]:
# For every cleaned review, store its most frequent vocabulary words
# as a list of (word, count) pairs.
df_rev['most_freq'] = df_rev['reviews'].progress_apply(get_freq_words)

100%|██████████| 4999/4999 [00:03<00:00, 1289.67it/s]


In [14]:
# Preview the first rows only instead of rendering the whole frame.
df_rev.head(10)

Unnamed: 0,name,score,beer style,abv,cal,city,country,link,id,reviews,most_freq
0,Ivanych Zhivoe Svetloe,2.84,Zwickel/Keller/Landbier,4.5,135.0,Sukhum,Abkhazia,/beer/ivanych-zhivoe-svetloe/301247/,301247,bottle clear deep golden colour moderate white...,"[(bread, 2), (malty, 2), (clear, 2), (sweet, 2..."
1,Stara Praha Oldenburg Tiemnoye,2.52,Dunkel/Tmavý,4.7,141.0,Stara Gagra,Abkhazia,/beer/stara-praha-oldenburg-tiemnoye/334787/,334787,bottle thanks zwr cloudy amber beer allmost he...,"[(fruity, 2), (dark, 1), (lemon, 1), (brown, 1..."
2,Stara Praha Oldenburg Svetloe,2.51,Pale Lager,4.7,141.0,Stara Gagra,Abkhazia,/beer/stara-praha-oldenburg-svetloe/334785/,334785,вполне обычный лагер пробова пиво когда заезжа...,"[(tart, 2), (hazy, 1), (lemon, 1), (grass, 1)]"
3,Stara Praha Oldenburg Jantarnoye,2.49,Amber Lager/Vienna,4.5,135.0,Stara Gagra,Abkhazia,/beer/stara-praha-oldenburg-jantarnoye/334786/,334786,bottled thanks omhper golden colour mediumsize...,"[(clear, 1), (lemon, 1), (light, 1), (spritzy,..."
4,Assir Lager (Svetloe),2.37,Pale Lager,5.0,150.0,Alakhadzykh Village Gagra Region,Abkhazia,/beer/assir-lager-svetloe/191169/,191169,bottled zappa right tasting clear pale golden ...,"[(clear, 1), (sweet, 1)]"
5,Sukhumskoe Svetloe Pivo Klassicheskoe,2.03,Pale Lager,4.0,120.0,Sukhum,Abkhazia,/beer/sukhumskoe-svetloe-pivo-klassicheskoe/15...,152260,bottle thanks yantarcoast tip find foggy honey...,"[(light, 8), (beer, 4), (hazy, 3), (sweet, 3),..."
6,Kaltenbeer Gruri Blek,3.18,Dunkelweizen,5.0,150.0,Durrës,Albania,/beer/kaltenbeer-gruri-blek/440920/,440920,nice hefeweizen proper ingredients used eg wey...,"[(beer, 2), (medium, 2), (light, 2), (caramel,..."
7,Brauhaus Rose,3.10,Amber Lager/Vienna,6.5,195.0,Tirana,Albania,/beer/brauhaus-rose/283953/,283953,tap brewpub first favourite like really nice r...,"[(beer, 2), (amber, 2), (deep, 2), (white, 2),..."
8,Birra Puka,3.04,Zwickel/Keller/Landbier,5.2,156.0,Pukë,Albania,/beer/birra-puka/436987/,436987,different sorts interesting creamy quite cloud...,"[(creamy, 1), (cloudy, 1)]"
9,Kaltenbeer Terminator Dark Double Malt,3.04,Dunkler Bock,5.5,165.0,Durrës,Albania,/beer/kaltenbeer-terminator-dark-double-malt/4...,440908,tap hotel albion brewpub durres poured hazy co...,"[(medium, 2), (caramel, 2), (copper, 1), (hazy..."


In [33]:
# Index labels of the rows in which no vocabulary word was found.
toDrop = df_rev.index[df_rev['most_freq'].apply(len) == 0].tolist()

In [35]:
# `toDrop` holds index labels, so drop by label directly. Looking them up with
# `df_rev.index[toDrop]` would treat them as positions, which only gives the
# same rows while the frame still has its default 0..n-1 index.
df_rev = df_rev.drop(index=toDrop)

In [37]:
# Renumber the remaining rows from 0; drop=True discards the old index
# instead of keeping it as a column.
df_rev = df_rev.reset_index(drop=True)

In [40]:
# (rows, columns) left after removing the reviews without vocabulary words.
df_rev.shape

(4872, 11)

In [46]:
# Inspect the frequent-word entry of the first row.
# NOTE(review): the saved output is a single tuple, whereas the cells above store
# a list of tuples per row, and this cell's execution count is out of order —
# the output is possibly stale; re-run from a fresh kernel to confirm.
df_rev.most_freq[0]

('bread', 2)

In [42]:
# Keep only the identifying columns and the frequent words; everything
# listed here is removed from the export frame.
columns_to_remove = ['score', 'beer style', 'abv', 'cal', 'city', 'country', 'link', 'reviews']
df_most_freq = df_rev.drop(columns=columns_to_remove)

In [44]:
# Persist the result. With the default arguments the index is written as an
# extra unnamed first column.
# NOTE(review): the `most_freq` lists are presumably written as their string
# representation and would need parsing when read back — confirm with the consumer.
df_most_freq.to_csv('freq_desc_words.csv')