In [37]:
import matplotlib.pyplot as plt
import numpy as np
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")

import spacy

### Task A:

In [38]:
df_reviews = pd.read_csv("Comments.csv")

In [39]:
df_reviews

Unnamed: 0.1,Unnamed: 0,product_name,user_rating,product_review
0,0,Carlton Cold,2.01,looks like beer smells and tastes like urine w...
1,1,Carlton Cold,2.23,september 2008: 375 ml clear bottle courtesy o...
2,2,Carlton Cold,1.02,i recently reviewed victoria bitter which i dr...
3,3,Carlton Cold,2.95,carlton cold is filtered below zero degrees ce...
4,4,Carlton Cold,1.62,format: a standard clear 355ml bottle with tha...
...,...,...,...,...
7207,7207,Sculpin,4.08,smells great with an exquisite finish. with a ...
7208,7208,Bar Fly,4.57,yesterdays meandering around through the bottl...
7209,7209,Bar Fly,4.29,the smoky aroma is stronger than it follows on...
7210,7210,Bar Fly,4.16,midnight black body topped with a dense creamy...


In [72]:
test = df_reviews.groupby('product_name').nunique().reset_index()
test[test['user_rating'] >=5]['user_rating'].sum()

6283

### Task B :

In [40]:
# Download NLTK stopwords (you only need to do this once)
#nltk.download('stopwords')

# Extract the comment column
comments = df_reviews[df_reviews['product_review'].apply(lambda x: isinstance(x, str))]['product_review']

# Get the NLTK English stop words
stop_words = set(stopwords.words('english'))

# Tokenize and preprocess the comments (remove punctuation, convert to lowercase, and remove stop words)
def preprocesstext(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = re.findall(r'\b\w+\b', text.lower())  # Tokenize and convert to lowercase
    nswords = [word for word in words if word not in stop_words]
    return list(set(nswords))

# Tokenize and preprocess the comments, removing stop words
words_nostop = []
for comment in comments:
    words = preprocesstext(comment)
    words_nostop.extend(words)

# Calculate word frequencies
words_nostop_freq = pd.Series(words_nostop).value_counts()

wnf_df = pd.DataFrame(words_nostop_freq)
wnf_df = wnf_df.reset_index()
wnf_df = wnf_df.rename(columns={'index': 'words', 0: 'frequency'})

wnf_df.to_csv('review_words.csv')

In [41]:
import pandas as pd
file=pd.read_csv("beer attributes.csv")

In [42]:
file.head()

Unnamed: 0,attribute,frequency
0,light,2029
1,carbonation,1835
2,sweet,1669
3,malt,1654
4,white,1469


In [43]:
attributes=file[file["frequency"]>=700]["attribute"]

In [44]:
print("Available Attributes:")

for idx, attr in enumerate(attributes):
    print(f"{idx+1}.{attr}")

Available Attributes:
1.light
2.carbonation
3.sweet
4.malt
5.white
6.medium
7.dark
8.smooth
9.bitterness
10.clear
11.brown
12.chocolate
13.dry
14.sweetness
15.bitter
16.caramel
17.golden
18.black
19.fruit


In [46]:
selected_attributes = ['light', 'carbonation', 'sweet']
selected_attributes

['light', 'carbonation', 'sweet']

### Task C:

In [11]:
#cosine similarity analysis
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

clean_df = df_reviews[df_reviews['product_review'].apply(lambda x: isinstance(x, str))].copy(deep=True)

clean_reviews = [" ".join(preprocesstext(review)) for review in clean_df["product_review"]]

clean_df.loc[:, "product_review"] = clean_reviews

In [12]:
def calc_similarity(reviews,attributes):
    vectorizer=CountVectorizer(vocabulary=attributes,binary=True) 
    attr_vec=vectorizer.transform([" ".join(attributes)])
    reviews_vec=vectorizer.fit_transform(reviews)
    similarity_scores=cosine_similarity(reviews_vec,attr_vec)
    return similarity_scores

similarity_scores=(calc_similarity(clean_reviews,selected_attributes))

In [51]:
def attr_similarity(reviews, attributes):
    vectorizer=CountVectorizer(vocabulary=attributes,binary=True)
    review_dtm=vectorizer.fit_transform(reviews)
    attr_dtm=vectorizer.transform(attributes)
    indv_sim_table=cosine_similarity(review_dtm,attr_dtm)
    return indv_sim_table

sim_table=attr_similarity(clean_reviews,selected_attributes)
results_df2=clean_df.copy(deep=True)
results_df2.drop(["Unnamed: 0","user_rating"],inplace=True,axis=1)
results_df2.loc[:,selected_attributes]=sim_table

results_df2.drop("product_review",inplace=True,axis=1)
bow_df=results_df2.groupby("product_name").agg(np.mean)

bow_df["avg_cosine_sim_score"]=bow_df.iloc[:,0:3].mean(axis=1)
bow_df = bow_df.reset_index()
bow_df.sort_values(by = 'avg_cosine_sim_score', ascending=False)

Unnamed: 0,product_name,light,carbonation,sweet,avg_cosine_sim_score
483,It Was All A Dream,0.57735,0.577350,0.577350,0.577350
958,Wide Awake It's Morning,0.57735,0.577350,0.577350,0.577350
617,Morning Wood,0.38490,0.620602,0.620602,0.542035
794,Scaled Way Up,0.00000,0.707107,0.707107,0.471405
929,Vanilla Bean Assassin,0.00000,0.707107,0.707107,0.471405
...,...,...,...,...,...
312,Double Dry Hopped Congress Street,0.00000,0.000000,0.000000,0.000000
127,Black Tuesday - Rum Barrel-Aged,0.00000,0.000000,0.000000,0.000000
831,Speedway Stout - Vietnamese Coffee,0.00000,0.000000,0.000000,0.000000
586,Midnight Still,0.00000,0.000000,0.000000,0.000000


In [13]:
results_df=clean_df.copy()
results_df.drop(["Unnamed: 0","user_rating"],axis=1,inplace=True)
results_df["similarity_score"]=similarity_scores
results_df.head()

Unnamed: 0,product_name,product_review,similarity_score
0,Carlton Cold,like smells heavy looks years edge ba deserved...,0.0
1,Carlton Cold,pub citrus bar notes 2008 mouthfeel write undr...,0.57735
2,Carlton Cold,lager recently artificial 355ml whilst complex...,0.57735
3,Carlton Cold,easy drinking degrees zero bitterness dry prov...,0.0
4,Carlton Cold,pops advertising swirling rare clinging 355ml ...,0.816497


In [14]:
results_df.groupby("product_name")["similarity_score"].mean().sort_values(ascending=False)[:3] #top3 recommendations in terms of cosine similarity

product_name
It Was All A Dream         1.000000
Wide Awake It's Morning    1.000000
Morning Wood               0.938832
Name: similarity_score, dtype: float64

### Task D:

In [15]:
#Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
#initialize the sentiment analyzer
analyser = SentimentIntensityAnalyzer()
for word in attributes[:10] : 
  analyser.lexicon[word] = 0

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/milindbhatia/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:
#function for calculating sentiment score
def sentiment_scores(review):
    score = analyser.polarity_scores(review)['compound']
    return score

In [17]:
#performed on original data reviews
df_senti = df_reviews[df_reviews['product_review'].apply(lambda x: isinstance(x, str))].copy(deep=True)
df_senti["sentiment_score"] = df_senti["product_review"].apply(sentiment_scores)
df_senti.head()

Unnamed: 0.1,Unnamed: 0,product_name,user_rating,product_review,sentiment_score
0,0,Carlton Cold,2.01,looks like beer smells and tastes like urine w...,0.501
1,1,Carlton Cold,2.23,september 2008: 375 ml clear bottle courtesy o...,0.8658
2,2,Carlton Cold,1.02,i recently reviewed victoria bitter which i dr...,-0.996
3,3,Carlton Cold,2.95,carlton cold is filtered below zero degrees ce...,0.7845
4,4,Carlton Cold,1.62,format: a standard clear 355ml bottle with tha...,0.8535


In [18]:
results_df["sentiment_score"] = df_senti["sentiment_score"]
results_df.describe()

Unnamed: 0,similarity_score,sentiment_score
count,7199.0,7199.0
mean,0.355145,0.599568
std,0.36067,0.474119
min,0.0,-0.996
25%,0.0,0.4391
50%,0.57735,0.8126
75%,0.57735,0.94025
max,1.0,0.9993


In [19]:
results_df

Unnamed: 0,product_name,product_review,similarity_score,sentiment_score
0,Carlton Cold,like smells heavy looks years edge ba deserved...,0.000000,0.5010
1,Carlton Cold,pub citrus bar notes 2008 mouthfeel write undr...,0.577350,0.8658
2,Carlton Cold,lager recently artificial 355ml whilst complex...,0.577350,-0.9960
3,Carlton Cold,easy drinking degrees zero bitterness dry prov...,0.000000,0.7845
4,Carlton Cold,pops advertising swirling rare clinging 355ml ...,0.816497,0.8535
...,...,...,...,...
7207,Sculpin,like taste hops exquisite smells citrus strong...,0.000000,0.6956
7208,Bar Fly,lovely foamed tongue excellent warming york sm...,0.816497,0.9988
7209,Bar Fly,ways still game copy follows different tongue ...,0.000000,0.9547
7210,Bar Fly,cocoa palate heavy grain sweetness brown chewy...,0.577350,0.9408


### Task E:

In [20]:
results_df["eval_score"] = results_df["similarity_score"]*results_df["sentiment_score"]
results_df.head()

Unnamed: 0,product_name,product_review,similarity_score,sentiment_score,eval_score
0,Carlton Cold,like smells heavy looks years edge ba deserved...,0.0,0.501,0.0
1,Carlton Cold,pub citrus bar notes 2008 mouthfeel write undr...,0.57735,0.8658,0.49987
2,Carlton Cold,lager recently artificial 355ml whilst complex...,0.57735,-0.996,-0.575041
3,Carlton Cold,easy drinking degrees zero bitterness dry prov...,0.0,0.7845,0.0
4,Carlton Cold,pops advertising swirling rare clinging 355ml ...,0.816497,0.8535,0.69688


In [21]:
results_df=results_df.groupby("product_name").mean().sort_values(by='eval_score',ascending=False).reset_index() #top3 recommendations in terms of cosine similarity

In [22]:
results_df

Unnamed: 0,product_name,similarity_score,sentiment_score,eval_score
0,Wide Awake It's Morning,1.000000,0.977600,0.977600
1,It Was All A Dream,1.000000,0.923100,0.923100
2,Morning Wood,0.938832,0.965533,0.910183
3,Vanilla Bean Assassin,0.816497,0.952800,0.777958
4,Pseudo Sue - Peacharine Dry-Hopped,0.816497,0.940000,0.767507
...,...,...,...,...
976,Skol,0.348462,-0.346675,-0.172083
977,Big Eye,0.288675,-0.305400,-0.176323
978,Warsteiner Premium Fresh,0.204124,-0.190850,-0.193591
979,Burleigh Bighead,0.398242,-0.345529,-0.196351


In [23]:
results_df.head(3)

Unnamed: 0,product_name,similarity_score,sentiment_score,eval_score
0,Wide Awake It's Morning,1.0,0.9776,0.9776
1,It Was All A Dream,1.0,0.9231,0.9231
2,Morning Wood,0.938832,0.965533,0.910183


### Task F:

In [24]:
# Download NLTK stopwords (you only need to do this once)
#nltk.download('stopwords')
# Get the NLTK English stop words
stop_words = set(stopwords.words('english'))

wv_df = df_reviews.copy()

# Tokenize and preprocess the comments (remove punctuation, convert to lowercase, and remove stop words)
def preprocesstext(text):
    if isinstance(text, str):  # Check if 'text' is a string
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        words = re.findall(r'\b\w+\b', text.lower())  # Tokenize and convert to lowercase
        nswords = [word for word in words if word not in stop_words]
        return ' '.join(nswords)
    else:
        return ''

wv_df['review_processed'] = wv_df['product_review'].apply(preprocesstext)
wv_df = wv_df[wv_df['review_processed'] != '']
wv_df

Unnamed: 0.1,Unnamed: 0,product_name,user_rating,product_review,review_processed
0,0,Carlton Cold,2.01,looks like beer smells and tastes like urine w...,looks like beer smells tastes like urine metal...
1,1,Carlton Cold,2.23,september 2008: 375 ml clear bottle courtesy o...,september 2008 375 ml clear bottle courtesy ca...
2,2,Carlton Cold,1.02,i recently reviewed victoria bitter which i dr...,recently reviewed victoria bitter drank old ti...
3,3,Carlton Cold,2.95,carlton cold is filtered below zero degrees ce...,carlton cold filtered zero degrees celsius cre...
4,4,Carlton Cold,1.62,format: a standard clear 355ml bottle with tha...,format standard clear 355ml bottle old carlton...
...,...,...,...,...,...
7207,7207,Sculpin,4.08,smells great with an exquisite finish. with a ...,smells great exquisite finish citrus like smel...
7208,7208,Bar Fly,4.57,yesterdays meandering around through the bottl...,yesterdays meandering around bottle backlog to...
7209,7209,Bar Fly,4.29,the smoky aroma is stronger than it follows on...,smoky aroma stronger follows tongue damn good ...
7210,7210,Bar Fly,4.16,midnight black body topped with a dense creamy...,midnight black body topped dense creamy light ...


In [25]:
nlp = spacy.load("en_core_web_md")

In [26]:
def calculate_similarity(attribute, review):
    attribute_doc = nlp(attribute)
    review_doc = nlp(review)
    return attribute_doc.similarity(review_doc)

In [27]:
att_str = " ".join(selected_attributes)
att_str

'light carbonation sweet'

In [28]:
product_scores_ind = {}

for product_name, group in wv_df.groupby("product_name"):
    product_scores_ind[product_name] = {}
    scores_i = [calculate_similarity(att_str, review) for review in group["product_review"]]
    avg_score_i = sum(scores_i) / len(scores_i)
    product_scores_ind[product_name] = avg_score_i

df_wv_ind = pd.DataFrame(list(product_scores_ind.items()), columns=['product_name', 'sim_score_wb'])
df_wv_ind.sort_values(by='sim_score_wb',ascending=False)

Unnamed: 0,product_name,sim_score_wb
483,It Was All A Dream,0.789814
774,Saison Du Fermier,0.753576
484,JJJuiceee Machine,0.752772
958,Wide Awake It's Morning,0.750309
587,Miles To Go Before I Sleep,0.748864
...,...,...
853,Sunday Brunch - Bourbon Barrel-Aged,0.345906
383,Gallo Draft,0.302377
301,Dorothy (Wine Barrel Aged),0.299244
831,Speedway Stout - Vietnamese Coffee,0.149222


In [29]:
product_scores = {}
for product_name, group in wv_df.groupby("product_name"):
    product_scores[product_name] = {}
    for attribute in selected_attributes:
        scores = [calculate_similarity(attribute, review) for review in group["product_review"]]
        avg_score = sum(scores) / len(scores)
        product_scores[product_name][attribute] = avg_score

In [30]:
for product, attributes in product_scores.items():
    avg_score = sum(attributes.values()) / len(attributes)
    product_scores[product]['avg'] = avg_score

In [31]:
df_word_vec = pd.DataFrame.from_dict(product_scores, orient='index')
df_word_vec.sort_values(by='avg',ascending=False)

Unnamed: 0,light,carbonation,sweet,avg
It Was All A Dream,0.671358,0.526672,0.446191,0.548074
Saison Du Fermier,0.614872,0.475878,0.493253,0.528001
JJJuiceee Machine,0.640558,0.539010,0.380973,0.520181
Miles To Go Before I Sleep,0.636033,0.510247,0.411062,0.519114
Yellow Bus,0.621678,0.591516,0.334177,0.515791
...,...,...,...,...
Sunday Brunch - Bourbon Barrel-Aged,0.314484,0.260993,0.130953,0.235477
Dorothy (Wine Barrel Aged),0.201893,-0.009890,0.488343,0.226782
Gallo Draft,0.250675,0.164469,0.223288,0.212811
Speedway Stout - Vietnamese Coffee,0.047414,-0.083530,0.410913,0.124932


Update the Results Dataframe with Similarity score of Word vector:

In [32]:
df_word_vec=df_word_vec.reset_index().rename(columns={'index': 'product_name'})

In [33]:
results_df=pd.merge(results_df,df_wv_ind[['product_name','sim_score_wb']],on="product_name", how="inner")

In [34]:
results_df=results_df.rename(columns={'similarity_score': 'similarity_score_cosine','sim_score_wb': 'similarity_score_wordvec'})

In [35]:
results_df

Unnamed: 0,product_name,similarity_score_cosine,sentiment_score,eval_score,similarity_score_wordvec
0,Wide Awake It's Morning,1.000000,0.977600,0.977600,0.750309
1,It Was All A Dream,1.000000,0.923100,0.923100,0.789814
2,Morning Wood,0.938832,0.965533,0.910183,0.709751
3,Vanilla Bean Assassin,0.816497,0.952800,0.777958,0.690755
4,Pseudo Sue - Peacharine Dry-Hopped,0.816497,0.940000,0.767507,0.724321
...,...,...,...,...,...
976,Skol,0.348462,-0.346675,-0.172083,0.494020
977,Big Eye,0.288675,-0.305400,-0.176323,0.498637
978,Warsteiner Premium Fresh,0.204124,-0.190850,-0.193591,0.619603
979,Burleigh Bighead,0.398242,-0.345529,-0.196351,0.650943


### Task H: Association (Lift analysis)

Top 4 attributes from Task B are: light, carbonation, sweet and malt

In [28]:
df1=df_reviews.copy()

In [29]:
# removing strop words
df1["cleaned_review_wo_stopwords"] = df1["product_review"].astype(str).apply(lambda review: preprocesstext(review))

In [30]:
# Top 4 attributes
top_4_attributes=["light","carbonation","sweet","malt"]
top_4_attributes

['light', 'carbonation', 'sweet', 'malt']

In [31]:
# Top 10 beers with highest comments
top_10_beer = df_reviews.groupby('product_name')['product_name'].count().sort_values(ascending=False)[:10]
top_10_beer = top_10_beer.index.to_list()
top_10_beer

['Oktoberfest',
 'IPA',
 'Porter',
 'Tripel',
 'Pale Ale',
 'Péché Mortel',
 'Camo Black Extra',
 'Four O Street Legal Malt Liquor',
 'Cisk XS Extra Lager',
 'Boatswain Double IPA (Twin Screw Steamer)']

In [32]:
lift_db = df1.copy()
lift_db = lift_db[['product_name','product_review','cleaned_review_wo_stopwords']].apply(pd.Series.explode).set_index(['product_name','product_review']).reset_index().drop_duplicates().copy()      
lift_db

Unnamed: 0,product_name,product_review,cleaned_review_wo_stopwords
0,Carlton Cold,looks like beer smells and tastes like urine w...,35
1,Carlton Cold,looks like beer smells and tastes like urine w...,teeth
2,Carlton Cold,looks like beer smells and tastes like urine w...,like
3,Carlton Cold,looks like beer smells and tastes like urine w...,reconfigured
4,Carlton Cold,looks like beer smells and tastes like urine w...,horrible
...,...,...,...
316786,Bar Fly,bottle at 2018 ris share. dark black pour smal...,share
316787,Bar Fly,bottle at 2018 ris share. dark black pour smal...,ashy
316788,Bar Fly,bottle at 2018 ris share. dark black pour smal...,gritty
316789,Bar Fly,bottle at 2018 ris share. dark black pour smal...,2018


In [33]:
def lift(n, a, b, ab):
    l = ((n*ab)/(a*b))
    return (l)

In [34]:
lift_values = pd.DataFrame(columns=['word_1','word_2','lift_val'])

for beer in top_10_beer:
    for attr in top_4_attributes:
        #Initialise lift to 0
        lift_db['beer'] = 0
        lift_db['attr'] = 0
        
        lift_db['beer'][lift_db['product_name'] == beer] = 1
        lift_db['attr'][lift_db['cleaned_review_wo_stopwords'] == attr] = 1
        
        c = lift_db.groupby(['product_name','product_review'])[['beer','attr']].sum().reset_index()   
        
        a = lift_db[lift_db['product_name']==beer]['product_review'].drop_duplicates().count()
        b = lift_db['attr'].sum()
        ab = c[(c['beer']>0) & (c['attr']==1)]['attr'].count()
        n = df1['product_review'].count()
        
        lift_val = lift(n, a, b, ab)
        
        lift_dict = {}
        lift_dict['word_1'] = beer
        lift_dict['word_2'] = attr
        lift_dict['lift_val'] = lift_val
        
        lift_values = lift_values.append(lift_dict, ignore_index=True)

In [35]:
similarity = pd.crosstab(lift_values['word_1'], lift_values['word_2'], lift_values['lift_val'], aggfunc=np.mean,rownames=['Beer'], colnames=['Attribute'])
similarity

Attribute,carbonation,light,malt,sweet
Beer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Boatswain Double IPA (Twin Screw Steamer),0.245198,0.443507,1.360972,0.808755
Camo Black Extra,0.490395,0.66526,1.088778,1.07834
Cisk XS Extra Lager,1.716383,1.33052,1.905361,2.695851
Four O Street Legal Malt Liquor,0.98079,1.108767,2.994139,2.695851
IPA,1.144255,0.887013,1.451704,0.629032
Oktoberfest,0.888263,1.004166,1.561267,1.790452
Pale Ale,0.923097,0.417418,1.537098,0.507454
Porter,1.217533,1.223467,1.20141,1.041156
Péché Mortel,1.384645,1.669672,1.537098,1.268636
Tripel,1.445375,0.746959,0.916866,1.135095
