In [2]:
import matplotlib.pyplot as plt
import numpy as np
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")

import spacy

### Task A:

In [4]:
df_reviews = pd.read_csv("Comments.csv")

In [5]:
df_reviews

Unnamed: 0.1,Unnamed: 0,product_name,user_rating,product_review
0,0,Carlton Cold,2.01,looks like beer smells and tastes like urine w...
1,1,Carlton Cold,2.23,september 2008: 375 ml clear bottle courtesy o...
2,2,Carlton Cold,1.02,i recently reviewed victoria bitter which i dr...
3,3,Carlton Cold,2.95,carlton cold is filtered below zero degrees ce...
4,4,Carlton Cold,1.62,format: a standard clear 355ml bottle with tha...
...,...,...,...,...
7194,7207,Sculpin,4.08,smells great with an exquisite finish. with a ...
7195,7208,Bar Fly,4.57,yesterdays meandering around through the bottl...
7196,7209,Bar Fly,4.29,the smoky aroma is stronger than it follows on...
7197,7210,Bar Fly,4.16,midnight black body topped with a dense creamy...


### Task B :

In [6]:
# Download NLTK stopwords (you only need to do this once)
#nltk.download('stopwords')

# Extract the comment column
comments = df_reviews[df_reviews['product_review'].apply(lambda x: isinstance(x, str))]['product_review']

# Get the NLTK English stop words
stop_words = set(stopwords.words('english'))

# Tokenize and preprocess the comments (remove punctuation, convert to lowercase, and remove stop words)
def preprocesstext(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = re.findall(r'\b\w+\b', text.lower())  # Tokenize and convert to lowercase
    nswords = [word for word in words if word not in stop_words]
    return list(set(nswords))

# Tokenize and preprocess the comments, removing stop words
words_nostop = []
for comment in comments:
    words = preprocesstext(comment)
    words_nostop.extend(words)

# Calculate word frequencies
words_nostop_freq = pd.Series(words_nostop).value_counts()

wnf_df = pd.DataFrame(words_nostop_freq)
wnf_df = wnf_df.reset_index()
wnf_df = wnf_df.rename(columns={'index': 'words', 0: 'frequency'})

wnf_df.to_csv('review_words.csv')

In [7]:
import pandas as pd
file=pd.read_csv("beer attributes.csv")

In [8]:
file.head()

Unnamed: 0,attribute,frequency
0,light,2029
1,carbonation,1835
2,sweet,1669
3,malt,1654
4,white,1469


In [9]:
attributes=file[file["frequency"]>=700]["attribute"]

In [10]:
print("Available Attributes:")

for idx, attr in enumerate(attributes):
    print(f"{idx+1}.{attr}")

selected_attributes=[]

for i in range(3):
    while True:
         try:
              selection = int(input(f"Enter the number for attribute {i+1}:"))
              if selection>=1 and selection<=len(attributes):
                   selected_attributes.append(attributes[selection-1])
                   break
              else:
                   print("Invalid input.Please enter a valid number.")
         except ValueError:
              print("Invalid input. Please enter a number.") 

Available Attributes:
1.light
2.carbonation
3.sweet
4.malt
5.white
6.medium
7.dark
8.smooth
9.bitterness
10.clear
11.brown
12.chocolate
13.dry
14.sweetness
15.bitter
16.caramel
17.golden
18.black
19.fruit
Enter the number for attribute 1:1
Enter the number for attribute 2:2
Enter the number for attribute 3:3


In [11]:
selected_attributes

['light', 'carbonation', 'sweet']

### Task C:

In [57]:
#cosine similarity analysis
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

clean_df = df_reviews[df_reviews['product_review'].apply(lambda x: isinstance(x, str))].copy(deep=True)

clean_reviews = [" ".join(preprocesstext(review)) for review in clean_df["product_review"]]

clean_df.loc[:, "product_review"] = clean_reviews

In [58]:
def calc_similarity(reviews,attributes):
    vectorizer=CountVectorizer(vocabulary=attributes,binary=True) 
    attr_vec=vectorizer.transform([" ".join(attributes)])
    reviews_vec=vectorizer.fit_transform(reviews)
    similarity_scores=cosine_similarity(reviews_vec,attr_vec)
    return similarity_scores

similarity_scores=(calc_similarity(clean_reviews,selected_attributes))

In [59]:
results_df=clean_df.copy()
results_df.drop(["Unnamed: 0","user_rating"],axis=1,inplace=True)
results_df["similarity_score"]=similarity_scores
results_df.head()

Unnamed: 0,product_name,product_review,similarity_score
0,Carlton Cold,35 teeth like reconfigured horrible smells met...,0.0
1,Carlton Cold,write stavanger fruity pale head notes candy c...,0.57735
2,Carlton Cold,even ton loose vb faintly artificial disliked ...,0.57735
3,Carlton Cold,subtle drinking strength full cold finish crea...,0.0
4,Carlton Cold,35 even au40 damn fairly clinging overall card...,0.816497


In [60]:
results_df.groupby("product_name")["similarity_score"].mean().sort_values(ascending=False)[:3] #top3 recommendations in terms of cosine similarity

product_name
It Was All A Dream         1.000000
Wide Awake It's Morning    1.000000
Morning Wood               0.938832
Name: similarity_score, dtype: float64

### Task D:

In [61]:
#Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
#initialize the sentiment analyzer
analyser = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\megha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [62]:
#function for calculating sentiment score
def sentiment_scores(review):
    score = analyser.polarity_scores(review)['compound']
    return score

In [63]:
#performed on original data reviews
df_senti = df_reviews[df_reviews['product_review'].apply(lambda x: isinstance(x, str))].copy(deep=True)
df_senti["sentiment_score"] = df_senti["product_review"].apply(sentiment_scores)
df_senti.head()

Unnamed: 0.1,Unnamed: 0,product_name,user_rating,product_review,sentiment_score
0,0,Carlton Cold,2.01,looks like beer smells and tastes like urine w...,0.501
1,1,Carlton Cold,2.23,september 2008: 375 ml clear bottle courtesy o...,0.8658
2,2,Carlton Cold,1.02,i recently reviewed victoria bitter which i dr...,-0.996
3,3,Carlton Cold,2.95,carlton cold is filtered below zero degrees ce...,0.7845
4,4,Carlton Cold,1.62,format: a standard clear 355ml bottle with tha...,0.8535


In [64]:
results_df["sentiment_score"] = df_senti["sentiment_score"]
results_df.describe()

Unnamed: 0,similarity_score,sentiment_score
count,7199.0,7199.0
mean,0.355145,0.599568
std,0.36067,0.474119
min,0.0,-0.996
25%,0.0,0.4391
50%,0.57735,0.8126
75%,0.57735,0.94025
max,1.0,0.9993


In [65]:
results_df

Unnamed: 0,product_name,product_review,similarity_score,sentiment_score
0,Carlton Cold,35 teeth like reconfigured horrible smells met...,0.000000,0.5010
1,Carlton Cold,write stavanger fruity pale head notes candy c...,0.577350,0.8658
2,Carlton Cold,even ton loose vb faintly artificial disliked ...,0.577350,-0.9960
3,Carlton Cold,subtle drinking strength full cold finish crea...,0.000000,0.7845
4,Carlton Cold,35 even au40 damn fairly clinging overall card...,0.816497,0.8535
...,...,...,...,...
7194,Sculpin,citrus visuals prefer smell like exquisite fin...,0.000000,0.6956
7195,Bar Fly,smelled helped well fly slowly dragonne thanky...,0.816497,0.9988
7196,Bar Fly,damn second sad ways smoky truly undoubtedly s...,0.000000,0.9547
7197,Bar Fly,mild aging brown nice subtle creamy heat head ...,0.577350,0.9408


### Task E:

In [66]:
results_df["eval_score"] = results_df["similarity_score"]*results_df["sentiment_score"]
results_df.head()

Unnamed: 0,product_name,product_review,similarity_score,sentiment_score,eval_score
0,Carlton Cold,35 teeth like reconfigured horrible smells met...,0.0,0.501,0.0
1,Carlton Cold,write stavanger fruity pale head notes candy c...,0.57735,0.8658,0.49987
2,Carlton Cold,even ton loose vb faintly artificial disliked ...,0.57735,-0.996,-0.575041
3,Carlton Cold,subtle drinking strength full cold finish crea...,0.0,0.7845,0.0
4,Carlton Cold,35 even au40 damn fairly clinging overall card...,0.816497,0.8535,0.69688


In [71]:
results_df=results_df.groupby("product_name").mean().sort_values(by='eval_score',ascending=False).reset_index() #top3 recommendations in terms of cosine similarity

In [72]:
results_df

Unnamed: 0,product_name,similarity_score,sentiment_score,eval_score
0,Wide Awake It's Morning,1.000000,0.977600,0.977600
1,It Was All A Dream,1.000000,0.923100,0.923100
2,Morning Wood,0.938832,0.965533,0.910183
3,Vanilla Bean Assassin,0.816497,0.952800,0.777958
4,Pseudo Sue - Peacharine Dry-Hopped,0.816497,0.940000,0.767507
...,...,...,...,...
976,Skol,0.348462,-0.346675,-0.172083
977,Big Eye,0.288675,-0.305400,-0.176323
978,Warsteiner Premium Fresh,0.204124,-0.190850,-0.193591
979,Burleigh Bighead,0.398242,-0.345529,-0.196351


In [73]:
results_df.head(3)

Unnamed: 0,product_name,similarity_score,sentiment_score,eval_score
0,Wide Awake It's Morning,1.0,0.9776,0.9776
1,It Was All A Dream,1.0,0.9231,0.9231
2,Morning Wood,0.938832,0.965533,0.910183


### Task F:

In [74]:
nlp = spacy.load("en_core_web_md")

In [75]:
def calculate_similarity(attribute, review):
    attribute_doc = nlp(attribute)
    review_doc = nlp(review)
    return attribute_doc.similarity(review_doc)

In [78]:
product_scores = {}
for product_name, group in clean_df.groupby("product_name"):
    product_scores[product_name] = {}
    for attribute in selected_attributes:
        scores = [calculate_similarity(attribute, review) for review in group["product_review"]]
        avg_score = sum(scores) / len(scores)
        product_scores[product_name][attribute] = avg_score

In [79]:
for product, attributes in product_scores.items():
    avg_score = sum(attributes.values()) / len(attributes)
    product_scores[product]['avg'] = avg_score

In [87]:
df_word_vec = pd.DataFrame.from_dict(product_scores, orient='index')
df_word_vec.sort_values(by='avg',ascending=False)

Unnamed: 0,light,carbonation,sweet,avg
It Was All A Dream,0.714348,0.478803,0.557927,0.583693
Yellow Bus,0.695751,0.487932,0.534888,0.572857
Foggy Window,0.689283,0.492792,0.496927,0.559668
Apricot Funky Wit,0.647587,0.478003,0.525566,0.550386
Wide Awake It's Morning,0.682553,0.444270,0.524213,0.550345
...,...,...,...,...
1554,0.281195,0.145095,0.319084,0.248458
Gallo Draft,0.198033,0.074142,0.283718,0.185298
Speedway Stout - Vietnamese Coffee,0.144991,0.001808,0.355731,0.167510
Barrel Aged Christmas Bomb!,0.122759,0.022686,0.357056,0.167500


Update the Results Dataframe with Similarity score of Word vector:

In [93]:
df_word_vec=df_word_vec.reset_index().rename(columns={'index': 'product_name'})

In [100]:
results_df=pd.merge(results_df,df_word_vec[['product_name','avg']],on="product_name", how="inner")

In [101]:
results_df=results_df.rename(columns={'similarity_score': 'similarity_score_cosine','avg': 'similarity_score_wordvec'})

In [102]:
results_df

Unnamed: 0,product_name,similarity_score_cosine,sentiment_score,eval_score,similarity_score_wordvec
0,Wide Awake It's Morning,1.000000,0.977600,0.977600,0.550345
1,It Was All A Dream,1.000000,0.923100,0.923100,0.583693
2,Morning Wood,0.938832,0.965533,0.910183,0.540866
3,Vanilla Bean Assassin,0.816497,0.952800,0.777958,0.495319
4,Pseudo Sue - Peacharine Dry-Hopped,0.816497,0.940000,0.767507,0.546321
...,...,...,...,...,...
976,Skol,0.348462,-0.346675,-0.172083,0.362534
977,Big Eye,0.288675,-0.305400,-0.176323,0.273972
978,Warsteiner Premium Fresh,0.204124,-0.190850,-0.193591,0.441803
979,Burleigh Bighead,0.398242,-0.345529,-0.196351,0.454298


### Task H: Association (Lift analysis)

Top 4 attributes from Task B are: light, carbonation, sweet and malt

In [28]:
df1=df_reviews.copy()

In [29]:
# removing strop words
df1["cleaned_review_wo_stopwords"] = df1["product_review"].astype(str).apply(lambda review: preprocesstext(review))

In [30]:
# Top 4 attributes
top_4_attributes=["light","carbonation","sweet","malt"]
top_4_attributes

['light', 'carbonation', 'sweet', 'malt']

In [31]:
# Top 10 beers with highest comments
top_10_beer = df_reviews.groupby('product_name')['product_name'].count().sort_values(ascending=False)[:10]
top_10_beer = top_10_beer.index.to_list()
top_10_beer

['Oktoberfest',
 'IPA',
 'Porter',
 'Tripel',
 'Pale Ale',
 'Péché Mortel',
 'Camo Black Extra',
 'Four O Street Legal Malt Liquor',
 'Cisk XS Extra Lager',
 'Boatswain Double IPA (Twin Screw Steamer)']

In [32]:
lift_db = df1.copy()
lift_db = lift_db[['product_name','product_review','cleaned_review_wo_stopwords']].apply(pd.Series.explode).set_index(['product_name','product_review']).reset_index().drop_duplicates().copy()      
lift_db

Unnamed: 0,product_name,product_review,cleaned_review_wo_stopwords
0,Carlton Cold,looks like beer smells and tastes like urine w...,35
1,Carlton Cold,looks like beer smells and tastes like urine w...,teeth
2,Carlton Cold,looks like beer smells and tastes like urine w...,like
3,Carlton Cold,looks like beer smells and tastes like urine w...,reconfigured
4,Carlton Cold,looks like beer smells and tastes like urine w...,horrible
...,...,...,...
316786,Bar Fly,bottle at 2018 ris share. dark black pour smal...,share
316787,Bar Fly,bottle at 2018 ris share. dark black pour smal...,ashy
316788,Bar Fly,bottle at 2018 ris share. dark black pour smal...,gritty
316789,Bar Fly,bottle at 2018 ris share. dark black pour smal...,2018


In [33]:
def lift(n, a, b, ab):
    l = ((n*ab)/(a*b))
    return (l)

In [34]:
lift_values = pd.DataFrame(columns=['word_1','word_2','lift_val'])

for beer in top_10_beer:
    for attr in top_4_attributes:
        #Initialise lift to 0
        lift_db['beer'] = 0
        lift_db['attr'] = 0
        
        lift_db['beer'][lift_db['product_name'] == beer] = 1
        lift_db['attr'][lift_db['cleaned_review_wo_stopwords'] == attr] = 1
        
        c = lift_db.groupby(['product_name','product_review'])[['beer','attr']].sum().reset_index()   
        
        a = lift_db[lift_db['product_name']==beer]['product_review'].drop_duplicates().count()
        b = lift_db['attr'].sum()
        ab = c[(c['beer']>0) & (c['attr']==1)]['attr'].count()
        n = df1['product_review'].count()
        
        lift_val = lift(n, a, b, ab)
        
        lift_dict = {}
        lift_dict['word_1'] = beer
        lift_dict['word_2'] = attr
        lift_dict['lift_val'] = lift_val
        
        lift_values = lift_values.append(lift_dict, ignore_index=True)

In [35]:
similarity = pd.crosstab(lift_values['word_1'], lift_values['word_2'], lift_values['lift_val'], aggfunc=np.mean,rownames=['Beer'], colnames=['Attribute'])
similarity

Attribute,carbonation,light,malt,sweet
Beer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Boatswain Double IPA (Twin Screw Steamer),0.245198,0.443507,1.360972,0.808755
Camo Black Extra,0.490395,0.66526,1.088778,1.07834
Cisk XS Extra Lager,1.716383,1.33052,1.905361,2.695851
Four O Street Legal Malt Liquor,0.98079,1.108767,2.994139,2.695851
IPA,1.144255,0.887013,1.451704,0.629032
Oktoberfest,0.888263,1.004166,1.561267,1.790452
Pale Ale,0.923097,0.417418,1.537098,0.507454
Porter,1.217533,1.223467,1.20141,1.041156
Péché Mortel,1.384645,1.669672,1.537098,1.268636
Tripel,1.445375,0.746959,0.916866,1.135095
