In [105]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
import re

from tqdm import tqdm
tqdm.pandas()

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

data = pd.read_csv(r'data.csv')
df = pd.DataFrame(data)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\cavit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [138]:
df.head(3)
df.describe

<bound method NDFrame.describe of       beer_ABV  beer_beerId  beer_brewerId               beer_name  \
0          5.0        47986          10325            Sausa Weizen   
1          6.2        48213          10325                Red Moon   
2          6.5        48215          10325  Black Horse Black Beer   
3          5.0        47969          10325              Sausa Pils   
4          7.7        64883           1075           Cauldron DIPA   
...        ...          ...            ...                     ...   
1606      10.5         3635             22             La Terrible   
1607      10.5         3635             22             La Terrible   
1608      10.5         3635             22             La Terrible   
1609      10.5         3635             22             La Terrible   
1610      10.5         3635             22             La Terrible   

                          beer_style  review_appearance  review_palette  \
0                         Hefeweizen              

1. Rank top 3 Breweries which produce the strongest beers?

In [106]:

# group by brewerId and calculate the average ABV for each brewery
brewery_avg_abv = df.groupby('beer_brewerId')['beer_ABV'].mean()

# sort breweries by average ABV in descending order and select the top 3
top_3_breweries = brewery_avg_abv.sort_values(ascending=False).head(3)

print("Top 3 Breweries Producing the Strongest Beers:")
print(top_3_breweries)


Top 3 Breweries Producing the Strongest Beers:
beer_brewerId
22      10.500000
694     10.100000
2724     7.643243
Name: beer_ABV, dtype: float64


2. Which year did beers enjoy the highest ratings? 

In [107]:
# convert review_time to datetime
df['review_time'] = pd.to_datetime(df['review_time'], unit='s')

# extract year from review_time
df['year'] = df['review_time'].dt.year

# group by year and calculate the average rating for each year
average_ratings_by_year = df.groupby('year')['review_overall'].mean()

# find the year with the highest average rating
highest_rated_year = average_ratings_by_year.idxmax()

print("Year with the highest average ratings for beers:", highest_rated_year)

Year with the highest average ratings for beers: 2012


3. Based on the user’s ratings which factors are important among taste, aroma, appearance, and palette?

In [108]:
# Calculate correlation matrix
correlation_matrix = df[['review_taste', 'review_aroma', 'review_appearance', 'review_palette', 'review_overall']].corr()

# Extract correlations with review_overall
correlations_with_overall = correlation_matrix['review_overall'].drop('review_overall')

# Sort correlations in descending order
sorted_correlations = correlations_with_overall.sort_values(ascending=False)

print("Correlation between each factor and overall review rating:")
print(sorted_correlations)

Correlation between each factor and overall review rating:
review_aroma         0.846201
review_taste         0.783129
review_palette       0.739925
review_appearance    0.656940
Name: review_overall, dtype: float64


so review_aroma has highest corellation which is important

4. If you were to recommend 3 beers to your friends based on this data which ones will you recommend? * need to edit

In [109]:
# assigning custom weights
weights = {'review_overall': 0.4, 'review_taste': 0.2, 'review_aroma': 0.1, 'review_appearance': 0.1, 'review_palette': 0.2}
df['weighted_rating'] = (df[list(weights.keys())] * pd.Series(weights)).sum(axis=1)

# sort beers by weighted rating in descending order
recommended_beers = df.sort_values(by='weighted_rating', ascending=False).head(3)

print("Recommended beers for my friends:")
#print(recommended_beers[['beer_name', 'weighted_rating', 'review_text']])
recommended_beers[['beer_name', 'weighted_rating', 'review_text']].head(3)

Recommended beers for my friends:


Unnamed: 0,beer_name,weighted_rating,review_text
433,Caldera IPA,5.0,12 oz can poured into duvel snifter A - pours ...
1533,T.J.'s Best Bitter,5.0,Holy crap. This beer is amazing. Wow. Holy cra...
281,Old Growth Imperial Stout,4.95,Aroma is absolutely heavenly - smoky with firm...


how the weights were calculated:

Review Overall: This factor represents the overall review rating given by users. Since it reflects the overall satisfaction with the beer, it was assigned the highest weight of 0.4.
Review Taste: Taste is a crucial aspect of beer enjoyment, so it was assigned a weight of 0.2, reflecting its importance in the overall rating.
Review Aroma: Aroma contributes significantly to the sensory experience of drinking beer, but it may be slightly less important than taste. Therefore, it was assigned a weight of 0.1.
Review Appearance: While appearance can influence the initial impression of a beer, its impact on overall enjoyment may be somewhat lower compared to taste and aroma. Hence, it was assigned a weight of 0.1.
Review Palette: Palette, which likely refers to the mouthfeel or texture of the beer, was also considered important but slightly less so compared to taste and aroma. Therefore, it was assigned a weight of 0.2.

5. Which Beer style seems to be the favorite based on reviews written by users? 

In [110]:
# taking relevant columns
reviewTextData = data[['beer_beerId','beer_name','beer_ABV','beer_style','review_overall','review_text']]

# taking higher ranked reviews only >4 (from the overall reviews column)
reviewTextData = reviewTextData.loc[reviewTextData['review_overall'] >= 4]

# resetting Index
reviewTextData.reset_index(drop=True,inplace=True)

reviewTextData.head()

Unnamed: 0,beer_beerId,beer_name,beer_ABV,beer_style,review_overall,review_text
0,64883,Cauldron DIPA,7.7,American Double / Imperial IPA,4.0,"According to the website, the style for the Ca..."
1,52159,Caldera Ginger Beer,4.7,Herbed / Spiced Beer,4.0,I'm not sure why I picked this up... I like gi...
2,52159,Caldera Ginger Beer,4.7,Herbed / Spiced Beer,4.5,Poured from a 22oz bomber into my Drie Fontein...
3,52159,Caldera Ginger Beer,4.7,Herbed / Spiced Beer,5.0,"OK, so the only reason I bought this while sho..."
4,52159,Caldera Ginger Beer,4.7,Herbed / Spiced Beer,4.0,Notes from 6/24 A: Bright golden glowing beer ...


In [111]:
reviewTextData.review_text[0]

"According to the website, the style for the Caldera Cauldron changes every year. The current release is a DIPA, which frankly is the only cauldron I'm familiar with (it was an IPA/DIPA the last time I ordered a cauldron at the horsebrass several years back). In any event... at the Horse Brass yesterday. The beer pours an orange copper color with good head retention and lacing. The nose is all hoppy IPA goodness, showcasing a huge aroma of dry citrus, pine and sandlewood. The flavor profile replicates the nose pretty closely in this West Coast all the way DIPA. This DIPA is not for the faint of heart and is a bit much even for a hophead like myslf. The finish is quite dry and hoppy, and there's barely enough sweet malt to balance and hold up the avalanche of hoppy bitterness in this beer. Mouthfeel is actually fairly light, with a long, persistentely bitter finish. Drinkability is good, with the alcohol barely noticeable in this well crafted beer. Still, this beer is so hugely hoppy/bi

In [141]:
# text preprocessing
import re

# initial text processing replacing short forms
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"it\'s", "it is", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    
    return phrase

In [142]:
# extracting text reviews and applying text preprocessing on it
preprocessed_reviews = []

for sentance in tqdm(reviewTextData['review_text'].values): # tqdm prints the status bar
    sentance = decontracted(sentance) # deconstructiong short forms
    sentance = re.sub("\S*\d\S*", "", sentance).strip() # remove words with numbers 
    
    preprocessed_reviews.append(sentance) # form sentence again

100%|██████████| 835/835 [00:00<00:00, 14306.14it/s]


In [114]:
preprocessed_reviews[0]

'According to the website, the style for the Caldera Cauldron changes every year. The current release is a DIPA, which frankly is the only cauldron I am familiar with (it was an IPA/DIPA the last time I ordered a cauldron at the horsebrass several years back). In any event... at the Horse Brass yesterday. The beer pours an orange copper color with good head retention and lacing. The nose is all hoppy IPA goodness, showcasing a huge aroma of dry citrus, pine and sandlewood. The flavor profile replicates the nose pretty closely in this West Coast all the way DIPA. This DIPA is not for the faint of heart and is a bit much even for a hophead like myslf. The finish is quite dry and hoppy, and there is barely enough sweet malt to balance and hold up the avalanche of hoppy bitterness in this beer. Mouthfeel is actually fairly light, with a long, persistentely bitter finish. Drinkability is good, with the alcohol barely noticeable in this well crafted beer. Still, this beer is so hugely hoppy/

In [115]:
# appending preprocessed reviews to the filtered dataframe
reviewTextData['preprocessed_review_text'] = preprocessed_reviews

In [143]:
# instantiating Sentiment Analyzer
sianalyzer = SentimentIntensityAnalyzer()

# loop over the 'preprocessed_review_text' column and calculate the polarity score for each review
reviewTextData['polarity_score2'] = reviewTextData['preprocessed_review_text'].progress_apply(lambda x: sianalyzer.polarity_scores(x)['compound'])

100%|██████████| 835/835 [00:00<00:00, 910.08it/s]


In [144]:
# grouping and calculate mean polarity score.
reviewTextDataGroupped = reviewTextData.groupby('beer_style')['polarity_score2'].mean()

# sort the grouped data by mean polarity score
reviewTextDataGroupped.sort_values(ascending=False)[0:5]

beer_style
Dortmunder / Export Lager    0.9826
English Porter               0.9668
American Blonde Ale          0.9659
Märzen / Oktoberfest         0.9626
Cream Ale                    0.9587
Name: polarity_score2, dtype: float64

In [146]:
# observing the top 'polarity_score2' and 'beer_beerId' associated with i
reviewTextData.loc[reviewTextData['beer_style'] == 'Dortmunder / Export Lager']
reviewTextData.loc[reviewTextData['beer_style'] == 'American Blonde Ale']

Unnamed: 0,beer_beerId,beer_name,beer_ABV,beer_style,review_overall,review_text,preprocessed_review_text,polarity_score2
225,61427,Caldera Rose Petal (Kettle Series),6.7,American Blonde Ale,4.0,A- is cloudy and light glassy goldeness S- sme...,A- is cloudy and light glassy goldeness S- sme...,0.9693
226,61427,Caldera Rose Petal (Kettle Series),6.7,American Blonde Ale,4.0,It's a beautiful beer to look at. Pours crysta...,It is a beautiful beer to look at. Pours cryst...,0.9827
810,38275,Alaskan Summer Ale,5.5,American Blonde Ale,4.0,A: Poured a straw yellow color with a 1 finger...,A: Poured a straw yellow color with a finger ...,0.9457


By observing the mean compound polarity score , we can say that the beer style "Dortmunder / Export Lager" is liked most but has only one person that likes it as much, we can instead say "English Porter" is the most famous, based on combination of polarity and higher frequency

6. How does written review compare to overall review score for the beer styles?

By observing the mean compound polarity score calculated we can get an idea how the user written review text is collaborating in calculating the overall review score.

7. How do find similar beer drinkers by using written reviews only?   