# Задание 4. Реализовать рекомендательную систему по подбору пива на основе  датасета «BeerAdvocate»

## Downloading and unzipping data

In [1]:
# Change directory for kaggle JSON
import os
os.chdir("/content/drive/MyDrive/Colab Notebooks")

In [2]:
# Create a kaggle folder
!mkdir -p ~/.kaggle

# Copy kaggle.json to created folder
!cp kaggle.json ~/.kaggle/

In [3]:
os.chdir("/content")

In [4]:
# Permission for the json to act
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the required dataset
!kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate

Downloading 1-5-million-beer-reviews-from-beer-advocate.zip to /content
 92% 30.0M/32.5M [00:02<00:00, 17.2MB/s]
100% 32.5M/32.5M [00:02<00:00, 12.5MB/s]


In [6]:
# Unzip our dataset
from zipfile import ZipFile
from tqdm import tqdm


file_to_extract = "1-5-million-beer-reviews-from-beer-advocate.zip"

# Open your .zip file
with ZipFile(file=file_to_extract) as zip_file:

    # Loop over each file and extract them
    for file in tqdm(iterable=zip_file.namelist(), total=len(zip_file.namelist())):
        zip_file.extract(member=file)

100%|██████████| 1/1 [00:01<00:00,  1.13s/it]


## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import time
import warnings
from typing import List
warnings.filterwarnings("ignore")

In [2]:
start_time = time.time()
df = pd.read_csv("/content/beer_reviews.csv", index_col="index")
end_time = time.time()

print(f"Elapsed time to read csv-file is: {end_time - start_time}")
df.head()

Elapsed time to read csv-file is: 3.5783870220184326


Unnamed: 0_level_0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


Let's look at the [dataset's](https://www.kaggle.com/datasets/thedevastator/1-5-million-beer-reviews-from-beer-advocate/data) columns description from Kaggle

|Column name| Description                                         |
|---|---------------|
|brewery_name| The name of the brewery that made the beer. (String) |
|review_time|The date and time of the review. (String)|
|review_overall|The reviewer's overall rating of the beer on a scale of 1 to 5. (Float)|
|review_aroma|The reviewer's rating of the beer's aroma on a scale of 1 to 5. (Float)|
|review_appearance|The reviewer's rating of the beer's appearance on a scale of 1 to 5. (Float)|
|review_profilename|The reviewer's username. (String)|
|beer_style|The style of beer. (String)|
|review_palate|The reviewer's rating of the beer's palate on a scale of 1 to 5. (Float)|
|review_taste|The reviewer's rating of the beer's taste on a scale of 1 to 5. (Float)|
|beer_name|The name of the beer. (String)|
|beer_abv|The alcohol by volume of the beer. (Float)|
|brewery_id||
|beer_beerid||

Checking for null values

In [3]:
df.isna().sum()

brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

Let's choose the main features

In [4]:
df_1 = df[["beer_name", "review_profilename", "review_overall"]]
df_1.head()

Unnamed: 0_level_0,beer_name,review_profilename,review_overall
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Sausa Weizen,stcules,1.5
1,Red Moon,stcules,3.0
2,Black Horse Black Beer,stcules,3.0
3,Sausa Pils,stcules,3.0
4,Cauldron DIPA,johnmichaelsen,4.0


Drop for duplicacted and null values

In [5]:
df_1.drop_duplicates(inplace=True)
df_1.isna().sum()
df_1.dropna(inplace=True)

Let's look at the overall number of reviews for each beer drink. I used grouping by `beer name`

In [6]:
beer_rating_count = (
    df_1.groupby(by=["beer_name"])["review_overall"]
    .count()
    .reset_index()
    .rename(columns={"review_overall": "review_overall_count"})
    )

Let's join the obtained dataframe to the original one

In [7]:
rating_with_review_overall_count = df_1.merge(beer_rating_count, on="beer_name", how="left")
rating_with_review_overall_count.head(10)

Unnamed: 0,beer_name,review_profilename,review_overall,review_overall_count
0,Sausa Weizen,stcules,1.5,1
1,Red Moon,stcules,3.0,1
2,Black Horse Black Beer,stcules,3.0,1
3,Sausa Pils,stcules,3.0,1
4,Cauldron DIPA,johnmichaelsen,4.0,1
5,Caldera Ginger Beer,oline73,3.0,9
6,Caldera Ginger Beer,Reidrover,3.5,9
7,Caldera Ginger Beer,alpinebryant,3.0,9
8,Caldera Ginger Beer,LordAdmNelson,4.0,9
9,Caldera Ginger Beer,augustgarage,4.5,9


We will choose only those beer products for which we have more than 100 evaluations for a more accurate prediction

P.S. In this example I used 100 as minimum of overviews count because of the memory troubles

In [8]:
rating_popular_beer = rating_with_review_overall_count[rating_with_review_overall_count["review_overall_count"] > 100]
rating_popular_beer.shape

(1132678, 4)

In [9]:
beers_list = rating_popular_beer["beer_name"].unique()
beers_list

array(['Amstel Light', 'Caldera Pale Ale', 'Caldera IPA', ...,
       'Alaskan IPA', 'Alaskan White', "Drake's Drakonic Imperial Stout"],
      dtype=object)

There could be a scenario where the user either types in the beer's name incorrectly or completely forgets what it is called. The following function solves this problem

In [10]:
!pip install fuzzywuzzy



In [11]:
from fuzzywuzzy import process

In [12]:
def get_beers_list(beers: List[str]) -> List[str]:
    """
    Filters a list of beers based on their presence in the interaction matrix or closest match.

    Parameters:
    - beers (List[str]): List of beers to filter.
    - interaction_matrix (DataFrame): DataFrame containing the interaction matrix.

    Returns:
    - user_beers (List[str]): Filtered list of beers, considering their presence or closest match in the interaction matrix.
    """
    user_beers = []
    for beer in beers:
        if beer in beers_list:
            user_beers.append(beer)
        else:
            closest_match = process.extractOne(beer, beers_list)[0]
            user_beers.append(closest_match)
    return user_beers

### SVD Model

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
rating_popular_beer.head()

Unnamed: 0,beer_name,review_profilename,review_overall,review_overall_count
10,Amstel Light,fodeeoz,3.0,464
18,Amstel Light,jdhilt,2.5,464
30,Amstel Light,xXTequila,3.0,464
40,Amstel Light,Brent,3.0,464
63,Caldera Pale Ale,NJpadreFan,4.0,162


Let's encode `beer_name` and `review_profilename` columns

In [15]:
user_encoder = LabelEncoder()
beer_encoder = LabelEncoder()

In [16]:
rating_popular_beer["beer_id"] = beer_encoder.fit_transform(rating_popular_beer["beer_name"])
rating_popular_beer["user_id"] = user_encoder.fit_transform(rating_popular_beer["review_profilename"])

In [17]:
from scipy.linalg import svd

Make the ratings matrix by assigning columns to users and rows to beers.

In [18]:
ratings_mat = np.ndarray(
    shape=(np.max(rating_popular_beer["beer_id"]), np.max(rating_popular_beer["user_id"])),
    dtype=np.uint8
    )

ratings_mat[rating_popular_beer["beer_id"].values-1,
            rating_popular_beer["user_id"].values-1] = rating_popular_beer["review_overall"].values

I had problems with memory overflow. That's why i used CUDA and Google Colab

Let's normalize matrix

In [None]:
import cupy as cp

In [19]:
# Moving data to the GPU
ratings_mat_gpu = cp.asarray(ratings_mat)

# Вычисляем средние значения по строкам на GPU
row_means = cp.mean(ratings_mat_gpu, axis=1, keepdims=True)

In [20]:
# Normalizing на GPU
normalized_mat_gpu = ratings_mat_gpu - row_means

In [21]:
from cupy.linalg import svd

The essence of SVD is that it decomposes a matrix of any shape into a product of 3 matrices with nice mathematical properties:

$ A = USV^T $

Compute SVD

In [22]:
# Вычисляем A на GPU
A_gpu = normalized_mat_gpu.T / cp.sqrt(ratings_mat.shape[0] - 1)

# Вычисляем SVD на GPU
U_gpu, S_gpu, V_gpu = svd(A_gpu)

# Перемещаем результаты на CPU, если это необходимо
U = U_gpu
S = S_gpu
V = V_gpu

Calculate cosine similarity, sort by most similar and return the top N.

In [23]:
def top_cosine_similarity(data: np.ndarray, beer_id: int, top_n: int = 10) -> tuple:

    """
    Returns the top N cosine similarities between the specified beer and other beers in the dataset.

    Parameters:
    - data (np.ndarray): The dataset containing beer vectors.
    - beer_id (int): The ID of the beer for which to find similarities.
    - top_n (int): The number of top similarities to return. Default is 10.

    Returns:
    - tuple: A tuple containing two elements:
        1. An array of indices representing the top N similar beers.
        2. An array of cosine similarity scores corresponding to the top N similar beers.
    """
    index = beer_id - 1
    beer_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(beer_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n], similarity

In [25]:
k = 50
top_n = 8

# Selects the first k columns of the transposed V matrix
sliced = V.T[:, :k]

Testing

In [26]:
user_beers = ["Kronenbourg 1664", "hoegaarden", "Baltika #3 Classic", "Heineken"]
# user_beers = ["Budweiser"]
user_beers = get_beers_list(user_beers)

In [27]:
# Create an empty list to hold all recommendations
all_recommendations = []

# Create a list to store pairs (beer index, similarity value)
all_similarity_pairs = []

# Get top 7 recommendations for every beer on the list
for beer in user_beers:
    beer_id = rating_popular_beer[rating_popular_beer["beer_name"] == beer]["beer_id"].values[0]
    indexes, similarities = top_cosine_similarity(sliced, beer_id, top_n)

    recommendations = cp.asnumpy(indexes)
    all_recommendations.extend(recommendations)
    # Create an array of tuples (beer index, similarity value) for the current beer
    beer_similarity_pairs = list(zip(indexes, similarities))
    all_similarity_pairs.extend(beer_similarity_pairs)


# Remove duplicate beers from the list of recommendations
unique_recommendations = list(set(all_recommendations))

# Sort the beers by similarity values (in descending order)
sorted_beer_similarity_pairs = sorted(all_similarity_pairs, key=lambda x: x[1], reverse=True)

print("Top 7 recommended beers:")
for i, (beer_id, similarity_score) in enumerate(sorted_beer_similarity_pairs[:7], start=1):
    beer_name = rating_popular_beer[rating_popular_beer["beer_id"] == int(beer_id)]["beer_name"].values[0]
    print(f"{i}. {beer_name} - Сходство: {similarity_score}")

Top 7 recommended beers:
1. Lectio Divina - Сходство: 0.3580298852389022
2. Bad Elmer's Porter - Сходство: 0.24096991910040866
3. Thunderhead IPA - Сходство: 0.2090101774447005
4. Carib Lager - Сходство: 0.1746065688136378
5. Gumballhead - Сходство: 0.14659831043484864
6. Corne Du Diable - Сходство: 0.13913600942260215
7. Coors - Сходство: 0.11829029586685684
