In [None]:
import time

import asyncio
import aiohttp

import sqlite3
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns

import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

from selenium import webdriver
from bs4 import BeautifulSoup

# Contents
## 1. Get URLs for each wine 
## 2. Get HTML for each wine 
## 3. Loading the data (start executing here after imports)
## 4. Data exploration and visualization
## 5. Rank wines based on rating and price 
## 6. General wine search
## 7. Search for wines based on notes
## 8. Wine-food pairing
## 9. Comprehensive search

# 1. Get URLs for each wine

In [None]:
class WinePageHTML:
    """Collect ``[title, url]`` pairs from the Wine Enthusiast ratings search.

    Attributes:
        wine_list: ``[title, url]`` pairs gathered so far, in scrape order.
        parsed_pages: page numbers already scraped; used to skip repeats.
    """

    def __init__(self):
        self.wine_list = []
        self.parsed_pages = []

    async def get_html_from_page(self, url):
        """Load ``url`` in headless Chrome and return it as a BeautifulSoup tree."""
        op = webdriver.ChromeOptions()
        op.add_argument("headless")
        driver = webdriver.Chrome(options=op)
        try:
            driver.get(url)
            # Give the page a second to finish rendering. asyncio.sleep yields
            # to the event loop; time.sleep would freeze every other coroutine.
            await asyncio.sleep(1)
            return BeautifulSoup(driver.page_source, "html.parser")
        finally:
            # quit() ends the browser session even when loading failed, so no
            # Chrome/chromedriver process is left behind per scraped page.
            driver.quit()

    async def get_page_html(self, page_num=1):
        """Return the parsed search-results page number ``page_num``."""
        url = (
            "https://www.wineenthusiast.com/?s=&search_type=ratings&page="
            + str(page_num)
            + "&drink_type=wine"
        )
        return await self.get_html_from_page(url)

    async def split_html(self, soup):
        """Split the page markup on the CSS class that starts each wine entry."""
        text_break = "ratings-block__info"
        return str(soup).split(text_break)

    async def get_wine_list(self, html_array):
        """Extract ``[title, url]`` from every section after the first.

        The first element of ``html_array`` is the markup preceding the first
        wine entry and is skipped. Raises IndexError when a section holds no
        ``<a href="...">`` link.
        """
        wines = []
        for section in html_array[1:]:
            # Text following the first link opener: 'URL">TITLE</a>...'
            after_href = section.split('<a href="')[1]
            pieces = after_href.split('">')
            wine_url = pieces[0]
            wine_title = pieces[1].split("</a>")[0]
            wines.append([wine_title, wine_url])
        return wines

    async def get_list_by_page(self, page_num=1):
        """Scrape one results page and append its wines, unless already parsed."""
        if page_num in self.parsed_pages:
            print("Page", page_num, "already parsed.")
            return
        html = await self.get_page_html(page_num)
        split_html = await self.split_html(html)
        self.wine_list += await self.get_wine_list(split_html)
        self.parsed_pages.append(page_num)

    async def get_list_page_range(self, first_page=1, last_page=10):
        """Scrape pages ``first_page`` through ``last_page`` inclusive, in order."""
        for page in range(first_page, last_page + 1):
            await self.get_list_by_page(page)

    def drop_duplicates(self):
        """Keep only the first entry for each title, preserving order.

        Removes duplicates that appear in the HTML code of the website.
        """
        seen_titles = set()  # set membership keeps this linear in the list size
        unique_wines = []
        for wine in self.wine_list:
            title, _url = wine
            if title not in seen_titles:
                seen_titles.add(title)
                unique_wines.append(wine)
        self.wine_list = unique_wines

In [None]:
# Scrape search-result pages 1..total_pages; each page is loaded in its own
# headless Chrome session, one after another.
total_pages = 48
wine_parser = WinePageHTML()
await wine_parser.get_list_page_range(last_page=total_pages)

In [None]:
# Number of [title, url] entries collected, before de-duplication.
len(wine_parser.wine_list)

In [None]:
# Keep only the first entry per title (replaces wine_parser.wine_list).
wine_parser.drop_duplicates()

In [None]:
# Number of entries left after de-duplication.
len(wine_parser.wine_list)

In [None]:
# Keep a module-level handle on the list; `wine_parser` is rebound to a
# different object in section 2.
wine_list = wine_parser.wine_list

In [None]:
# Peek at the first [title, url] pair.
wine_list[0]

# 2. Get HTML for each wine

In [None]:
class WineParser:
    """Fetch one wine review page and pull its fields out of the raw HTML.

    Fields are located by plain string splitting on the page markup, so every
    ``get_*`` method raises IndexError when its marker is missing unless noted.

    Attributes:
        wine_html: HTML text of the most recently fetched page, or ``None``.
        wine_data: dict returned by the last ``parse_wine_html`` call, or ``None``.
        driver: headless Chrome driver opened in ``__init__``; within this
            class it is only touched by ``close_driver``.
    """

    def __init__(self):
        self.wine_html = None
        self.wine_data = None
        # pass  # can also pass a link so that the html is scraped in the class
        op = webdriver.ChromeOptions()
        op.add_argument("headless")
        self.driver = webdriver.Chrome(options=op)

    async def get_html(self, url):
        """Download ``url`` with aiohttp and store the page text in ``self.wine_html``."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.9999.999 Safari/537.36",
            "Referer": "https://www.wineenthusiast.com",
        }

        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url) as response:
                html = await response.read()
                # Round-trip through BeautifulSoup so the markup the split
                # markers are matched against is the parser's serialization.
                self.wine_html = str(BeautifulSoup(html, "html.parser"))

    def close_driver(self):
        """Shut down the Chrome session opened in ``__init__``."""
        # quit() ends the browser and driver process; close() only closes
        # the current window.
        self.driver.quit()

    @staticmethod
    def _value_for_key(wine_html, key):
        """Return the stripped text of the ``value`` div following the ``key`` div."""
        key_div = '<div class="key">' + key + "</div>"
        return (
            wine_html.split(key_div)[1]
            .split('<div class="value">')[1]
            .split("</div>")[0]
            .strip()
        )

    @staticmethod
    def _link_text_for_key(wine_html, key):
        """Return the text of the link inside the value that follows the ``key`` div."""
        key_div = '<div class="key">' + key + "</div>"
        # The link text sits after the second '">' following the key div.
        return wine_html.split(key_div)[1].split('">')[2].split("</a>")[0]

    def get_wine_title(self, wine_html):
        """Return the text of the ``review-title`` heading."""
        break_1 = '<h1 class="review-title">'
        break_2 = "</h1>"
        return wine_html.split(break_1)[1].split(break_2)[0].strip()

    def get_year_from_title(self, title):
        """Return the first space-separated token of ``title`` that parses as a float.

        Returns ``None`` when no token is numeric.
        """
        for token in title.split(" "):
            try:
                return float(token)
            except ValueError:
                continue
        return None

    def get_rating(self, wine_html):
        """Return the rating as a float, read from ``wine_html``."""
        # Uses the argument, like every other getter, instead of self.wine_html.
        return float(self._value_for_key(wine_html, "Rating"))

    def get_price(self, wine_html):
        """Return the price without its ``$`` as a float, or NaN when absent/unparsable."""
        try:
            return float(self._value_for_key(wine_html, "Price").replace("$", ""))
        except (IndexError, ValueError):
            return np.nan

    def get_winery(self, wine_html):
        """Return the winery name."""
        return self._link_text_for_key(wine_html, "Winery")

    def get_location(self, wine_html):
        """Return ``(region, country)``: the last two comma-separated location links.

        Raises ValueError when fewer than two locations are listed.
        """
        break_1 = "<!-- Get regions, state, appellation -->"
        break_2 = "<span>"
        break_3 = "</span>"
        str_list = (
            wine_html.split(break_1)[1].split(break_2)[1].split(break_3)[0].split(",")
        )
        loc_list = [loc.split('">')[1].split("</a>")[0] for loc in str_list]
        region, country = loc_list[-2:]
        return region, country

    def get_variety(self, wine_html):
        """Return the grape variety."""
        return self._link_text_for_key(wine_html, "Variety")

    def get_type(self, wine_html):
        """Return the wine type."""
        return self._link_text_for_key(wine_html, "Wine Type")

    def get_alcohol(self, wine_html):
        """Return the alcohol percentage as a float (``%`` stripped)."""
        return float(self._value_for_key(wine_html, "Alcohol").replace("%", ""))

    def get_bottle_size(self, wine_html):
        """Return the bottle size in millilitres; values given in ``L`` are converted."""
        size = self._value_for_key(wine_html, "Bottle Size")
        try:
            return float(size.replace(" ml", ""))
        except ValueError:
            # Not a plain "<n> ml" value: interpret it as litres.
            return 1000 * float(size.replace(" L", ""))

    def get_description(self, wine_html):
        """Return the review text that precedes the taster's name."""
        break_1 = '<div class="row review__row">'
        break_2 = '<div class="col-12">'
        break_3 = '<span class="taster">'
        return wine_html.split(break_1)[1].split(break_2)[1].split(break_3)[0].strip()

    async def parse_wine_html(self):
        """Parse ``self.wine_html`` into a field dict, store it in ``self.wine_data`` and return it."""
        html = self.wine_html
        results = {}
        results["title"] = self.get_wine_title(html)
        results["year"] = self.get_year_from_title(results["title"])
        results["rating"] = self.get_rating(html)
        results["price"] = self.get_price(html)
        results["winery"] = self.get_winery(html)
        results["region"], results["country"] = self.get_location(html)
        results["variety"] = self.get_variety(html)
        results["type"] = self.get_type(html)
        results["alcohol"] = self.get_alcohol(html)
        results["bottle_size"] = self.get_bottle_size(html)
        results["description"] = self.get_description(html)
        self.wine_data = results
        return results

In [None]:
# Column-oriented accumulator: one independent, initially empty list per
# scraped field, in the column order of the final table.
all_wines_dict = {
    field: []
    for field in (
        "title",
        "year",
        "rating",
        "price",
        "winery",
        "region",
        "country",
        "variety",
        "type",
        "alcohol",
        "bottle_size",
        "description",
    )
}

In [None]:
wine_parser = WineParser()

error_wines = []

i = 1

for wine in wine_list:
    url = wine[1]
    if i % 10 == 0:
        print("On wine", i)
    await wine_parser.get_html(url)
    try:
        wine_dict = await wine_parser.parse_wine_html()
        for key in all_wines_dict.keys():
            all_wines_dict[key].append(wine_dict[key])
    except:
        error_wines.append(wine)
    i += 1

try:
    await wine_parser.session.close()
except:
    pass

In [None]:
def csv_to_sql(csv_content, database, table_name):
    """Load a CSV file into a SQLite table, replacing the table if it exists.

    Args:
        csv_content: path or buffer accepted by ``pandas.read_csv``. Its first
            column is assumed to be a saved index and is dropped.
        database: path of the SQLite database file.
        table_name: name of the table to (re)create.
    """
    df = pd.read_csv(csv_content)
    df = df.iloc[:, 1:]  # to remove indexes in the first column
    db = sqlite3.connect(database)
    try:
        df.to_sql(table_name, db, if_exists="replace", index=False)
        db.commit()
    finally:
        # Always release the file handle, even when the write fails.
        db.close()

In [None]:
# Number of wines that were recorded as failures during scraping.
len(error_wines)

In [None]:
# One row per successfully parsed wine; columns follow all_wines_dict's keys.
wine_df = pd.DataFrame(all_wines_dict)

In [None]:
# The index is written as the first CSV column; csv_to_sql drops that column
# again when loading.
wine_df.to_csv("all_data.csv")  # to allow for manual edits of csv

In [None]:
# (Re)create the "wines" table in all_data.db from the (possibly hand-edited) CSV.
csv_to_sql("all_data.csv", "all_data.db", "wines")  # convert to SQL db after edits

# 3. Loading the data (start executing here after imports)

In [None]:
def load_db(db_file, table_name):
    """Return every row of ``table_name`` in the SQLite file ``db_file`` as a DataFrame.

    ``table_name`` is interpolated into the SQL text, so it must come from
    trusted code, never from user input.
    """
    connection = sqlite3.connect(db_file)
    try:
        query = f"SELECT * FROM {table_name}"
        return pd.read_sql_query(query, connection)
    finally:
        # Close the connection even when the query fails (e.g. missing table).
        connection.close()

In [None]:
# Entry point for the analysis: read the "wines" table written in section 2.
wine_data = load_db("all_data.db", "wines")

In [None]:
# Display the loaded table.
wine_data

# 4. Data exploration and visualization

## General Statistics

In [None]:
# Scatter of price against rating, one marker per wine.
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(wine_data["rating"], wine_data["price"], "o")
ax.set_xlabel("Rating")
ax.set_ylabel("Price ($)")
ax.set_title("Wine Price vs. Rating")
plt.show()

In [None]:
# Average price over all wines (pandas skips missing values by default).
wine_data["price"].mean()

In [None]:
# Average rating over all wines.
wine_data["rating"].mean()

In [None]:
df_no_nans = wine_data.dropna()
# For the correlation only price and rating must be present; dropping rows
# with a NaN in any other column (year, alcohol, ...) would throw away valid
# price/rating pairs.
price_rating = wine_data.dropna(subset=["price", "rating"])
print(
    "Price-rating correlation coefficient:",
    round(np.corrcoef(price_rating["price"], price_rating["rating"])[0][1], 2),
)

In [None]:
# Count wines per type, most common first, and show the counts as a bar chart.
wines_by_type = (
    wine_data.groupby("type")
    .size()
    .rename("No. wines")
    .reset_index()
    .sort_values("No. wines", ascending=False)
)
fig, ax = plt.subplots()
ax.bar(wines_by_type["type"], wines_by_type["No. wines"])
ax.set_xlabel("Wine Type")
ax.set_ylabel("No. Wines")
ax.set_title("Wines by Type")
plt.show()

In [None]:
# Plain-text table: one row per wine type with its count, mean price and
# mean rating. The type name is padded to 13 characters.
rows = ["Wines by Type\nType\tNo. wines\tAverage price\tAverage rating\n"]
for type_ in wine_data["type"].unique():
    wines = wine_data[wine_data["type"] == type_]
    name_padding = " " * (13 - len(type_))
    mean_price = str(round(wines["price"].mean(), 2))
    mean_rating = str(round(wines["rating"].mean(), 1))
    rows.append(
        "".join(
            [
                type_,
                name_padding,
                str(len(wines)),
                " " * 10,
                mean_price,
                " " * 15,
                mean_rating,
                "\n",
            ]
        )
    )
out_str = "".join(rows)
print(out_str)

## Wines by Country

In [None]:
# Per-country aggregates, keyed by country name in order of first appearance:
# wine count plus mean and standard deviation of price and rating.
countries = list(wine_data["country"].unique())
wines_per_country, price_by_country, price_by_country_std = {}, {}, {}
rating_by_country, rating_by_country_std = {}, {}

for country in countries:
    wines = wine_data[wine_data["country"] == country]
    country_prices = wines["price"]
    country_ratings = wines["rating"]
    wines_per_country[country] = len(wines)
    price_by_country[country] = country_prices.mean()
    price_by_country_std[country] = country_prices.std()
    rating_by_country[country] = country_ratings.mean()
    rating_by_country_std[country] = country_ratings.std()

In [None]:
# Re-order the counts from largest to smallest, then draw them as bars.
wines_per_country = {
    name: count
    for name, count in sorted(
        wines_per_country.items(), key=lambda item: item[1], reverse=True
    )
}
fig, ax = plt.subplots()
ax.bar(list(wines_per_country.keys()), list(wines_per_country.values()))
plt.xticks(rotation="vertical")
ax.set_xlabel("Country")
ax.set_ylabel("No. Wines")
ax.set_title("Wines per Country")
plt.show()

In [None]:
# Countries without any rating have a NaN mean; report them and plot them as 0
# so the sort below is well defined. An explicit NaN test replaces the
# int()/bare-except probe, which would also have hidden unrelated errors.
for country, mean_rating in rating_by_country.items():
    if pd.isna(mean_rating):
        print(country, "lacks rating data")
        rating_by_country[country] = 0

rating_by_country = dict(
    sorted(rating_by_country.items(), key=lambda item: item[1], reverse=True)
)

plt.bar(rating_by_country.keys(), rating_by_country.values())
plt.xticks(rotation="vertical")
plt.xlabel("Country")
plt.ylabel("Average Wine Rating")
plt.title("Wines Ratings by Country")
plt.show()

In [None]:
# Countries without any price have a NaN mean; report them and plot them as 0.
# Iterate over price_by_country itself (not rating_by_country) so this cell
# does not depend on the two dicts sharing the same keys.
for country, mean_price in price_by_country.items():
    if pd.isna(mean_price):
        print(country, "lacks price data")
        price_by_country[country] = 0

price_by_country = dict(
    sorted(price_by_country.items(), key=lambda item: item[1], reverse=True)
)

plt.bar(price_by_country.keys(), price_by_country.values())
plt.xticks(rotation="vertical")
plt.xlabel("Country")
plt.ylabel("Average Wine Price ($)")
plt.title("Wines Prices by Country")
plt.show()

# 5. Rank wines based on rating and price 

The idea here is to rank wines based on some combination of price and rating: The higher the rating, the higher the rank. And the lower the price, the higher the rank. One possible model for the rank or quality factor $q$ is 
$$
q = c \frac{r^a}{p^b},
$$
where $r$ and $p$ are the rating and price, and $a,\ b, \text{and}\ c$ are positive constants. This can be converted to a linear system by taking the logarithm:
$$
q' \equiv \log q = \log c + a \log r - b\log p.
$$
Now, linear regression for 
$$
q'(x, y) = c' + a x + b' y
$$
 with $c'=\log c$, $b'=-b$, $x=\log r$ and $y = \log p$ can be carried out, and the values for $a$, $b$, and $c$ can be substituted back into the model for $q$. 
 
To "train" the model, asserted values of $q$ used to rank the wines are as follows:
$$
q(r_\text{min}, p_\text{min}) = 30
$$
$$
q(r_\text{min}, p_\text{max}) = 10
$$
$$
q(r_\text{max}, p_\text{min}) = 100
$$
$$
q(r_\text{max}, p_\text{max}) = 30
$$
The effect is that in a list of wines sorted by $q$, the lowest rated wines with the highest price will be listed last, and the highest rated wines with low prices will be listed first, with other wines distributed in the middle (these values can be changed at the developer's discretion, provided they preserve the general ranking theme of prioritizing both low price and high ratings).


In [None]:
class QRank:
    """Score wines with q = c * rating**a * price**b' fitted to four anchor points.

    The anchors are the corners of the (rating, price) range of the data:
    ``qmid`` at (min rating, min price) and (max rating, max price), ``qmax``
    at (max rating, min price) and ``qmin`` at (min rating, max price). The
    exponents come from a linear regression in log space.

    Attributes:
        data: copy of the input frame with an added/replaced ``q`` column. The
            frame passed to the constructor is left untouched.
        q_rating: the four anchor rows (``rating``, ``price``, ``q``).
        regr: the fitted ``LinearRegression``.
    """

    def __init__(self, wine_data):
        self.data = wine_data
        self.qmin = 10
        self.qmid = 30
        self.qmax = 100
        self.set_q()
        self.fit_q()
        self.calc_q()

    def set_q(self):
        """Build the four anchor rows from the min/max rating and price in the data."""
        ratings, prices = self.data["rating"], self.data["price"]
        r_lo, r_hi = ratings.min(), ratings.max()
        p_lo, p_hi = prices.min(), prices.max()
        self.q_rating = pd.DataFrame(
            {
                "rating": [r_lo, r_hi, r_lo, r_hi],
                "price": [p_lo, p_lo, p_hi, p_hi],
                "q": [self.qmid, self.qmax, self.qmin, self.qmid],
            }
        )

    def fit_q(self):
        """Fit log q = c' + a*log(rating) + b'*log(price) to the anchor rows."""
        X = np.log(self.q_rating[["rating", "price"]])
        y = np.log(self.q_rating["q"])
        self.regr = linear_model.LinearRegression()
        self.regr.fit(X, y)

    def calc_q(self):
        """Evaluate the fitted model for every wine and store it as column ``q``."""
        a, b_prime = self.regr.coef_[0], self.regr.coef_[1]
        q = (
            np.exp(self.regr.intercept_)
            * self.data["rating"] ** a
            * self.data["price"] ** b_prime
        )
        # assign() returns a new frame, so the caller's DataFrame (often a
        # filtered slice) is not modified behind its back.
        self.data = self.data.assign(q=q)

    def plot_q(self):
        """Scatter price against rating, coloured by ``q`` (jet colormap)."""
        fig = plt.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        ax.set_title("q(Price, Rating)", fontsize=14)
        ax.set_xlabel("Rating", fontsize=12)
        ax.set_ylabel("Price", fontsize=12)
        ax.grid(True, linestyle="-", color="0.75")
        x = self.data["rating"]
        y = self.data["price"]
        q = self.data["q"]

        # scatter with colormap mapping to q value
        ax.scatter(x, y, s=20, c=q, marker="o", cmap=cm.jet)

        plt.show()

In [None]:
# Replace wine_data with the frame QRank exposes, which carries the "q" column.
wine_data = QRank(wine_data).data

Here, warmer colors signify higher rankings. These are the wines that are recommended first in a given search.

In [None]:
# Constructing QRank refits the model; the plot colours each wine by its q.
QRank(wine_data).plot_q()

# 6. General wine search

In [None]:
class UserSelect:
    """Interactively narrow a wine table down with console prompts.

    Attributes:
        data: the current (progressively filtered) DataFrame.
        print_options: when True, each prompt first lists the available values.
    """

    def __init__(self, wine_data):
        self.data = wine_data
        self.print_options = True

    def filter_and_sort(self):
        """Run the filters in a fixed order, then rank and sort the result.

        ``filter_varieties`` is available but not part of this sequence.
        """
        self.filter_types()
        self.filter_countries()
        self.filter_regions()
        self.filter_winery()
        self.filter_rating()
        self.filter_price()
        self.get_qs()
        self.sort_data()

    def _filter_by_text(self, column, heading, singular, plural):
        """Prompt for comma-separated values and keep rows whose ``column`` matches any.

        Matching ignores case and surrounding whitespace for every entry.
        Blank input keeps all rows. Each row is kept at most once.
        """
        if self.print_options:
            options = ", ".join(str(v) for v in self.data[column].dropna().unique())
            print(heading + ":", options)
        print("Enter " + singular + ":")
        print("To select multiple " + plural + ", separate entries by a comma.")
        print("Leave blank to select all " + plural + ".\n")
        wanted = {entry.strip().lower() for entry in input().split(",")}
        wanted.discard("")
        if wanted:
            self.data = self.data[self.data[column].str.lower().isin(wanted)]

    def filter_types(self):
        """Filter by wine type."""
        self._filter_by_text("type", "Wine types", "wine type", "types")

    def filter_varieties(self):
        """Filter by grape variety."""
        self._filter_by_text("variety", "Wine varieties", "wine variety", "varieties")

    def filter_countries(self):
        """Filter by country."""
        self._filter_by_text("country", "Countries", "country", "countries")

    def filter_regions(self):
        """Filter by region."""
        self._filter_by_text("region", "Regions", "region", "regions")

    def filter_winery(self):
        """Filter by winery."""
        self._filter_by_text("winery", "Wineries", "winery", "wineries")

    def filter_rating(self):
        """Keep wines rated at least the entered minimum; blank keeps all."""
        if self.print_options:
            rmin, rmax = self.data["rating"].min(), self.data["rating"].max()
            print("Ratings range from", str(rmin), "to", str(rmax))
        print("Enter minimum rating:\nLeave blank to select all ratings\n")
        rating = input().strip()
        if rating != "":
            self.data = self.data[self.data["rating"] >= float(rating)]

    def filter_price(self):
        """Keep wines priced at most the entered maximum; blank keeps all.

        Wines without a price are dropped once a maximum is given.
        """
        if self.print_options:
            pmin, pmax = self.data["price"].min(), self.data["price"].max()
            print("Price range from", "$" + str(pmin), "to", "$" + str(pmax))
        print("Enter maximum price:\nLeave blank to select all prices\n")
        max_price = input().strip()
        if max_price != "":
            # "<=": the prompt asks for a maximum.
            self.data = self.data[self.data["price"] <= float(max_price)]

    def get_qs(self):
        """Add the ``q`` ranking column unless it is already present."""
        if "q" in self.data.columns:
            return
        self.data = QRank(self.data).data

    def sort_data(self):
        """Order the wines from highest to lowest ``q``."""
        self.data = self.data.sort_values("q", ascending=False)

    @staticmethod
    def _text(value):
        """Return ``value`` as text, or an empty string when it is missing."""
        return "" if pd.isna(value) else str(value)

    def _format_wine(self, wine):
        """Return the multi-line text block for one wine row."""
        lines = [self._text(wine["title"])]
        # The price line is omitted entirely when no price is known.
        if not pd.isna(wine["price"]):
            lines.append("Price: $" + str(wine["price"]))
        lines.append("Rating: " + self._text(wine["rating"]))
        lines.append("Type: " + self._text(wine["type"]))
        lines.append("Year: " + self._text(wine["year"]))
        lines.append("Country: " + self._text(wine["country"]))
        lines.append("Region: " + self._text(wine["region"]))
        lines.append("Variety: " + self._text(wine["variety"]))
        lines.append("Bottle size (mL): " + self._text(wine["bottle_size"]))
        lines.append("Alc. (%): " + self._text(wine["alcohol"]))
        lines.append(self._text(wine["description"]))
        return "\n".join(lines) + "\n\n"

    def print_data(self):
        """Print every wine in ``self.data``, blanking missing fields.

        Missing values are handled per field, so text that merely contains
        the letters "nan" (e.g. "banana") is printed unchanged.
        """
        for i in range(len(self.data)):
            print(self._format_wine(self.data.iloc[i]))

# 7. Search for wines based on notes

In [None]:
class NotesSearch:
    """Extract frequent tasting-note terms from descriptions and search wines by them.

    Attributes:
        data: the wine DataFrame being searched (needs a ``description`` column).
        descriptions: all descriptions joined by newlines, lower-cased.
        tokens: ``descriptions`` split on punctuation/whitespace.
        filtered_collection: ``tokens`` without stop words and numbers.
        unigrams / bigrams: term -> TF-IDF score, highest score first.
        selected_notes: notes entered in the last ``search_notes`` call.
        selected_wines: rows matched by the last ``search_notes`` call.
    """

    def __init__(self, wine_data):
        self.data = wine_data
        self.descriptions = None
        self.tokens = None
        self.filtered_collection = None
        self.unigrams = None
        self.bigrams = None
        self.selected_notes = None
        self.selected_wines = None

    def compute(self):
        """Run the full term-extraction pipeline."""
        self.get_terms()
        self.tokenize()
        self.remove_stop_words()
        self.get_unigrams()
        self.get_bigrams()

    def get_terms(self):
        """Join all non-missing descriptions into one lower-cased string."""
        descriptions = self.data["description"].dropna().astype(str).tolist()
        self.descriptions = "\n".join(descriptions).lower()

    def tokenize(self):
        """Split the joined descriptions on the delimiters below into tokens.

        Empty strings and the lone ``s`` left over from possessives are dropped.
        """
        delimiters = [
            "\n", " ", "\t", ":", ";", ",", ".", "!", "?",
            "(", ")", "{", "}", "[", "]", '"', "'", "/", "|",
            "`", "~", "—", ">", "<", "#", "@", "$", "%", "^",
            "&", "*", "+", "_", "=", "≈",
        ]

        # re.escape treats special chars as actual chars and not regex patterns
        pattern = "|".join(re.escape(delimiter) for delimiter in delimiters)

        self.tokens = [
            token
            for token in re.split(pattern, self.descriptions)
            if token and token != "s"
        ]

    @staticmethod
    def _is_number(term):
        """Return True when ``term`` parses as a float."""
        try:
            float(term)
        except ValueError:
            return False
        return True

    def remove_stop_words(self):
        """Drop NLTK and hand-picked stop words, and numeric tokens."""
        nltk_stop_words = set(stopwords.words("english"))
        manual_stop_words = "time nicely almost one big offering combination show rounded years aged along make attractive attractively background balance balanced balancing brief say backs lift nacional aftertaste bring character comes drinkable final gives later similar touches addition adds the accessible cabernet sauvignon pinot noir but will dissident flavors undertones sips pair flavours notes aromas few more held new months brings together from now drink wine would go with the of and in to is a by that this for are was from at be as about with have it on this has been nm o s m an can because or its also were which these into out being all other most over"
        stop_words = nltk_stop_words.union(manual_stop_words.split(" "))
        self.filtered_collection = [
            term
            for term in self.tokens
            if term not in stop_words and not self._is_number(term)
        ]

    def _ranked_tfidf(self, ngram_range):
        """Return term -> TF-IDF score for the filtered tokens, highest score first.

        All tokens are vectorized as a single document.
        """
        tfidf = TfidfVectorizer(ngram_range=ngram_range)
        transformed = tfidf.fit_transform([" ".join(self.filtered_collection)])
        feature_names = tfidf.get_feature_names_out()

        idf = pd.DataFrame(
            transformed.T.todense(), index=feature_names, columns=["TF-IDF"]
        )
        scores = idf.to_dict()["TF-IDF"]
        return dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

    def get_unigrams(self):
        """Rank single terms and remove the unigram-specific stop words."""
        unigram_dict = self._ranked_tfidf((1, 1))

        unigram_stop_words = "elements accents backed extra future concentration core expression layered make sip comprised ready unite wet unite initially now round showcasing accompanied bodied additional added affords back benefit box color confidently could day first floor followed hesitation hint slightly tends drinking find made needs open peel potential structured bottling driven aroma delicious body cut easy enjoy bodies textured age good note hints solid mouthfeel shows concentrated great joined giving integrated crushed alongside bit alcohol best perfumed glass black palate red full texture finish nose well dark fine blend offers ready green structure aging give like still generous touch".split(
            " "
        )
        for word in unigram_stop_words:
            unigram_dict.pop(word, None)

        self.unigrams = unigram_dict

    def get_bigrams(self):
        """Rank two-word terms."""
        self.bigrams = self._ranked_tfidf((2, 2))

    @staticmethod
    def _rows_of(terms, count, rows=4):
        """Lay the first ``count`` terms out over ``rows`` comma-separated lines."""
        shown = list(terms)[:count]
        per_row = -(-len(shown) // rows)  # ceiling division: nothing is cut off
        return "".join(
            ", ".join(shown[r * per_row : (r + 1) * per_row]) + "\n"
            for r in range(rows)
        )

    def display_grams(self, unigram_num=30, bigram_num=15):
        """Print the top unigrams, then the top bigrams, on four lines each.

        Exactly ``unigram_num`` / ``bigram_num`` terms are shown (or all of
        them when fewer exist), including when the count is not a multiple of 4.
        """
        out_str = self._rows_of(self.unigrams.keys(), unigram_num) + self._rows_of(
            self.bigrams.keys(), bigram_num
        )
        print(out_str)

    def _matching_rows(self):
        """Return positions of wines whose description contains every word of any note.

        Each wine is listed once even when several notes match it. Matching is
        a case-insensitive substring test; rows without a description are skipped.
        """
        descriptions = self.data["description"]
        matches = []
        for i in range(len(self.data)):
            text = descriptions.iloc[i]
            if not isinstance(text, str):
                continue
            text = text.lower()
            for notes in self.selected_notes:
                if all(word.lower().strip() in text for word in notes.split(" ")):
                    matches.append(i)
                    break
        return matches

    def search_notes(self):
        """Prompt for notes until at least one wine matches, then print the ranked matches."""
        self.compute()
        while True:
            print("Select notes of wines available:")
            self.display_grams()
            selected_notes = input(
                "Enter the notes you would like to select separated by comma."
            )
            self.selected_notes = [note.strip() for note in selected_notes.split(",")]
            selected_wine_indices = self._matching_rows()
            if selected_wine_indices:
                break
            # Ask again instead of recursing, so the failed attempt is not
            # processed a second time once the retry has finished.
            out_str = (
                "Notes of "
                + ", ".join(self.selected_notes)
                + " produced no results.\nPlease try other selections.\n"
            )
            print(out_str)
        self.selected_wines = self.data.iloc[selected_wine_indices]
        self.get_qs()
        self.sort_data()
        out_str = "\nWines with "
        if len(self.selected_notes) > 2:
            out_str += "{}, or {}".format(
                ", ".join(self.selected_notes[:-1]), self.selected_notes[-1]
            )
        else:
            out_str += " or ".join(self.selected_notes)
        out_str += " characteristics include:\n"
        print(out_str)
        self.print_data()

    def get_qs(self):
        """Add the ``q`` ranking column to the selection unless already present."""
        if "q" in self.selected_wines.columns:
            return
        self.selected_wines = QRank(self.selected_wines).data

    def sort_data(self):
        """Order the selected wines from highest to lowest ``q``."""
        self.selected_wines = self.selected_wines.sort_values("q", ascending=False)

    @staticmethod
    def _text(value):
        """Return ``value`` as text, or an empty string when it is missing."""
        return "" if pd.isna(value) else str(value)

    def _format_wine(self, wine):
        """Return the multi-line text block for one wine row."""
        lines = [self._text(wine["title"])]
        # The price line is omitted entirely when no price is known.
        if not pd.isna(wine["price"]):
            lines.append("Price: $" + str(wine["price"]))
        lines.append("Rating: " + self._text(wine["rating"]))
        lines.append("Type: " + self._text(wine["type"]))
        lines.append("Year: " + self._text(wine["year"]))
        lines.append("Country: " + self._text(wine["country"]))
        lines.append("Region: " + self._text(wine["region"]))
        lines.append("Variety: " + self._text(wine["variety"]))
        lines.append("Bottle size (mL): " + self._text(wine["bottle_size"]))
        lines.append("Alc. (%): " + self._text(wine["alcohol"]))
        lines.append(self._text(wine["description"]))
        return "\n".join(lines) + "\n\n"

    def print_data(self):
        """Print every selected wine, blanking missing fields.

        Missing values are handled per field, so text that merely contains
        the letters "nan" (e.g. "banana") is printed unchanged.
        """
        for i in range(len(self.selected_wines)):
            print(self._format_wine(self.selected_wines.iloc[i]))

In [None]:
def visualize(frequencies, n, x_label="Frequency", y_label="Term", title=""):
    """Draw a seaborn bar plot of the first ``n`` entries of ``frequencies``.

    Parameters
    ----------
    frequencies : dict
        Mapping whose keys become the ``y_label`` column and whose values
        become the ``x_label`` column. Entries are used in the mapping's own
        order; no sorting happens here.
    n : int
        Number of leading entries to plot.
    x_label, y_label : str
        Column names, which also serve as the axis labels.
    title : str
        Title of the figure.
    """
    freq_table = pd.DataFrame.from_dict(frequencies, orient="index").reset_index()
    freq_table.columns = [y_label, x_label]
    leading_rows = freq_table.iloc[:n]

    plt.figure()
    sns.barplot(data=leading_rows, x=x_label, y=y_label)
    plt.title(title)
    plt.show()

## General results

In [None]:
# N-gram overview for the whole data set.
# NOTE(review): compute() is expected to populate .unigrams/.bigrams (NotesSearch is
# defined earlier in the notebook) — confirm there.
notes_search = NotesSearch(wine_data)
notes_search.compute()
# visualize() plots the first 25 entries of each mapping, in the mapping's own order.
visualize(notes_search.unigrams, 25, title="Top 25 Wine Unigrams")
visualize(notes_search.bigrams, 25, title="Top 25 Wine Bigrams")

## Specific results

In [None]:
# Same n-gram charts, restricted to the rows whose "type" column equals "Red".
red_notes = NotesSearch(wine_data[wine_data["type"] == "Red"])
red_notes.compute()
# visualize() plots the first 25 entries of each mapping, in the mapping's own order.
visualize(red_notes.unigrams, 25, title="Red Wine Unigrams")
visualize(red_notes.bigrams, 25, title="Red Wine Bigrams")

# 8. Wine-food pairing

In [None]:
# Food category -> wine varieties and wine types considered a good pairing.
# FoodPairing compares these strings with the "variety" and "type" columns using
# exact matching (Series.isin), so every spelling here must match the data exactly.
# Corrected relative to the previous revision: "Viogner" -> "Viognier",
# "Carigan" -> "Carignan", "Rhône-style Red Blends" -> "Rhône-style Red Blend"
# (the spelling used by the other categories), and a duplicated "Riesling" removed.
food_categories = {
    "Appetizers": {
        "variety": [
            "Pinot Noir", "Gamay", "Sauvignon Blanc", "Viognier", "Riesling",
            "Albariño", "Chardonnay", "Grüner Veltliner", "Vinho Verde", "Rosé",
            "Portuguese White", "Chenin Blanc", "Gewürztraminer", "Barbera",
            "Cinsault", "Sparkling",
        ],
        "type": ["Red", "White", "Sparkling", "Rose"],
    },
    "Rich Fish": {
        "variety": [
            "Viognier", "Chardonnay", "Sauvignon Blanc", "White Burgundy",
            "Riesling", "Albariño", "Pinot Blanc", "Sémillon", "Pinot Noir",
            "Sparkling",
        ],
        "type": ["White", "Sparkling"],
    },
    "Lean Fish": {
        "variety": [
            "Albariño", "Sauvignon Blanc", "Riesling", "Pinot Grigio",
            "Chenin Blanc", "Gewürztraminer", "White Blend", "Vermentino",
            "Vinho Verde", "Grüner Veltliner", "Rosé",
        ],
        "type": ["White", "Rose"],
    },
    "Shellfish (inc. oysters)": {
        "variety": [
            "Sauvignon Blanc", "Spanish White", "Portuguese White", "Albariño",
            "Chenin Blanc", "Pinot Grigio", "Vermentino", "Italian White",
            "Sparkling",
        ],
        "type": ["White", "Sparkling"],
    },
    "Poultry": {
        "variety": [
            "Chardonnay", "Gamay", "Rhône-style Red Blend", "Rosé", "Pinot Noir",
            "Grenache", "Portuguese Red", "Sangiovese", "Red Blends",
        ],
        "type": ["Red", "White"],
    },
    "Red Meat": {
        "variety": [
            "Cabernet Sauvignon", "Syrah", "Petit Verdot", "Sangiovese",
            "Rhône-style Red Blend", "Bordeaux-style Red Blend", "Merlot",
            "Malbec", "Tempranillo", "Nebbiolo", "Red Blends", "Mourvèdre",
            "Petite Sirah", "Carmenère", "Carignan",
        ],
        "type": ["Red"],
    },
    "Game Meat": {
        "variety": [
            "Malbec", "Zinfandel", "Merlot", "Grenache", "Cabernet Franc",
            "Sangiovese", "Cabernet Sauvignon", "Syrah", "Tempranillo",
            "Rhône-style Red Blend", "Petit Verdot",
        ],
        "type": ["Red"],
    },
    "Pasta - Red Sauce": {
        "variety": [
            "Nebbiolo", "Sangiovese", "Merlot", "Italian Red", "Barbera",
            "Tempranillo",
        ],
        "type": ["Red"],
    },
    "Pasta - Other": {
        "variety": [
            "Chardonnay", "Sauvignon Blanc", "Grenache", "Barbera", "Pinot Noir",
            "Pinot Blanc",
        ],
        "type": ["Red", "White"],
    },
    "Pizza": {
        "variety": [
            "Sangiovese", "Grenache", "Zinfandel", "Syrah", "Barbera",
            "Sauvignon Blanc", "Albariño", "Viognier", "Portuguese White",
        ],
        "type": ["Red", "White", "Rose"],
    },
    "Vegetarian/Vegan": {
        "variety": [
            "Sauvignon Blanc", "Pinot Noir", "Riesling", "Chardonnay", "Grenache",
            "Rosé", "Spanish White", "Rhône-style White Blend",
        ],
        "type": ["Red", "White", "Rose"],
    },
    "Spicy Foods": {
        "variety": [
            "Gewürztraminer", "Riesling", "Sparkling", "Sémillon", "Zinfandel",
            "Grenache", "Rosé",
        ],
        "type": ["White", "Red", "Dessert", "Rose"],
    },
    "Hard Cheeses": {
        "variety": [
            "Cabernet Sauvignon", "Cabernet Franc", "Merlot", "Barbera", "Malbec",
        ],
        "type": ["Red"],
    },
    "Soft Cheeses": {
        "variety": [
            "Chardonnay", "Sauvignon Blanc", "Barbera", "Pinot Noir", "Viognier",
            "Cinsault", "Carmenère",
        ],
        "type": ["Red", "White"],
    },
    "Desserts - Sweet": {
        "variety": [
            "Riesling", "Madeira", "Chenin Blanc", "Port Blend", "Muscat",
            "Sémillon",
        ],
        "type": ["White", "Dessert", "Port/Sherry", "Fortified", "Sparkling"],
    },
    "Desserts - Fruity": {
        "variety": [
            "Chenin Blanc", "Madeira", "Riesling", "Port Blend", "Muscat", "Rosé",
            "Gewürztraminer",
        ],
        "type": ["White", "Dessert", "Rose", "Port/Sherry", "Fortified"],
    },
    "Salads": {
        "variety": [
            "Sauvignon Blanc", "Pinot Grigio", "Chardonnay", "Grüner Veltliner",
            "Vinho Verde", "Spanish White", "Rhône-style White Blend",
        ],
        "type": ["White"],
    },
}

In [None]:
class FoodPairing:
    """Interactive wine recommendations for one or more food categories.

    Attributes
    ----------
    data : DataFrame of wines; the "type" and "variety" columns are used for
        filtering, and the remaining columns are used when printing.
    food_categories : mapping of category name to a dict with "variety" and
        "type" lists (taken from the module-level ``food_categories``).
    categories, types, varieties : the last selection and the wine types and
        varieties it implies; each list is free of duplicates.
    paired_wines : the filtered wines sorted by ``q`` (``None`` until
        ``user_select`` has run).
    """

    def __init__(self, wine_data):
        self.data = wine_data
        self.food_categories = food_categories
        # Initialised here so print_types_varieties() cannot hit a missing attribute.
        self.categories = []
        self.types = []
        self.varieties = []
        self.paired_wines = None

    def user_select(self, print_all_wines=False):
        """Prompt for food categories and compute the matching wines.

        The category list is printed and a comma-separated answer is read
        with ``input``. Each entry is matched case-insensitively as a
        substring of a category name (first match wins). If any entry is
        empty or matches nothing, a message is printed and the prompt is
        repeated until the whole answer is valid.

        Each call starts from a clean selection: ``categories``, ``types``
        and ``varieties`` are rebuilt (without duplicates) rather than
        appended to, so re-running the prompt does not mix selections.

        Parameters
        ----------
        print_all_wines : bool
            When true, every paired wine is printed after the summary.
        """
        prompt = "Enter food categories for wine pairing separated by comma:"
        while True:
            print("Food categories:\n" + "\n".join(self.food_categories))
            entries = [entry.strip() for entry in input(prompt).split(",")]
            matches = [self._match_category(entry) for entry in entries]
            unknown = [entry for entry, key in zip(entries, matches) if key is None]
            if not unknown:
                break
            for entry in unknown:
                print(
                    '"'
                    + entry
                    + '" not one of the available categories.\nPlease select again.\n'
                )

        # dict.fromkeys drops duplicates while keeping first-seen order.
        self.categories = list(dict.fromkeys(matches))
        self.types = list(
            dict.fromkeys(
                wine_type
                for key in self.categories
                for wine_type in self.food_categories[key]["type"]
            )
        )
        self.varieties = list(
            dict.fromkeys(
                variety
                for key in self.categories
                for variety in self.food_categories[key]["variety"]
            )
        )

        self.paired_wines = self.data[
            (self.data["type"].isin(self.types))
            & (self.data["variety"].isin(self.varieties))
        ]
        self.get_qs()
        self.sort_data()
        self.print_types_varieties(print_all_wines=print_all_wines)

    def _match_category(self, entry):
        """Return the first category whose name contains ``entry``, else ``None``.

        Matching ignores case and surrounding whitespace. An empty entry
        matches nothing (a bare substring test would match every category).
        """
        needle = entry.strip().lower()
        if not needle:
            return None
        for key in self.food_categories:
            if needle in key.lower():
                return key
        return None

    @staticmethod
    def _join_list(items, conjunction="and"):
        """Join items as "a, b, <conjunction> c"; one or two items use " and "."""
        if len(items) > 2:
            return "{}, {} {}".format(", ".join(items[:-1]), conjunction, items[-1])
        return " and ".join(items)

    def print_types_varieties(self, print_all_wines=False):
        """Print the selected categories with the wine types and varieties they imply.

        Parameters
        ----------
        print_all_wines : bool
            When true, the summary ends with "(see below)" and every paired
            wine is printed afterwards via ``print_wines``.
        """
        out_str = "\nWines that pair well with "
        out_str += self._join_list(self.categories, "or")
        # Added exactly once; the longer-list branch used to add it twice.
        out_str += " include "
        out_str += self._join_list(self.types)
        out_str += " wines of the following varieties:\n"
        out_str += self._join_list(self.varieties)
        if print_all_wines:
            out_str += " (see below)\n"
            print(out_str)
            self.print_wines()
        else:
            out_str += "\n"
            print(out_str)

    def get_qs(self):
        """Ensure ``self.paired_wines`` has a ``q`` column.

        When the column is missing, the frame is passed through ``QRank``
        (defined elsewhere in this notebook) and replaced by its ``data``.
        """
        if "q" in self.paired_wines.columns:
            return
        self.paired_wines = QRank(self.paired_wines).data

    def sort_data(self):
        """Sort ``self.paired_wines`` by ``q``, largest first."""
        self.paired_wines = self.paired_wines.sort_values("q", ascending=False)

    @staticmethod
    def _format_wine(wine):
        """Build the printable text block for one wine row.

        Missing values (``None``/NaN) become an empty value and a missing
        price drops the price line. Detection is per field, so text that
        merely contains "nan" (for example "banana") is left intact and
        missing text fields do not break string concatenation.
        """

        def as_text(value):
            # The literal string "nan" is blanked as well, because the
            # previous implementation removed it from the output.
            if isinstance(value, str):
                return "" if value == "nan" else value
            return "" if pd.isna(value) else str(value)

        lines = [as_text(wine["title"])]
        price = as_text(wine["price"])
        if price:
            lines.append("Price: $" + price)
        for label, column in (
            ("Rating: ", "rating"),
            ("Type: ", "type"),
            ("Year: ", "year"),
            ("Country: ", "country"),
            ("Region: ", "region"),
            ("Variety: ", "variety"),
            ("Bottle size (mL): ", "bottle_size"),
            ("Alc. (%): ", "alcohol"),
        ):
            lines.append(label + as_text(wine[column]))
        lines.append(as_text(wine["description"]))
        return "\n".join(lines) + "\n\n"

    def print_wines(self):
        """Print every wine in ``self.paired_wines`` in its current order."""
        for i in range(len(self.paired_wines)):
            print(self._format_wine(self.paired_wines.iloc[i]))

In [None]:
# Create the pairing helper over wine_data; the constructor only stores state,
# nothing is prompted until user_select() is called.
food_paired_wines = FoodPairing(wine_data)

In [None]:
# Interactive cell: prints the available food categories and waits for a
# comma-separated selection typed at the prompt.
food_paired_wines.user_select()

In [None]:
# Interactive cell: the charts in the next section are titled for appetizers, so
# "Appetizers" must be entered at the prompt for them to be meaningful.
appetizer_wines = FoodPairing(wine_data)
appetizer_wines.user_select() # select Appetizers
# Feed only the paired wines into the notes analysis.
# NOTE(review): compute() is expected to populate .unigrams/.bigrams — confirm in NotesSearch.
appetizer_wines_notes = NotesSearch(appetizer_wines.paired_wines)
appetizer_wines_notes.compute()

## Notes for specific food category

In [None]:
# Plot the first 25 unigram and bigram entries for the wines selected in the previous cell.
visualize(appetizer_wines_notes.unigrams, 25, title="Appetizer Wine Unigrams")
visualize(appetizer_wines_notes.bigrams, 25, title="Appetizer Wine Bigrams")

# 9. Comprehensive search

In [None]:
class SearchWines:
    """Chain the notebook's three interactive filters into a single search.

    Constructing an instance runs all three stages immediately, each narrowing
    the output of the previous one:

    1. ``FoodPairing.user_select`` -> ``food_filtered_wines``
    2. ``UserSelect.filter_and_sort`` -> ``food_gen_filtered_wines``
    3. ``NotesSearch.search_notes`` -> ``final_filtered_wines``, from which
       rows containing any missing value are dropped.

    ``UserSelect`` and ``NotesSearch`` are defined earlier in the notebook.
    """

    def __init__(self, wine_data):
        self.data = wine_data

        # Stage 1: food pairing.
        pairing_stage = FoodPairing(self.data)
        pairing_stage.user_select()
        self.food_filtered_wines = pairing_stage.paired_wines

        # Stage 2: general filter and sort.
        general_stage = UserSelect(self.food_filtered_wines)
        general_stage.filter_and_sort()
        self.food_gen_filtered_wines = general_stage.data

        # Stage 3: tasting-note search.
        notes_stage = NotesSearch(self.food_gen_filtered_wines)
        notes_stage.search_notes()
        self.final_filtered_wines = notes_stage.selected_wines.dropna()

In [None]:
# Interactive cell: constructing SearchWines runs the whole pipeline, starting with
# the food-category prompt, then the general filter, then the notes search.
select_wines = SearchWines(wine_data)

In [None]:
# Display the wines that survived all three filters (rows with missing values removed).
select_wines.final_filtered_wines