In [1]:
!pip install kagglehub



In [37]:
import kagglehub
import os
import pandas as pd


path = kagglehub.dataset_download("zynicide/wine-reviews")
files = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file))]
print(files)

data = pd.read_csv(f"{path}/winemag-data-130k-v2.csv")

data.fillna("", inplace=True)
print("Number of rows:", data.shape[0])

data["processed_text"] = (
    "wine review: " + data["country"] + " : " + data["province"] + " : " + data["variety"] + " : " + data["description"]
)

print(data["processed_text"].head())

['winemag-data-130k-v2.csv', 'winemag-data-130k-v2.json', 'winemag-data_first150k.csv']


  data.fillna("", inplace=True)


Number of rows: 129971
0    wine review: Italy : Sicily & Sardinia : White...
1    wine review: Portugal : Douro : Portuguese Red...
2    wine review: US : Oregon : Pinot Gris : Tart a...
3    wine review: US : Michigan : Riesling : Pineap...
4    wine review: US : Oregon : Pinot Noir : Much l...
Name: processed_text, dtype: object


In [39]:
%%time

from collections import Counter

def generate_ngrams(text, n):
    words = text.split()
    ngrams = zip(*[words[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

n = 3  #Trigrams
all_ngrams = []
for text in data["processed_text"]:
    all_ngrams.extend(generate_ngrams(text, n))

# Count the frequency of each n-gram
ngram_counts = Counter(all_ngrams)

# Display the most common n-grams
print(f"Top 10 most common {n}-grams:")
print(ngram_counts.most_common(10))

Top 10 most common 3-grams:
[('wine review: US', 54504), ('review: US :', 54504), ('US : California', 36247), (': California :', 36247), ('wine review: France', 22093), ('review: France :', 22093), ('wine review: Italy', 19540), ('review: Italy :', 19540), ('Red Blend :', 17394), (': Pinot Noir', 13294)]
CPU times: user 5.37 s, sys: 143 ms, total: 5.51 s
Wall time: 5.49 s


In [46]:
import random

def predict_next_word(context, ngram_counts):
    context = context.strip().split()
    if len(context) < n - 1:
        raise ValueError(f"Context must have at least {n - 1} words for an {n}-gram model.")

    # Get the last n-1 words of the context
    context_key = " ".join(context[-(n - 1):])

    # Find all n-grams that start with the context
    candidates = {ngram: count for ngram, count in ngram_counts.items() if ngram.startswith(context_key)}

    if not candidates:
        return None  # No prediction possible

    total_count = sum(candidates.values())
    probabilities = {ngram: count / total_count for ngram, count in candidates.items()}

    # Adding repoducibility
    random.seed(13)
    next_word = random.choices(
        population=[ngram.split()[-1] for ngram in candidates.keys()],
        weights=probabilities.values()
    )[0]

    return next_word

In [41]:
def generate_text(context, ngram_counts, n, num_words):
    generated_text = context
    for _ in range(num_words):
        next_word = predict_next_word(generated_text, ngram_counts)
        if not next_word:
            break  # Stop if no prediction is possible
        generated_text += " " + next_word
    return generated_text

In [47]:
%%time

context = "wine review: Italy : Tuscany"
print(context)
# Number of words to predict
num_words = 50
print(generate_text(context, ngram_counts, n, num_words))


wine review: Italy : Tuscany
wine review: Italy : Tuscany : Sangiovese : Aromas of cherry, wild strawberry, truffle, white pepper and spice flavors. The finish is clean and the wine is structured and elegant, this opens with aromas of wild berry, cassis, herbal plum and raspberry flavors. It is ripe and full of fruit concentration and depth of flavor.
CPU times: user 23.1 s, sys: 54.7 ms, total: 23.1 s
Wall time: 23.7 s


In [31]:
%%time

context = "wine review: Italy : Tuscany : Sangiovese : Aromas of cherry, wild strawberry, truffle, white pepper and spice flavors. The finish is clean and the wine is structured and elegant, this opens with aromas of wild berry, cassis, herbal plum and raspberry flavors."
print(context)
# Number of words to predict
num_words = 15
print(generate_text(context, ngram_counts, n, num_words))


wine review: Italy : Tuscany : Sangiovese : Aromas of cherry, wild strawberry, truffle, white pepper and spice flavors. The finish is clean and the wine is structured and elegant, this opens with aromas of wild berry, cassis, herbal plum and raspberry flavors.
wine review: Italy : Tuscany : Sangiovese : Aromas of cherry, wild strawberry, truffle, white pepper and spice flavors. The finish is clean and the wine is structured and elegant, this opens with aromas of wild berry, cassis, herbal plum and raspberry flavors. There's even room for a five Bordeaux-style
CPU times: user 2.8 s, sys: 13.7 ms, total: 2.82 s
Wall time: 2.81 s
