In [24]:
import re
from collections import defaultdict

def tokenize_text(text):
    """
    Tokenizes the text and creates a frequency dictionary of tokens.
    - Splits by spaces, hyphens, and special characters.
    - Treats digits as separate tokens.
    - Removes unwanted characters.
    """
    # Create a dictionary to store token frequencies
    token_dict = defaultdict(int)

    # Define a regular expression that matches words, digits, spaces, periods, and colons
    # Everything else is considered a special character and will be skipped
    regex = r"[a-zA-Z]+|\d+|[ .:]+"

    # Find all matches based on the regex
    tokens = re.findall(regex, text)

    # Loop through the tokens
    for token in tokens:
        token.lower()
        if token == " " or token in {".", ":"}:  # Keep spaces, periods, and colons
            token_dict[token] += 1
        else:
            # If it's a digit, treat each character as a separate token
            if token.isdigit():
                for digit in token:
                    token_dict[digit] += 1
            else:
                # Otherwise, it's a word, so add it to the dictionary
                token_dict[token] += 1

    return token_dict





In [8]:
# Example usage
text = "Hello world, this is, 9 72 and- some-more text: 123."
token_dict = tokenize_text(text)
token_dict

defaultdict(int,
            {'Hello': 1,
             ' ': 8,
             'world': 1,
             'this': 1,
             'is': 1,
             '9': 1,
             '7': 1,
             '2': 2,
             'and': 1,
             'some': 1,
             'more': 1,
             'text': 1,
             ': ': 1,
             '1': 1,
             '3': 1,
             '.': 1})

In [9]:
import kagglehub
import os
import pandas as pd


# Download latest version
path = kagglehub.dataset_download("zynicide/wine-reviews")

print("Path to dataset files:", path)

files = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file))]
print(files)
data = pd.read_csv(f"{path}/winemag-data-130k-v2.csv")

# all categories
#print(data.columns.tolist())

print("Number of rows:", data.shape[0])
print("Missing values:\n", data.isnull().sum())

Path to dataset files: /root/.cache/kagglehub/datasets/zynicide/wine-reviews/versions/4
['winemag-data-130k-v2.json', 'winemag-data_first150k.csv', 'winemag-data-130k-v2.csv']
Number of rows: 129971
Missing values:
 Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64


In [10]:
data.fillna("", inplace=True)
print("Number of rows:", data.shape[0])

missing_values = data.isnull().sum()
print(missing_values)
data["processed_text"] = (
    "wine review : " + data["country"].str.strip() + " : " + data["province"].str.strip() + " : " + data["variety"].str.strip() + " : " + data["description"].str.strip()
)


Number of rows: 129971
Unnamed: 0               0
country                  0
description              0
designation              0
points                   0
price                    0
province                 0
region_1                 0
region_2                 0
taster_name              0
taster_twitter_handle    0
title                    0
variety                  0
winery                   0
dtype: int64


In [11]:
text = " : ".join(data["processed_text"])


In [23]:
text[:1000]

"wine review : Italy : Sicily & Sardinia : White Blend : Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity. : wine review : Portugal : Douro : Portuguese Red : This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016. : wine review : US : Oregon : Pinot Gris : Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented. : wine review : US : Michigan : Riesling : Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish. : wine review : U

In [None]:
token_dict = tokenize_text(text)

In [17]:
import json
def save_vocab_to_json(token_dict, filename="vocab.json"):
    """
    Saves the vocabulary dictionary to a JSON file.
    The format will be:
    { 'vocab': [{'token': 'word', 'index': 0, 'count': 5}, ...] }
    """
    # Sort tokens by frequency in descending order
    sorted_tokens = sorted(token_dict.items(), key=lambda item: item[1], reverse=True)

    # Build the list of vocab entries with token, index, and count
    vocab_list = [{'token': token.strip(), 'index': idx, 'count': count} for idx, (token, count) in enumerate(sorted_tokens)]

    # Prepare the dictionary to be saved
    vocab_json = {'vocab': vocab_list}

    # Save to JSON file
    with open(filename, 'w') as f:
        json.dump(vocab_json, f, indent=4)
    

In [19]:
save_vocab_to_json(token_dict, "vocab.json")