In [1]:
from datasets import load_dataset

# Load the "yelp_review_full" dataset. The run recorded below reports a
# 650,000-example train split and a 50,000-example test split.
dataset = load_dataset("yelp_review_full")

Downloading readme:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 299M/299M [00:01<00:00, 158MB/s]  
Downloading data: 100%|██████████| 23.5M/23.5M [00:00<00:00, 120MB/s] 


Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [12]:
import pandas as pd

# Materialize both splits as pandas frames. `Dataset.to_pandas()` converts the
# split directly, instead of having `pd.DataFrame(...)` iterate the dataset
# example by example in Python.
train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()

# Stack train on top of test. `ignore_index=True` renumbers the rows 0..n-1 in
# the same step, so row labels are unique and can serve as document ids.
df = pd.concat([train_df, test_df], ignore_index=True)

In [13]:
# Preview the combined frame (train rows followed by test rows)
df

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...
...,...,...
699995,0,Just wanted to write a review to chip in with ...
699996,4,Great ambience. Great drinks. Great food. I lo...
699997,3,I have been to the other Monks locations so I ...
699998,1,Don't go here. I know you might want to try i...


In [14]:
# Positional inverted index, filled in by the indexing loop below.
# Each value is a mutable 3-item list (not a tuple), because the two counters
# are incremented in place while indexing:
#
# dictionary =
# {
#     word: [
#         number of docs containing word,            # slot 0
#         {
#             doc_id: [position_list]                # slot 1
#         },
#         number of times word appears in all docs   # slot 2
#     ]
# }

dictionary = {}

In [15]:
import re

def apply_preprocessing(text):
    """Normalize a raw review string before it is split into words.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is neither a regex word character
         (letters, digits, underscore) nor whitespace;
      3. collapse each run of whitespace into one space.

    Leading/trailing whitespace is collapsed to a single space rather than
    stripped.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation)

In [16]:
len_df = len(df)

# Only refresh the progress line every this many documents; writing to stdout
# once per row dominates the runtime on a corpus this large.
PROGRESS_EVERY = 1000

# Start from an empty index so re-running this cell rebuilds the index instead
# of double-counting on top of a previous run.
dictionary = {}

# Iterate only the text column; `doc_id` is the row label of the frame.
# `done` counts processed rows independently of the index labels.
for done, (doc_id, raw_text) in enumerate(df['text'].items(), start=1):
    if done % PROGRESS_EVERY == 0 or done == len_df:
        print(f'Processing progress: {done}/{len_df} ({done/len_df*100:.2f}%)', end='\r')

    words = apply_preprocessing(raw_text).split()

    # `position` is the word's 0-based offset within the preprocessed document
    for position, word in enumerate(words):
        entry = dictionary.get(word)

        # First time this word is seen anywhere: create its entry
        # [docs containing word, {doc_id: [positions]}, total occurrences]
        if entry is None:
            dictionary[word] = [1, {doc_id: [position]}, 1]
            continue

        postings = entry[1]
        if doc_id in postings:
            postings[doc_id].append(position)   # Word seen again in this doc
        else:
            postings[doc_id] = [position]       # First occurrence in this doc
            entry[0] += 1                       # One more doc contains the word
        entry[2] += 1                           # One more occurrence overall

Processing progress: 700000/700000 (100.00%)

In [19]:
# Write the index to CSV: one row per word, one column per slot of its entry
(
    pd.DataFrame
    .from_dict(data=dictionary, orient='index')
    .to_csv(path_or_buf='dictionary.csv')
)

In [20]:
import pickle

# Serialize the in-memory index to dictionary.pkl in binary pickle format
with open('dictionary.pkl', 'wb') as pickle_file:
    pickle.dump(dictionary, pickle_file)

In [21]:
# Recursively archive everything under /kaggle/working into dictionary.zip
!zip -r dictionary.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/dictionary.csv (deflated 67%)
  adding: kaggle/working/dictionary.pkl (deflated 52%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)


In [29]:
# The 100 most frequent words as (word, total occurrences) pairs, ranked by
# the total-occurrence counter stored in slot 2 of each index entry
top_words = [
    (term, stats[2])
    for term, stats in sorted(
        dictionary.items(),
        key=lambda item: item[1][2],
        reverse=True,
    )[:100]
]
top_words

[('the', 4693110),
 ('and', 2980819),
 ('a', 2414048),
 ('i', 2404391),
 ('to', 2322219),
 ('was', 1702865),
 ('of', 1417451),
 ('it', 1225561),
 ('for', 1096790),
 ('in', 1060934),
 ('is', 1057866),
 ('that', 878413),
 ('my', 801615),
 ('but', 774949),
 ('we', 731080),
 ('this', 720109),
 ('with', 719902),
 ('they', 689000),
 ('you', 665555),
 ('on', 663149),
 ('not', 636108),
 ('have', 581163),
 ('had', 545925),
 ('were', 528637),
 ('at', 523133),
 ('so', 473848),
 ('are', 430036),
 ('food', 410955),
 ('be', 410047),
 ('good', 406599),
 ('place', 400718),
 ('there', 376833),
 ('as', 375842),
 ('me', 356557),
 ('like', 336812),
 ('just', 331939),
 ('if', 330403),
 ('out', 324986),
 ('all', 319060),
 ('very', 299378),
 ('our', 298167),
 ('get', 294333),
 ('here', 292046),
 ('one', 287518),
 ('its', 286411),
 ('or', 267757),
 ('when', 263244),
 ('would', 260465),
 ('from', 260232),
 ('time', 259559),
 ('up', 256568),
 ('great', 255719),
 ('service', 248907),
 ('their', 243709),
 ('about