In [1]:
import torch
from torch.utils.data import Dataset
import pandas as pd
from torch.nn.utils.rnn import pad_sequence


# Read Data

In [2]:


# Relative parquet paths of the two dataset splits, keyed by split name.
splits = {
    'train': 'yelp_review_full/train-00000-of-00001.parquet',
    'test': 'yelp_review_full/test-00000-of-00001.parquet',
}
# Read each split into its own DataFrame from the hf:// dataset location.
df_train = pd.read_parquet(
    "hf://datasets/Yelp/yelp_review_full/" + splits["train"]
)
df_test = pd.read_parquet(
    "hf://datasets/Yelp/yelp_review_full/" + splits["test"]
)

  from .autonotebook import tqdm as notebook_tqdm


# Tokenize and clean data

In [3]:
def remove_punc(document):
    """Tokenize a raw document into lowercase, letters-only words.

    The text is split on any run of whitespace (spaces, tabs, newlines).
    Every non-alphabetic character is removed from each token and the
    remaining characters are lowercased.  Tokens that end up with fewer
    than two characters are discarded.

    Args:
        document: Raw text of a single document.

    Returns:
        list[str]: The cleaned tokens in their original order.
    """
    cleaned_doc = []

    # split() with no argument breaks on all whitespace; splitting on " "
    # alone would fuse words separated by a newline or tab into one token
    # once the non-letters are stripped.
    for token in document.split():
        new_word = "".join(char.lower() for char in token if char.isalpha())
        # Drop empty remnants and single-letter tokens.
        if len(new_word) > 1:
            cleaned_doc.append(new_word)

    return cleaned_doc

# Store the token list produced by remove_punc for every training review.
df_train["cleaned_text"] = df_train["text"].map(remove_punc)

# Vocab and Lexicon

In [4]:


def get_vocab(lexicon, pad_token="<pad>"):
    """Assign a unique integer index to every word of the lexicon.

    Words are numbered in sorted order so the mapping is identical on every
    run; numbering them in set-iteration order would change whenever the
    interpreter's string hashing changes.  Index 0 is reserved for
    ``pad_token`` so that sequences padded with 0 do not alias a real word.

    Args:
        lexicon: Iterable of words (duplicates are ignored).
        pad_token: Key stored at index 0.  Pass ``None`` to reserve nothing
            and start numbering the words at 0.

    Returns:
        dict[str, int]: Word-to-index mapping with contiguous indices.
    """
    vocab = {} if pad_token is None else {pad_token: 0}

    for word in sorted(lexicon):
        # Skips duplicates and a lexicon entry equal to the pad token.
        if word not in vocab:
            vocab[word] = len(vocab)

    return vocab




def get_lexicon(documents=None):
    """Collect the set of distinct words appearing in tokenized documents.

    Args:
        documents: Iterable of token sequences.  When omitted, the
            ``cleaned_text`` column of the module-level ``df_train`` is used.

    Returns:
        set[str]: Every distinct token found across the documents.
    """
    if documents is None:
        documents = df_train["cleaned_text"]

    lexicon = set()
    for document in documents:
        # update() adds each token of the document in one call.
        lexicon.update(document)

    return lexicon



# Gather the distinct tokens of the training documents, then build the
# word -> integer index mapping from them.
lexicon = get_lexicon()
vocab = get_vocab(lexicon)

# Create Model, Dataset, and Dataloader

In [5]:

import torch.nn as nn


class FFNN(nn.Module):
    """Feed-forward network over embedded token indices.

    Each index is looked up in an embedding table and passed through a hidden
    linear layer with a ReLU non-linearity, followed by an output linear
    layer.  The layers act on the last dimension only; no pooling over token
    positions is performed here.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector.
        hidden_layer: Width of the hidden layer.
        classes: Number of output scores produced per position.
    """

    def __init__(self, vocab_size, embedding_size, hidden_layer, classes):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, hidden_layer)
        self.linear2 = nn.Linear(hidden_layer, classes)

    def forward(self, x):
        """Compute class scores for an integer tensor of token indices.

        Args:
            x: Integer tensor of indices into the embedding table.

        Returns:
            Tensor shaped like ``x`` with one extra trailing dimension of
            size ``classes``.
        """
        x = self.embeddings(x)
        # Without a non-linearity the two linear layers would collapse into
        # a single affine map, making the hidden layer pointless.
        x = torch.relu(self.linear1(x))
        x = self.linear2(x)

        return x
    


In [6]:


class customDataset(Dataset):
    """Dataset pairing each feature row with its label.

    Args:
        X: Features, as a tensor or anything ``torch.as_tensor`` accepts.
        Y: Labels, as a tensor or anything ``torch.as_tensor`` accepts.

    Raises:
        ValueError: If ``X`` and ``Y`` do not contain the same number of
            samples.
    """

    def __init__(self, X, Y):
        # as_tensor reuses an input that is already a tensor; torch.tensor()
        # would copy it and emit a "copy construct from a tensor" warning.
        self.x = torch.as_tensor(X)
        self.y = torch.as_tensor(Y)

        if len(self.x) != len(self.y):
            raise ValueError(
                f"X and Y must have the same length, got {len(self.x)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.x[idx], self.y[idx]




# Encode documents as vocabulary indices (integer encoding, not one-hot)

In [56]:
def ohe_document(document, word_to_index=None):
    """Encode a tokenized document as a tensor of vocabulary indices.

    Despite the name, the result is a sequence of integer indices (one per
    token), not a one-hot matrix.

    Args:
        document: Sequence of tokens.
        word_to_index: Mapping from token to integer index.  When omitted,
            the module-level ``vocab`` is used.

    Returns:
        torch.Tensor: 1-D ``torch.long`` tensor with one index per token.

    Raises:
        KeyError: If a token is missing from the mapping.
    """
    mapping = vocab if word_to_index is None else word_to_index
    indices = [mapping[word] for word in document]

    # The dtype is explicit because an empty list would otherwise produce a
    # float32 tensor, which is not a valid index tensor.
    return torch.tensor(indices, dtype=torch.long)


In [57]:
# Convert every cleaned token list into its tensor of vocabulary indices.
df_train["ohe_doc"] = df_train["cleaned_text"].map(ohe_document)

# Pad documents

In [58]:


# pad_sequence is documented to take a list of tensors, so pass a plain list
# instead of the pandas Series; this avoids depending on how a Series behaves
# when indexed or iterated as a generic sequence.  Shorter documents are
# right-padded with 0 up to the longest one.
X_train = pad_sequence(df_train["ohe_doc"].tolist(), batch_first=True, padding_value=0.0)
# Go through NumPy so the labels are handed over as one array rather than
# through element access on the Series, which is label-based.
Y_train = torch.tensor(df_train["label"].to_numpy())

In [69]:
# Size the embedding table from the vocabulary and emit one output score per
# distinct label value found in the training data.
model = FFNN(
    vocab_size=len(vocab),
    embedding_size=100,
    hidden_layer=100,
    classes=len(df_train["label"].unique()),
)