In [7]:
import nltk
import scipy
import gensim
import datasets
import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def download_datasets():
    """Return the GoEmotions and Yelp review datasets, caching them under ``data/``.

    For each dataset, the copy saved at ``data/<name>.hf`` is loaded when it
    exists; otherwise the dataset is fetched with ``datasets.load_dataset`` and
    saved to that path. This avoids hitting the hub on every call once the
    on-disk copies are present.

    Returns:
        tuple: ``(goemotion, yelp)`` as returned by ``load_dataset`` /
        ``load_from_disk``.
    """
    # exist_ok makes this a no-op when the directory is already there and
    # avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs("data", exist_ok=True)

    if not os.path.exists("data/goemotion.hf"):
        goemotion = datasets.load_dataset("go_emotions", "simplified")
        print("Saving goemotion to disk")
        goemotion.save_to_disk("data/goemotion.hf")
    else:
        print("Goemotion already exists on disk")
        goemotion = datasets.load_from_disk("data/goemotion.hf")

    if not os.path.exists("data/yelp.hf"):
        yelp = datasets.load_dataset("Yelp/yelp_review_full")
        print("Saving yelp to disk")
        yelp.save_to_disk("data/yelp.hf")
    else:
        print("Yelp already exists on disk")
        yelp = datasets.load_from_disk("data/yelp.hf")

    return goemotion, yelp

if __name__ == "__main__":
    # Obtain both datasets, then list the split names each one exposes
    # (one line of output per dataset).
    goemotion, yelp = download_datasets()
    print(goemotion.keys(), yelp.keys(), sep="\n")

Goemotion already exists on disk
Yelp already exists on disk
dict_keys(['train', 'validation', 'test'])
dict_keys(['train', 'test'])


In [22]:
# Reload the copies previously written under data/.
goemotion, yelp = (
    datasets.load_from_disk("data/goemotion.hf"),
    datasets.load_from_disk("data/yelp.hf"),
)

# Keep handles on the train/test splits of each dataset.
goemotion_train, goemotion_test = goemotion["train"], goemotion["test"]
yelp_train, yelp_test = yelp["train"], yelp["test"]

# Show the first training record of each dataset, one per line.
print(goemotion_train[0], yelp_train[0], sep="\n")

{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}
{'label': 4, 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."}


In [23]:
def tokenize_datasets(goemotion, yelp, tokenizer):
    """Tokenize the ``'text'`` field of both datasets with ``tokenizer``.

    Each dataset's ``map`` is called in batched mode (GoEmotions first, then
    Yelp) with a function that runs ``tokenizer`` over the batch's ``'text'``
    entries, requesting max-length padding and truncation.

    Args:
        goemotion: object exposing ``map(fn, batched=...)`` whose batches
            carry a ``'text'`` entry.
        yelp: same contract as ``goemotion``.
        tokenizer: callable accepting ``(texts, padding=..., truncation=...)``.

    Returns:
        tuple: ``(goemotion, yelp)`` -- whatever each ``map`` call returned.
    """
    def _encode_batch(batch):
        # Forward only the raw text; the other fields are left to map().
        return tokenizer(batch['text'], padding='max_length', truncation=True)

    goemotion_tokenized = goemotion.map(_encode_batch, batched=True)
    yelp_tokenized = yelp.map(_encode_batch, batched=True)
    return goemotion_tokenized, yelp_tokenized

from transformers import AutoTokenizer

# Tokenize both datasets with the pretrained "bert-base-uncased" tokenizer,
# rebinding the dataset names to the tokenized results.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
goemotion, yelp = tokenize_datasets(goemotion=goemotion, yelp=yelp, tokenizer=tokenizer)

# The split names are printed again after the map, one dataset per line.
print(goemotion.keys(), yelp.keys(), sep="\n")

Map: 100%|██████████| 43410/43410 [00:06<00:00, 6554.57 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 6970.06 examples/s]
Map: 100%|██████████| 5427/5427 [00:00<00:00, 7119.08 examples/s]
Map: 100%|██████████| 650000/650000 [01:57<00:00, 5544.14 examples/s]
Map: 100%|██████████| 50000/50000 [00:08<00:00, 5584.12 examples/s]

dict_keys(['train', 'validation', 'test'])
dict_keys(['train', 'test'])





In [25]:
# Inspect the first tokenized test record (the original fields plus the
# columns added by the tokenizer).
print(goemotion["test"][0])

{'text': 'I’m really sorry about your situation :( Although I love the names Sapphira, Cirilla, and Scarlett!', 'labels': [25], 'id': 'eecwqtt', 'input_ids': [101, 1045, 1521, 1049, 2428, 3374, 2055, 2115, 3663, 1024, 1006, 2348, 1045, 2293, 1996, 3415, 20066, 21850, 2527, 1010, 25022, 24714, 2050, 1010, 1998, 20862, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [27]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

# Define constants
# Training hyperparameters shared by the definitions below.
BATCH_SIZE = 32  # mini-batch size passed to create_data_loader (and read by train_model)
EPOCHS = 3  # number of passes train_model makes over the data loader
LEARNING_RATE = 1e-3  # learning rate handed to the Adam optimizer

class YelpDataset(Dataset):
    """Map-style dataset pairing column-wise encodings with labels.

    Args:
        encodings: mapping from field name to an indexable column; it must
            provide ``.items()``, and every column is indexed by sample
            position.
        labels: indexable collection of labels; its length defines the
            dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build one sample: every encoding field plus a ``'labels'`` tensor."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

class LSTMClassifier(nn.Module):
    """Embedding -> (optionally bidirectional) LSTM -> linear classifier.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        hidden_dim: embedding width; also used as the LSTM hidden size.
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the embeddings and to the
            final hidden state, and between LSTM layers when ``n_layers > 1``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        # nn.LSTM only applies dropout between stacked layers; passing a
        # non-zero value with a single layer has no effect and emits a warning.
        inter_layer_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            hidden_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=inter_layer_dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits for a batch of token-id sequences.

        Args:
            text: integer tensor of token ids, batch-first.
            text_lengths: per-sequence lengths (tensor on any device, or a
                list of ints).

        Returns:
            Tensor with one row of ``output_dim`` logits per sequence.
        """
        embedded = self.dropout(self.embedding(text))
        # pack_padded_sequence requires the lengths to live on the CPU, even
        # when the inputs are on the GPU; normalise here so callers may pass
        # lengths on any device.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        if self.lstm.bidirectional:
            # The last two entries of `hidden` are the top layer's forward and
            # backward final states; concatenate them feature-wise.
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)

def download_or_load_datasets():
    """Return the Yelp review dataset, using ``data/yelp.hf`` as a disk cache.

    The ``data`` directory is created when missing. If the cached copy
    exists it is loaded from disk; otherwise the dataset is downloaded with
    ``load_dataset`` and saved to the cache path before being returned.
    """
    yelp_path = "data/yelp.hf"

    # Make sure the cache directory is present before touching it.
    if not os.path.exists("data"):
        os.makedirs("data")

    # Fast path: a previously saved copy is available.
    if os.path.exists(yelp_path):
        print("Loading Yelp from disk")
        return load_from_disk(yelp_path)

    # Slow path: fetch the dataset and persist it for the next run.
    print("Downloading Yelp dataset")
    yelp = load_dataset("Yelp/yelp_review_full")
    print("Saving Yelp to disk")
    yelp.save_to_disk(yelp_path)
    return yelp

def tokenize_datasets(yelp, tokenizer):
    """Tokenize the ``'text'`` field of the Yelp dataset with ``tokenizer``.

    NOTE: this single-dataset definition reuses the name of the earlier
    ``tokenize_datasets(goemotion, yelp, tokenizer)`` in this notebook and
    replaces it once this cell has run.

    Args:
        yelp: object exposing ``map(fn, batched=...)`` whose batches carry a
            ``'text'`` entry.
        tokenizer: callable accepting ``(texts, padding=..., truncation=...)``.

    Returns:
        Whatever ``yelp.map`` returns for the batched tokenization function.
    """
    def _encode_batch(batch):
        # Request max-length padding and truncation for every batch of text.
        return tokenizer(batch['text'], padding='max_length', truncation=True)

    return yelp.map(_encode_batch, batched=True)

def create_data_loader(yelp, batch_size):
    """Build a shuffled training ``DataLoader`` over the Yelp train split.

    Args:
        yelp: tokenized dataset; its ``'train'`` split must provide
            ``'input_ids'`` and ``'label'`` columns.
        batch_size: number of samples per batch.

    Returns:
        DataLoader yielding dicts with ``'input_ids'`` and ``'labels'``.
    """
    train_split = yelp['train']
    # YelpDataset iterates over `encodings.items()`, so it needs a mapping of
    # field name -> column rather than the bare input_ids column. The key must
    # be 'input_ids' because that is what train_model reads from each batch.
    encodings = {'input_ids': train_split['input_ids']}
    train_dataset = YelpDataset(encodings, train_split['label'])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    return train_loader

def train_model(model, data_loader, loss_fn, optimizer, device, epochs=None, pad_token_id=0):
    """Train ``model`` on ``data_loader`` and print the last batch loss per epoch.

    Args:
        model: module called as ``model(input_ids, lengths)``.
        data_loader: iterable of batches, each a dict with ``'input_ids'`` and
            ``'labels'`` tensors.
        loss_fn: callable ``loss_fn(output, labels)`` returning a scalar tensor.
        optimizer: optimizer over ``model``'s parameters.
        device: device the model and batch tensors are moved to.
        epochs: number of passes over ``data_loader``; defaults to the
            module-level ``EPOCHS`` when omitted.
        pad_token_id: token id treated as padding when measuring sequence
            lengths. The default of 0 matches the zero padding visible in the
            tokenized examples of this notebook -- confirm against
            ``tokenizer.pad_token_id`` if a different tokenizer is used.
    """
    n_epochs = EPOCHS if epochs is None else epochs
    model = model.to(device)
    model.train()

    for epoch in range(n_epochs):
        last_loss = None
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # Per-row count of non-padding tokens. Derived from the batch
            # itself so a short final batch gets the right number of lengths,
            # and so the model sees real lengths rather than the padded width.
            # Assumes padding is appended on the right. clamp() keeps every
            # length >= 1, and .cpu() is needed because
            # pack_padded_sequence requires CPU lengths.
            lengths = (input_ids != pad_token_id).sum(dim=1).clamp(min=1).cpu()

            optimizer.zero_grad()
            output = model(input_ids, lengths)
            loss = loss_fn(output, labels)
            loss.backward()
            optimizer.step()
            last_loss = loss.item()

        # An empty loader yields no loss to report; skip instead of failing
        # on an unbound variable.
        if last_loss is not None:
            print(f'Epoch {epoch + 1}/{n_epochs}, Loss: {last_loss}')

if __name__ == "__main__":
    # --- Data -------------------------------------------------------------
    # Get the Yelp dataset (disk cache or download), tokenize it with the
    # pretrained "bert-base-uncased" tokenizer, and wrap the train split in
    # a DataLoader.
    yelp = download_or_load_datasets()
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    yelp = tokenize_datasets(yelp, tokenizer)
    train_loader = create_data_loader(yelp, BATCH_SIZE)

    # --- Model ------------------------------------------------------------
    # The embedding table is sized from the tokenizer's vocabulary.
    INPUT_DIM = tokenizer.vocab_size
    HIDDEN_DIM, OUTPUT_DIM = 256, 5
    N_LAYERS, BIDIRECTIONAL = 2, True
    DROPOUT = 0.5

    model = LSTMClassifier(
        input_dim=INPUT_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=OUTPUT_DIM,
        n_layers=N_LAYERS,
        bidirectional=BIDIRECTIONAL,
        dropout=DROPOUT,
    )

    # --- Optimisation -----------------------------------------------------
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # --- Training ---------------------------------------------------------
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_model(model, train_loader, loss_fn, optimizer, device)


Loading Yelp from disk


Map:  11%|█         | 72000/650000 [00:12<01:38, 5866.88 examples/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f229279fcd0>>
Traceback (most recent call last):
  File "/home/mhassa2s/miniconda3/envs/nlp-proj/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Map:  22%|██▏       | 142000/650000 [00:25<01:29, 5663.50 examples/s]