In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import os
from google.colab import drive 
import re
import zipfile
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# Mount Google Drive (Colab only) so the project files are reachable from this runtime.
drive.mount('/content/drive')
# Make the shared project data folder the working directory.
# NOTE(review): hard-coded Drive path — every collaborator needs this exact folder layout.
os.chdir('/content/drive/MyDrive/STOR566_FinalProject/data')

Mounted at /content/drive


In [None]:
def txt_cleaner(string):
    """Normalize a raw review string before tokenization.

    Steps, in order:
      1. Replace every HTML tag (e.g. ``<br />``) with a single space. A space
         is used rather than the empty string so that words on either side of
         a tag are not glued together ("film.<br />Loved" -> "film loved",
         not "filmloved").
      2. Drop every character that is neither a word character (letters,
         digits, underscore) nor whitespace.
      3. Lowercase the result.

    Args:
        string (str): raw review text.

    Returns:
        str: cleaned, lowercased text. Runs of spaces may remain where tags
        or punctuation were removed.
    """
    string = re.sub('<[^<]+?>', ' ', string)  # html tags -> space (keeps word boundaries)
    string = re.sub(r'[^\w\s]', '', string)  # strip punctuation / symbols
    # Should we implement a method to correct misspellings too? Seems like maybe too much effort...
    return string.lower()

In [None]:
#https://github.com/neilpl24/STOR566/data/IMDB Dataset.csv.zip
#https://github.com/neilpl24/STOR566/data/Restaurant_Reviews.tsv

# Restaurant reviews (tab-separated) are used as the test frame.
url_test = 'https://raw.githubusercontent.com/neilpl24/STOR566/main/data/Restaurant_Reviews.tsv'
df_test = (
    pd.read_csv(url_test, sep='\t')
    # Use the same column names as the training frame.
    .rename(columns={'Review': 'review', 'Liked': 'label'})
    # Apply the shared text cleaning to every review.
    .assign(review=lambda frame: frame["review"].apply(txt_cleaner))
)

In [None]:
# Preview the first rows of the cleaned test frame.
df_test.head()

Unnamed: 0,review,label
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [None]:
# Load the IMDB reviews used for training directly from the project's GitHub repo.
# NOTE(review): if this URL cannot be read, use the zip-file workaround in the next cell.
url_train = "https://raw.githubusercontent.com/neilpl24/STOR566/main/data/IMDB_Dataset.csv"
df_train = pd.read_csv(url_train)

In [None]:
# Workaround if the GitHub link isn't working: put the .zip file in your working Google Drive directory, then uncomment the lines below.

# import zipfile

# zf = zipfile.ZipFile('IMDB Dataset.csv.zip') 
# df_train = pd.read_csv(zf.open('IMDB Dataset.csv'))

In [None]:
# Encode sentiment as a binary label: 0 for "negative", 1 for everything else.
# A single vectorized comparison replaces the per-row .loc loop; the result is
# the same, but it avoids one Python-level pandas lookup per row.
df_train["label"] = (df_train["sentiment"] != "negative").astype("int64")
# The text label is no longer needed once the numeric one exists.
# Reassign instead of dropping in place so the frame's lineage stays explicit.
df_train = df_train.drop(columns=["sentiment"])
# Same cleaning as the test reviews: strip html/punctuation and lowercase.
df_train["review"] = df_train["review"].apply(txt_cleaner)

In [None]:
# Preview the first rows of the cleaned, label-encoded training frame.
df_train.head()

Unnamed: 0,review,label
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [None]:
class TextDataset(Dataset):
    """Map-style dataset that serves one row of a DataFrame per sample.

    Columns are read by POSITION, not by name: each sample is the pair
    (value in column 0, value in column 1). For a frame laid out as
    (review, label) that is (review, label); for any other column order the
    pair follows that order instead.
    """

    def __init__(self, df, transform=None):
        """
        Args:
            df (DataFrame): frame to draw samples from; its first two columns
                are the ones returned by ``__getitem__``.
            transform (callable, optional): Optional transform applied to the
                column-0 value of a sample before it is returned.
        """
        self.transform = transform
        self.df = df

    def __len__(self):
        """Number of rows (samples) in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(column-0 value, column-1 value)`` for row ``idx``."""
        # Samplers may hand over tensor indices; pandas wants plain ints/lists.
        row = idx.tolist() if torch.is_tensor(idx) else idx

        review = self.df.iloc[row, 0]
        label = self.df.iloc[row, 1]

        # Only the first element of the pair goes through the transform.
        if self.transform:
            review = self.transform(review)

        return review, label

In [None]:
# Pad tokenized reviews to a fixed length (256 in this notebook): longer reviews are truncated, shorter ones are zero-padded
def padding(df, pad_len):
    '''Zero-pad (or truncate) the token lists under "review" to a fixed length.

    Args:
        df (DataFrame): frame with a "review" column holding sequences of
            integer token ids and a "label" column. It is not modified.
        pad_len (int): length of every output sequence. Longer reviews are
            cut to their first ``pad_len`` ids; shorter ones are right-padded
            with zeros.

    Returns:
        DataFrame: a new frame with the same index as ``df`` and columns
        ("label", "review") in that order, where each "review" entry is an
        integer numpy array of length ``pad_len``.
    '''
    # Explicit copy: assigning a column to a bare df[["label"]] selection
    # triggers pandas' SettingWithCopyWarning.
    new_df = df[["label"]].copy()

    features = []
    # Iterate over the values themselves rather than df.loc[i, ...] so the
    # function also works when the index is not 0..n-1 (e.g. after filtering).
    for tokens in df["review"]:
        feature = np.zeros(pad_len, dtype=int)
        keep = min(pad_len, len(tokens))
        if keep:
            feature[:keep] = tokens[:keep]
        features.append(feature)

    # Positional assignment: one array per row, in row order.
    new_df['review'] = features
    return new_df

In [None]:
# Module-level tokenizer shared by yield_tokens below.
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    """Lazily tokenize the first element of every pair produced by ``data_iter``.

    The second element of each pair is ignored.
    """
    for sample in data_iter:
        text, _ = sample
        yield tokenizer(text)

def processing_func(train_df_orig, test_df_orig, pad_len=256):
    """Turn cleaned review text into fixed-length sequences of vocabulary ids.

    The vocabulary is built from the TRAINING reviews only; tokens that appear
    only in the test reviews map to the "<unk>" id. Both input frames are left
    untouched (the work is done on copies), so the calling cell can be re-run.

    Args:
        train_df_orig (DataFrame): training frame with a "review" column of
            cleaned text and a "label" column.
        test_df_orig (DataFrame): test frame with the same two columns.
        pad_len (int): length every review is truncated / zero-padded to.
            Defaults to 256.

    Returns:
        tuple(DataFrame, DataFrame): padded train and test frames as returned
        by ``padding`` (columns "label" and "review").
    """
    tokenizer = get_tokenizer('basic_english')

    # Tokenize each review once and reuse the result for both vocabulary
    # construction and id lookup.
    train_tokens = [tokenizer(text) for text in train_df_orig["review"]]
    test_tokens = [tokenizer(text) for text in test_df_orig["review"]]

    vocab = build_vocab_from_iterator(train_tokens, specials=["<unk>"])
    # Out-of-vocabulary tokens resolve to "<unk>" instead of raising.
    # NOTE(review): "<unk>" presumably gets id 0 (specials first by default),
    # the same value padding() fills with — confirm this overlap is acceptable.
    vocab.set_default_index(vocab["<unk>"])

    train_df = _encode_reviews(train_df_orig, train_tokens, vocab)
    test_df = _encode_reviews(test_df_orig, test_tokens, vocab)

    train_df = padding(train_df, pad_len)
    test_df = padding(test_df, pad_len)

    return train_df, test_df


def _encode_reviews(df, token_lists, vocab):
    """Return a new (review, label) frame whose reviews are lists of vocabulary ids."""
    encoded = df[["review", "label"]].copy()
    encoded["review"] = [vocab(tokens) for tokens in token_lists]
    return encoded

In [None]:
# Convert both frames to padded token-id sequences (vocabulary built from the training reviews).
df_train_tokenized, df_test_tokenized = processing_func(df_train, df_test)

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap the padded frames as datasets and batch them 8 rows at a time, in row order (no shuffling).
# NOTE(review): TextDataset reads columns by position and padding() returns them ordered
# as (label, review), so every sample -- and therefore every batch -- is (labels, token ids).
train_iter = TextDataset(df = df_train_tokenized)
train_dataloader = DataLoader(train_iter, batch_size=8, shuffle=False)
test_iter = TextDataset(df_test_tokenized)
test_dataloader = DataLoader(test_iter, batch_size=8, shuffle=False)

In [None]:
# Pull a single batch to inspect what the dataloader yields.
next(iter(train_dataloader))

[tensor([1, 1, 1, 0, 1, 1, 1, 0]),
 tensor([[  27,    4,    1,  ...,   19,  610,    2],
         [   3,  382,  113,  ...,    0,    0,    0],
         [   9,  191,   10,  ...,    0,    0,    0],
         ...,
         [ 233,   55, 3747,  ...,    0,    0,    0],
         [   9,  246,   56,  ...,    0,    0,    0],
         [  10,  117,   12,  ...,    0,    0,    0]])]

In [None]:
# enumerate() pairs each batch with its running index, so `zero` is 0 on this first pass.
for zero, batch in enumerate(train_dataloader):
    print(batch)
    break  # only the first batch is needed for inspection

[tensor([1, 1, 1, 0, 1, 1, 1, 0]), tensor([[  27,    4,    1,  ...,   19,  610,    2],
        [   3,  382,  113,  ...,    0,    0,    0],
        [   9,  191,   10,  ...,    0,    0,    0],
        ...,
        [ 233,   55, 3747,  ...,    0,    0,    0],
        [   9,  246,   56,  ...,    0,    0,    0],
        [  10,  117,   12,  ...,    0,    0,    0]])]


As the output above shows, each batch comes out of the dataloader as a list of two tensors. If you loop with "zero, batch in enumerate(dataloader)" during training, the first value is simply the batch index that `enumerate` supplies (it is 0 above only because we stop after the first batch), and `batch` is the list holding the two tensors. Their contents are as follows:

1st tensor (`batch[0]`): The labels for the batch; 1 is positive, 0 is negative.

2nd tensor (`batch[1]`): A 2-D tensor with one row per review, where each row is a length-256, zero-padded tokenized review, in the same order as the labels.

This means that each observation can be represented as follows:

Observation i in batch:

reviews=batch[1]

labels=batch[0]

review_i = reviews[i]

label_i = labels[i]