### Import libraries

In [46]:
import pandas as pd
from sklearn import preprocessing
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import tensor
from torch import Generator
from transformers import RobertaTokenizer
from transformers import RobertaModel

# Record the installed PyTorch version alongside the notebook outputs.
print(torch.__version__)

2.0.1+cpu


### Set important variables

In [47]:
# Hugging Face checkpoint id used for both the tokenizer and the encoder.
MODEL_NAME: str = "roberta-base"

# Seed for the torch Generator that drives the train/validation/test split.
SEED: int = 420

# Optimizer step size. The original note says this is the "same as RoBERTa".
# NOTE(review): confirm against the fine-tuning settings you intend to reproduce.
LEARNING_RATE: float = 1e-6

### Import data

In [48]:
# The CSV is read with header=None, so column names are supplied here:
# first column is the category label, second is the product description.
df = pd.read_csv(
    'ecommerceDataset.csv',
    header=None,
    names=['Classification', 'Description'],
)
df.head()

Unnamed: 0,Classification,Description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


### Create the dataset class

In [51]:
class ECommerceDataset(Dataset):
    """Map-style dataset that tokenizes product descriptions on access.

    Args:
        dataframe: Frame with a 'Description' column (the text passed to the
            tokenizer) and a 'Classification' column (returned under 'class').
        tokenizer: Optional callable used to tokenize one description. When
            omitted, ``RobertaTokenizer.from_pretrained(MODEL_NAME)`` is loaded,
            which matches the previous behaviour.
    """

    def __init__(self, dataframe, tokenizer=None):
        # Read from the frame that was passed in. Previously the argument was
        # ignored and the notebook-level `df` was used, so the dataset silently
        # depended on whatever `df` held when it was constructed.
        self.descriptions = dataframe['Description']
        self.classifications = dataframe['Classification']
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of descriptions in the dataset."""
        return len(self.descriptions)

    def __getitem__(self, idx):
        """Tokenize the description at position ``idx``.

        Returns:
            dict with 'input_ids', 'attention_mask' and 'token_type_ids' taken
            from the tokenizer output, plus 'class' holding the label stored at
            the same position.
        """
        text = self.descriptions.iloc[idx]
        # truncation=True truncates to the model default (512 tokens per the
        # original author's note).
        encoded = self.tokenizer(
            text=text,
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'token_type_ids': encoded['token_type_ids'],
            'class': self.classifications.iloc[idx],
        }

### Create training, validation, and testing datasets

In [58]:
# Keep the fitted encoder so the integer ids can be mapped back to category
# names later (label_encoder.classes_ / label_encoder.inverse_transform).
label_encoder = preprocessing.LabelEncoder()

# NOTE: df['Classification'] is overwritten in place, so this cell is not
# idempotent. Re-run the data-loading cell before re-running this one,
# otherwise "Old labels" prints ids that were already encoded.
print(f"Old labels: {df['Classification'].unique()}")
df['Classification'] = label_encoder.fit_transform(df['Classification'])
print(f"New labels: {df['Classification'].unique()}")

complete_dataset = ECommerceDataset(df)

# A seeded generator makes the random split reproducible across runs.
generator = Generator().manual_seed(SEED)
train_dataset, validation_dataset, test_dataset = random_split(
    complete_dataset, [0.66, 0.17, 0.17], generator=generator
)
# Every sample must land in exactly one of the three subsets.
assert len(train_dataset) + len(validation_dataset) + len(test_dataset) == len(complete_dataset)

print(f"Keys: {list(complete_dataset[0].keys())}")

Old labels: [3 0 1 2]
New labels: [3 0 1 2]
Keys: ['input_ids', 'attention_mask', 'token_type_ids', 'class']


### Create model class

In [53]:
class ClassificationModel(torch.nn.Module):
    """Module holding a pretrained RoBERTa encoder and a lazily-shaped linear layer.

    NOTE(review): only ``__init__`` is visible in this cell; no ``forward`` is
    shown, so how ``roberta`` and ``l1`` are wired together cannot be confirmed.
    """
    def __init__(self):
        """Load the encoder for ``MODEL_NAME`` and create the linear layer."""
        super(ClassificationModel, self).__init__()

        # Encoder weights are loaded from the checkpoint id in MODEL_NAME.
        self.roberta = RobertaModel.from_pretrained(MODEL_NAME)
        # NOTE(review): torch.nn.LazyLinear requires an `out_features` argument,
        # so this call is expected to raise TypeError as written. Presumably it
        # should be the number of target classes — TODO confirm.
        self.l1 = torch.nn.LazyLinear()

        