In [1]:
import glob
import pandas as pd
import os

from imblearn.over_sampling import RandomOverSampler 

from sklearn.model_selection import train_test_split

from transformers import DistilBertTokenizerFast

import torch



In [None]:
%%time
path = '../datasets/merged/'
all_files = glob.glob(os.path.join(path, "*.csv"))

df_from_each_file = (pd.read_csv(f) for f in all_files)
df = pd.concat(df_from_each_file, ignore_index=True)

In [None]:
# Product categories in a fixed order: a category's position in this list is
# used as its integer class id, so the order must not change.
category_list = [
    "appliances", "arts_crafts_and_sewing", "automotive",
    "baby", "beauty",
    "cell_phones_and_accessories", "clothing_shoes_and_jewelry",
    "electronics",
    "grocery_and_gourmet_food",
    "health_and_personal_care", "home_and_kitchen",
    "industrial_and_scientific",
    "musical_instruments",
    "office_products",
    "patio_lawn_and_garden", "pet_supplies",
    "software", "sports_and_outdoors",
    "tools_and_home_improvement", "toys_and_games",
    "video_games",
]

In [None]:
# Encode each category name as an integer class id (its position in category_list).
# A dict lookup avoids re-scanning the list for every row.
label_by_category = {name: idx for idx, name in enumerate(category_list)}

# Fail loudly on categories (or missing values) that have no class id; otherwise
# .map() would silently produce NaN labels and turn the column into floats.
unknown_categories = set(df['category'].unique()) - set(label_by_category)
if unknown_categories:
    raise ValueError(
        f"Categories missing from category_list: {sorted(map(str, unknown_categories))}"
    )

df['label'] = df['category'].map(label_by_category)

In [None]:
# Preview the first rows (including the new `label` column) instead of rendering the whole frame.
df.head()

In [None]:
# Bar chart of the absolute number of rows per category (class balance check).
df['category'].value_counts().plot.bar();

In [None]:
# Features are the `text` and `asin` columns; the target is the integer `label` column.
# Random oversampling is currently switched off. To re-enable it, uncomment:
#   ros = RandomOverSampler(random_state=42)
#   X_res, y_res = ros.fit_resample(X, y)
X, y = df[['text', 'asin']], df['label']

In [None]:
# Disabled together with the oversampling step (y_res only exists when it is enabled):
# y_res.value_counts(normalize=False).plot(kind='bar');

In [None]:
# Disabled together with the oversampling step (y_res only exists when it is enabled):
# y_res.shape

In [None]:
# Hold out 10% of the rows as the test set, preserving class proportions.
# Oversampling is disabled, so the split runs on X / y and must stratify on `y`;
# `y_res` is only created by the commented-out resampling code and would raise NameError.
# (If oversampling is re-enabled, pass X_res, y_res and stratify=y_res instead.)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=.1,
    random_state=42,
    stratify=y,
)

In [None]:
# Split the remaining train+val pool again: 20% of it becomes the validation set,
# stratified on the pool's labels and seeded for repeatability.
val_split_options = dict(test_size=.2, random_state=42, stratify=y_train_val)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, **val_split_options
)

In [None]:
# Load the fast tokenizer that matches the DistilBERT checkpoint used for fine-tuning.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

In [None]:
def _encode_texts(frame):
    """Tokenize the `text` column of `frame` with truncation and padding enabled."""
    return tokenizer(frame['text'].tolist(), truncation=True, padding=True)


# Tokenize each split separately; every split is padded within its own tokenizer call.
train_encodings = _encode_texts(X_train)
val_encodings = _encode_texts(X_val)
test_encodings = _encode_texts(X_test)

In [None]:
class AmazonDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example, as returned by the tokenizer.
        labels: One label per example, in the same order as ``encodings``.
            May be a list, array, or pandas Series.

    Each item is a dict of tensors: one per encoding field plus ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Store labels positionally. A pandas Series coming out of train_test_split
        # keeps its original (shuffled, non-contiguous) index, and `series[idx]` is a
        # label-based lookup -- it would raise KeyError or return another row's label.
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else list(labels)

    def __getitem__(self, idx):
        # Select the idx-th entry of every encoding field and convert it to a tensor.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
%%time
train_dataset = AmazonDataset(train_encodings, y_train)
val_dataset = AmazonDataset(val_encodings, y_val)
test_dataset = AmazonDataset(test_encodings, y_test)

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
%%time
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

In [None]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification

# Alternative to the Trainer cell: a hand-written PyTorch fine-tuning loop.
# Note that this rebinds `model` to a freshly loaded checkpoint.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Size the classification head to the number of categories (the default is 2 classes).
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(category_list),
)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 matches the default of the deprecated class.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels makes the model compute the loss; it is the first output.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

# Switch to inference mode once training is done.
model.eval()