In [20]:
!pip install transformers -U

[0m

In [21]:
import pandas as pd
import numpy as np

df = pd.read_csv('../datasets/ecommerceDataset.csv', names = ['target', 'feature'])
df.head(3)

Unnamed: 0,target,feature
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...


In [22]:
df.dropna(inplace=True)

## Label mapping

- We have to map the labels to numerical values, since BERT requires numerical labels

In [23]:
label_mapping = {'Household' : 0, 'Books': 1, 'Electronics': 2, 'Clothing & Accessories': 3}
df['target'] = df['target'].replace(label_mapping)
df.head(3)

Unnamed: 0,target,feature
0,0,Paper Plane Design Framed Wall Hanging Motivat...
1,0,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,0,SAF 'UV Textured Modern Art Print Framed' Pain...


## Now we preprocess the data.

- convert_to_lowercase:  converts the text to lowercase 
- remove_whitespaces: removes unnecessary empty whitespaces from the text 
- remove_punctuations: removes punctuations but we keep the apostrophes 
- remove_html: removes html links from the text 
- remove_http: removes http links from the text 
- remove_stopwords: removing stop words since they have no impact on the classification procedure 
- text_stemmer: converting the words to their root form 
- discard_non_alpha: discarding non-alphabetic words because they create unnecessary diversions 

In [24]:
"""
Now we do Data Preprocessing.
"""
import string, re, nltk
from string import punctuation
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

regexp = RegexpTokenizer("[\w']+")

def convert_to_lowercase(text):
    return text.lower()
def remove_whitespace(text):
    return text.strip()
def remove_punctuation(text):
    punct_str = string.punctuation
    punct_str = punct_str.replace("'", "") 
    return text.translate(str.maketrans("", "", punct_str))
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)
def remove_http(text):
    http = "https?://\S+|www\.\S+" 
    pattern = r"({})".format(http) 
    return re.sub(pattern, "", text)
# Stopwords
stops = stopwords.words("english") 
addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"]
allstops = stops + addstops
def remove_stopwords(text):
    return " ".join([word for word in regexp.tokenize(text) if word not in allstops])
stemmer = PorterStemmer()
def text_stemmer(text):
    text_stem = " ".join([stemmer.stem(word) for word in regexp.tokenize(text)])
    return text_stem
def discard_non_alpha(text):
    word_list_non_alpha = [word for word in regexp.tokenize(text) if word.isalpha()]
    text_non_alpha = " ".join(word_list_non_alpha)
    return text_non_alpha

# Integration process 

- We integrate the text normalization processes in appropriate order. We also converted the text into one line and removed square brackets.

In [25]:
def text_normalizer(text):
    text = convert_to_lowercase(text)
    text = remove_whitespace(text)
    text = re.sub('\n' , '', text) 
    text = re.sub('\[.*?\]', '', text) 
    text = remove_http(text)
    text = remove_punctuation(text)
    text = remove_html(text)
    text = remove_stopwords(text)
    text = discard_non_alpha(text)
    return text

In [26]:
df['feature'] = df['feature'].apply(text_normalizer)

In [27]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification

## Initializing our BERT model

- we use the 'bert-base-uncased' model and specify the number of labels in the dataset

In [28]:
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Enabling GPU usage

- if you prefer the usage of your GPU, uncomment the following line of code

In [29]:

#model = model.to('cuda')

AssertionError: Torch not compiled with CUDA enabled

## Train-Test split

- setting the train size to 80% and test size to 20%
- setting the maximum token length to 512

In [None]:
X = list(df['feature'])
y = list(df['target'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length = 512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length = 512)

In [None]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
print(X_train_tokenized['attention_mask'][0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
len(X_train),len(X_val)

(40339, 10085)

## Creating a custom torch dataset

- the constructor takes two arguments:
    - encodings: dictionary containing the input (input_ids, attention_mask) encodings for the text data. 
    - labels: optional argument, which represents the labels associated with the text data.
- __getitem__ method takes one parameter:
    - idx: index of the retrieved item
    - method that defines how to retrieve an item from the dataset given an index
- __len__ method:
    - method that returns the öength of the dataset, which is determined by the number of elements in the input_ids key of the encodings dictionary. 

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [None]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [None]:
train_dataset[5]

{'input_ids': tensor([  101,  8692, 17978,  2461,  4895,  5562,  2595, 17072,  3609,  4031,
          6412,  8692, 11552,  4853, 17072, 11598,  2298,  3376,  1043, 10278,
         25373, 17072,  8692, 17072,  2507,  9657, 16115,  4895, 29278, 18150,
         10880, 11084,  3096, 19699,  9013, 18718,  6625,  4929,  8650,  2159,
          6497,  3228,  4895, 29278, 18150, 10880, 11084, 17072,  2190, 10897,
          8692,  4435, 18058,  2569,  3737, 16611,  2111,  5535,  5907, 14939,
         10943,  2100,  6412,  2706,  5814, 18419,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

## Compute the standard evaluation measurements

- defined $accuracy$, $recall$, $precision$, $f1$
- delete the average parameter if the dataset has two label categories

In [None]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average= 'weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

## Define the arguments and the trainer of the model

- the arguments object specifies the hyperparameters used for the BERT text classification
- the trainer object initialized the model, arguments, train/test sets and the metrics to calculate

In [None]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    learning_rate = 1e-4,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)




## Train the model

In [None]:
trainer.train()
#288mins

[34m[1mwandb[0m: Currently logged in as: [33mmakarwuckert-1[0m ([33mmakarwu[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.11
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/Users/makarwuckert/Desktop/Bachelorarbeit/thesis_v2/bert/wandb/run-20231005_191955-6ln4liuz[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mclean-glade-2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/makarwu/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/makarwu/huggingface/runs/6ln4liuz[0m


  0%|          | 0/2522 [00:00<?, ?it/s]

{'loss': 0.3429, 'learning_rate': 8.017446471054719e-05, 'epoch': 0.2}


KeyboardInterrupt: 

## Evaluate the model

In [None]:
trainer.evaluate()

  0%|          | 0/1261 [00:00<?, ?it/s]

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.11813032627105713,
 'eval_accuracy': 0.9715418939018344,
 'eval_precision': 0.9715319505957817,
 'eval_recall': 0.9715418939018344,
 'eval_f1': 0.9715284197336058,
 'eval_runtime': 1036.1791,
 'eval_samples_per_second': 9.733,
 'eval_steps_per_second': 1.217,
 'epoch': 1.0}