# Brand Extraction

In [None]:
import pandas as pd
from re import search
from random import seed
from numpy.random import seed as np_seed
from eda import stop_words, get_only_chars, eda

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

# Show which PyTorch build this notebook run is using.
print("PyTorch version:", torch.__version__)

## Miscellaneous

In [None]:
# Notebook-wide switch forwarded as `verbose=` to the training call further down.
verbose = True
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Special tokens: get_document() frames each sample as
# [BOS]<col1>[SEP]<col2>[EOS]; get_model() registers all four with the tokenizer,
# [PAD] as its padding token.
bos_token = "[BOS]"
eos_token = "[EOS]"
pad_token = "[PAD]"
sep_token = "[SEP]"


class TransformersDataset(Dataset):
    """Map-style dataset pairing each row of token ids with its attention mask.

    Both containers are indexed positionally through ``.iloc``, so they must
    be pandas objects (e.g. ``pd.Series``); the dataset length is taken from
    ``input_ids``.
    """

    def __init__(self, input_ids, attention_mask):
        self.input_ids, self.attention_mask = input_ids, attention_mask

    def __len__(self):
        # The token-id container alone decides how many samples there are.
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Return the (input_ids, attention_mask) pair stored at position idx.
        return self.input_ids.iloc[idx], self.attention_mask.iloc[idx]


def train(dataloader, model, optimizer, device="cuda", step_size=1, verbose=False):
    """Run one training epoch with gradient accumulation.

    Each batch yields ``(X, y)`` where ``X`` is passed to the model as the
    input and as ``labels`` and ``y`` as ``attention_mask``; the model's
    returned object must expose a ``.loss`` tensor.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing ``.dataset``.
        model: Module called as ``model(X, attention_mask=y, labels=X)``.
        optimizer: Optimizer providing ``step()`` and ``zero_grad()``.
        device: Device the batches are moved to.
        step_size: Number of batches whose gradients are accumulated before
            each optimizer step.
        verbose: When true, print the loss of every 100th batch.

    Returns:
        The mean per-batch loss over the epoch.

    Raises:
        ZeroDivisionError: If the dataloader yields no batches.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.train()
    optimizer.zero_grad()
    steps = 0
    train_loss = 0
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error; the input ids double as the labels.
        pred = model(X, attention_mask=y, labels=X)
        loss = pred.loss
        batch_loss = loss.item()
        train_loss += batch_loss

        # Backpropagation: gradients add up until step_size batches were seen.
        loss.backward()
        steps += 1
        if steps % step_size == 0:
            optimizer.step()
            optimizer.zero_grad()

        if verbose and batch % 100 == 0:
            current = batch * len(X)
            print(f"loss: {batch_loss:>7f}  [{current:>5d}/{size:>5d}]")

    # Flush only a genuinely incomplete accumulation group. Stepping
    # unconditionally would apply an extra update on already-used (or cleared)
    # gradients whenever the batch count is a multiple of step_size.
    if steps % step_size:
        optimizer.step()
        optimizer.zero_grad()
    train_loss /= num_batches
    return train_loss


def test(dataloader, model, device="cuda", verbose=False):
    """Evaluate the model on every batch and return the mean loss.

    The model is switched to evaluation mode and gradients are disabled.
    Each batch yields ``(X, y)``; ``X`` is used as input and labels, ``y`` as
    the attention mask, mirroring ``train``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches.
        model: Module called as ``model(X, attention_mask=y, labels=X)``.
        device: Device the batches are moved to.
        verbose: When true, print the average loss.

    Returns:
        The mean per-batch loss.

    Raises:
        ZeroDivisionError: If the dataloader yields no batches.
    """
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X, attention_mask=y, labels=X)
            test_loss += pred.loss.item()
    test_loss /= num_batches
    if verbose:
        print(f"Test Error: \n Avg loss: {test_loss:>8f} \n")
    return test_loss


def learn(
    training_data,
    test_data,
    model,
    optimizer,
    batch_size=64,
    device="cuda",
    epochs=5,
    step_size=1,
    file=None,
    verbose=False,
):
    """Train for several epochs, evaluating (and optionally saving) after each.

    Args:
        training_data: Dataset fed to ``train`` (batched in its stored order).
        test_data: Dataset fed to ``test`` after every epoch.
        model: Model to optimise.
        optimizer: Optimizer bound to the model's parameters.
        batch_size: Batch size of both data loaders.
        device: Device forwarded to ``train`` and ``test``.
        epochs: Number of passes over the training data.
        step_size: Gradient-accumulation factor forwarded to ``train``.
        file: Optional path prefix; when set, the whole model object is saved
            with ``torch.save`` to ``"{file}-{epoch}.pth"`` after each epoch.
        verbose: When true, print shapes, the model and progress messages.

    Returns:
        ``(train_losses, test_losses)``: two lists with one entry per epoch.
    """
    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size)
    test_dataloader = DataLoader(test_data, batch_size)

    # Peek at the first evaluation batch to report its shapes.
    first_batch = next(iter(test_dataloader), None)
    if first_batch is not None:
        X, y = first_batch
        if verbose:
            print("Shape of X: ", X.shape)
            print("Shape of y: ", y.shape, y.dtype)
            print(f"Using {device} device")
            print(model)

    train_losses, test_losses = [], []

    for t in range(epochs):
        if verbose:
            print(f"Epoch {t+1}\n-------------------------------")
        epoch_train_loss = train(
            train_dataloader, model, optimizer, device, step_size, verbose
        )
        epoch_test_loss = test(test_dataloader, model, device, verbose)
        train_losses.append(epoch_train_loss)
        test_losses.append(epoch_test_loss)
        if not file:
            continue
        # One checkpoint per epoch, suffixed with the 1-based epoch number.
        torch.save(model, f"{file}-{t+1}.pth")
        if verbose:
            print(f"Saved PyTorch Model State to {file}-{t+1}.pth")
    if verbose:
        print("Done!")

    return train_losses, test_losses


def generate(
    test_data,
    model,
    tokenizer,
    batch_size=64,
    device="cuda",
    pat="(.*)",
    flags=0,
    verbose=False,
):
    """Generate continuations for every sample and extract text with a regex.

    Args:
        test_data: Dataset yielding ``(input_ids, attention_mask)`` pairs.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to look up the pad/EOS ids and to decode.
        batch_size: Batch size of the data loader.
        device: Device the batches are moved to.
        pat: Regular expression applied to each decoded sequence with
            ``Series.str.extract``.
        flags: ``re`` flags forwarded to ``str.extract``.
        verbose: When true, print the first batch's shapes and the model.

    Returns:
        The extracted strings of all batches, concatenated in dataset order
        with a fresh 0..n-1 index (an empty Series when there are no batches).
    """
    # Create data loader.
    test_dataloader = DataLoader(test_data, batch_size)

    # Only fetch a batch for the shape report when it will actually be shown.
    if verbose:
        for X, y in test_dataloader:
            print("Shape of X: ", X.shape)
            print("Shape of y: ", y.shape, y.dtype)
            print(f"Using {device} device")
            print(model)
            break

    pad_token_id = tokenizer(pad_token)["input_ids"][0]
    eos_token_id = tokenizer(eos_token)["input_ids"][0]

    # Collect per-batch results and concatenate once at the end; growing a
    # Series with pd.concat inside the loop copies everything on each batch.
    chunks = []
    for X, y in test_dataloader:
        X, y = X.to(device), y.to(device)
        outputs = model.generate(
            X,
            attention_mask=y,
            max_length=50,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
        )
        # The EOS marker is appended to every decoded sequence so that a
        # pattern anchored on it can still match when the model never
        # emitted EOS within max_length.
        decoded = pd.Series(tokenizer.batch_decode(outputs)) + eos_token
        chunks.append(decoded.str.extract(pat, flags=flags, expand=False))

    if not chunks:
        return pd.Series(dtype=str)
    return pd.concat(chunks, ignore_index=True)


def get_document(df, col1, col2):
    """Build full documents: the input context of ``col1`` + ``col2`` + EOS.

    The result is ``bos_token + df[col1] + sep_token + df[col2] + eos_token``
    evaluated element-wise.
    """
    prompt = get_input_context(df, col1)
    completion = df[col2]
    return prompt + completion + eos_token


def get_input_context(df, col):
    """Wrap ``df[col]`` as ``bos_token + text + sep_token`` (element-wise)."""
    text = df[col]
    return bos_token + text + sep_token


def get_augmented_sentences(
    df,
    col1,
    col2,
    alpha_sr=0.1,
    alpha_ri=0.1,
    alpha_rs=0.1,
    p_rd=0.1,
    num_aug=9,
):
    """Augment the sentences in ``col2`` with ``eda`` and return a long table.

    For every row, ``eda`` is called on the ``col2`` text; the row's ``col1``
    value is carried over as the label of each sentence it returns.

    Args:
        df: Source frame.
        col1: Label column; its words are appended to ``stop_words`` in the
            list passed as eda's last argument (presumably words eda should
            leave untouched; confirm against eda's signature).
        col2: Sentence column to augment.
        alpha_sr, alpha_ri, alpha_rs, p_rd, num_aug: Forwarded to ``eda``
            positionally, in this order.

    Returns:
        A two-column frame ``[col1, col2]`` with one row per returned sentence.
    """

    def augment(val1, val2):
        protected = stop_words + val1.split()
        variants = eda(val2, alpha_sr, alpha_ri, alpha_rs, p_rd, num_aug, protected)
        return pd.Series(variants)

    # One row per source row, one column per augmented variant.
    per_row = df.apply(lambda row: augment(row[col1], row[col2]), axis=1)
    # Transpose so each source row becomes a column, relabel those columns
    # with the row's col1 value, then melt into (col1, col2) pairs.
    return (
        per_row.transpose()
        .rename(columns=df[col1].to_dict())
        .melt(var_name=col1, value_name=col2)
    )


def get_augmented_labels(df, col1, col2, num_aug=9):
    """Create ``num_aug`` copies of ``df`` with labels swapped inside sentences.

    For each row whose ``col2`` sentence contains the ``col1`` label as a
    space-delimited phrase, a label is drawn at random from ``df[col1]`` and
    substituted for the original one in both columns. Other rows are copied
    unchanged.

    Returns:
        A frame with columns ``[col1, col2]`` and a fresh 0..n-1 index holding
        the ``num_aug`` passes stacked on top of each other.
    """

    def augment(val1, val2):
        if not isin(val2, val1):
            return pd.Series([val1, val2], index=[col1, col2])
        # Pad with spaces so only whole-word occurrences are replaced.
        val3 = f" {df[col1].sample().iloc[0]} "
        swapped = f" {val2} ".replace(f" {val1} ", val3).strip()
        return pd.Series([val3.strip(), swapped], index=[col1, col2])

    passes = []
    for _ in range(num_aug):
        passes.append(df.apply(lambda row: augment(row[col1], row[col2]), axis=1))
    return pd.concat(passes, ignore_index=True)


def isin(sentence, value):
    sentence = f" {sentence} "
    value = f" {value} "
    return value in sentence


def startswith(sentence, value):
    """Return True if ``sentence`` begins with ``value`` as whole word(s).

    A trailing space is appended to both strings, so ``value`` must be
    followed by a space or by the end of the sentence.
    """
    return f"{sentence} ".startswith(f"{value} ")


def get_model(tokenizer, model, device="cuda"):
    """Load a pickled tokenizer and model and prepare them for this task.

    Args:
        tokenizer: Path of the tokenizer object saved with ``torch.save``.
        model: Path of the model object saved with ``torch.save``.
        device: Device the model is mapped to while loading and moved onto.

    Returns:
        ``(tokenizer, model)``: the tokenizer with the BOS/EOS/SEP/PAD tokens
        registered, and the model with its token embeddings resized to the
        tokenizer's vocabulary, on ``device``.
    """

    def add_special_tokens(tokenizer):
        tokenizer.add_special_tokens(
            {
                "bos_token": bos_token,
                "eos_token": eos_token,
                "sep_token": sep_token,
                "pad_token": pad_token,
            }
        )

    # SECURITY: torch.load unpickles arbitrary Python objects; only load
    # files from a trusted source.
    # NOTE(review): PyTorch 2.6+ defaults to weights_only=True, which rejects
    # fully pickled objects such as these; pass weights_only=False there if
    # loading fails.
    loaded_tokenizer = torch.load(tokenizer)
    add_special_tokens(loaded_tokenizer)

    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only
    # machine; without it torch.load tries to restore tensors to the device
    # they were saved from.
    loaded_model = torch.load(model, map_location=device)
    # The special tokens may have grown the vocabulary.
    loaded_model.resize_token_embeddings(len(loaded_tokenizer))
    loaded_model = loaded_model.to(device)

    return loaded_tokenizer, loaded_model


def get_dataset(tokenizer, ser):
    """Tokenize a Series of strings into a ``TransformersDataset``.

    The strings are encoded in one call with ``padding=True``; each encoded
    row of ``input_ids`` and ``attention_mask`` becomes one tensor.
    """
    encoded = tokenizer(ser.tolist(), padding=True)
    input_ids, attention_mask = (
        pd.Series([torch.tensor(row) for row in encoded[key]])
        for key in ("input_ids", "attention_mask")
    )
    return TransformersDataset(input_ids, attention_mask)


def delete_special_tokens(ser):
    """Remove every BOS, EOS, SEP and PAD marker from a Series of strings.

    The markers are matched literally (``regex=False``).
    """
    for token in (bos_token, eos_token, sep_token, pad_token):
        ser = ser.str.replace(token, "", regex=False)
    return ser


def get_accuracy(ser1, ser2, verbose=False):
    """Return the fraction of positions at which two Series hold equal values.

    Both indexes are reset first, so values are compared by position rather
    than by label.

    Args:
        ser1: Predicted values; its length is the denominator.
        ser2: Reference values.
        verbose: When true, print the accuracy as a percentage.

    Returns:
        The share of matching positions, between 0 and 1.
    """
    predicted = ser1.reset_index(drop=True)
    expected = ser2.reset_index(drop=True)
    correct = (predicted == expected).sum() / len(predicted)
    if verbose:
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}% \n")
    return correct

## Obtaining Data

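`Hackathon_Ideal_Data.csv` is part of the Kaggle dataset [Store Transaction data](https://www.kaggle.com/iamprateek/store-transaction-data). Download it and place it at `../data/Hackathon_Ideal_Data.csv`, which is where the next cell reads it from.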

In [None]:
# Load the raw table; the bare name on the last line displays it in the notebook.
df1 = pd.read_csv("../data/Hackathon_Ideal_Data.csv")
df1

## Scrubbing Data

In [None]:
# Keep only the two columns of interest under descriptive names.
df2 = df1[["MBRD", "BRD"]].rename(columns={"MBRD": "brand", "BRD": "product"})
# Normalise each text column with get_only_chars, trim it, and drop rows that
# end up empty (brand first, then product).
for col in ("brand", "product"):
    df2[col] = df2[col].apply(get_only_chars).str.strip()
    # .copy() detaches the filtered frame, so the following column
    # assignments write to an independent object and do not trigger pandas'
    # chained-assignment (SettingWithCopy) warning.
    df2 = df2[df2[col].str.len() > 0].copy()
# Full training text and the prompt-only prefix used at generation time.
df2["document"] = get_document(df2, "product", "brand")
df2["input context"] = get_input_context(df2, "product")
df2

In [None]:
# Training split: a reproducible random 70% of the cleaned rows.
training_set1 = df2.sample(frac=0.7, random_state=1)
training_set1

In [None]:
# Validation split: 70% of the rows not used for training (about 21% of df2).
validation_set1 = df2.drop(training_set1.index)
validation_set1 = validation_set1.sample(frac=0.7, random_state=1)
validation_set1

In [None]:
# Test split: whatever remains after removing the training and validation rows.
test_set1 = df2.drop(training_set1.index)
test_set1 = test_set1.drop(validation_set1.index)
test_set1

In [None]:
# Seed Python's `random` module (imported above as `seed`) before augmenting;
# presumably eda draws its randomness from it (confirm in the eda module).
seed(1)
augmented_sentences = get_augmented_sentences(
    training_set1,
    "brand",
    "product",
    alpha_sr=0.1,
    alpha_ri=0.1,
    alpha_rs=0.1,
    p_rd=0.1,
    num_aug=4,
)
# Training set 2 = original rows + augmented product sentences. The derived
# text columns are rebuilt because the augmented rows only carry brand/product.
training_set2 = pd.concat([training_set1, augmented_sentences])
training_set2["document"] = get_document(training_set2, "product", "brand")
training_set2["input context"] = get_input_context(training_set2, "product")
training_set2

In [None]:
# Seed NumPy's global RNG: get_augmented_labels calls Series.sample() without
# a random_state, which draws from that global state.
np_seed(1)
augmented_labels = get_augmented_labels(training_set1, "brand", "product", num_aug=4)
# Training set 3 = original rows + rows with swapped brand labels; the derived
# text columns are rebuilt for the combined frame.
training_set3 = pd.concat([training_set1, augmented_labels])
training_set3["document"] = get_document(training_set3, "product", "brand")
training_set3["input context"] = get_input_context(training_set3, "product")
training_set3

In [None]:
# Training set 4 = training set 2 (original + augmented sentences) plus the
# label-swapped rows, i.e. both augmentations combined.
training_set4 = pd.concat([training_set2, augmented_labels])
training_set4["document"] = get_document(training_set4, "product", "brand")
training_set4["input context"] = get_input_context(training_set4, "product")
training_set4

## Exploring Data

In [None]:
# Distribution of the number of space-separated words per brand name.
training_set1["brand"].str.split(" ").str.len().value_counts().sort_index()

In [None]:
# How often the brand occurs in the product name as a whole-word phrase.
training_set1.apply(
    lambda x: isin(x["product"], x["brand"]), axis=1
).value_counts().sort_index()

In [None]:
# How often the product name begins with the brand (whole-word match).
training_set1.apply(
    lambda x: startswith(x["product"], x["brand"]), axis=1
).value_counts().sort_index()

In [None]:
# How often the brand occurs in the product name but not at its start.
training_set1.apply(
    lambda x: isin(x["product"], x["brand"])
    and not startswith(x["product"], x["brand"]),
    axis=1,
).value_counts().sort_index()

## Modelling Data

In [None]:
# Load the base tokenizer/model and tokenise every training variant plus the
# validation documents.
tokenizer, model = get_model("../models/gpt2-tokenizer.pth", "../models/gpt2-model.pth", device)
training_data1 = get_dataset(tokenizer, training_set1["document"])
training_data2 = get_dataset(tokenizer, training_set2["document"])
training_data3 = get_dataset(tokenizer, training_set3["document"])
training_data4 = get_dataset(tokenizer, training_set4["document"])
validation_data1 = get_dataset(tokenizer, validation_set1["document"])
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# Fine-tune on the sentence-augmented set. With batch_size=32 and step_size=2
# the optimizer steps once per 64 samples.
train_losses, test_losses = learn(
    training_data2,
    validation_data1,
    model,
    optimizer,
    batch_size=32,
    device=device,
    epochs=5,
    step_size=2,
    # Write the per-epoch checkpoints next to the base model: the evaluation
    # cells load "../models/gpt2-5.pth", not a file in the working directory.
    file="../models/gpt2",
    verbose=verbose,
)

In [None]:
# Reload the tokenizer and the epoch-5 checkpoint, then prompt the model with
# the product-only input contexts of the validation split.
tokenizer, model = get_model("../models/gpt2-tokenizer.pth", "../models/gpt2-5.pth", device)
validation_data1 = get_dataset(tokenizer, validation_set1["input context"])
# Raw strings keep the regex backslashes literal; "\[" and "\s" in a normal
# string are invalid escape sequences that newer Python versions warn about.
sep_token_re = sep_token.replace("[", r"\[").replace("]", r"\]")
eos_token_re = eos_token.replace("[", r"\[").replace("]", r"\]")
# Capture the text between the first SEP marker and the first EOS after it.
pat = rf"^.*?{sep_token_re}\s*(.*?)\s*{eos_token_re}.*$"

outputs = generate(
    validation_data1,
    model,
    tokenizer,
    batch_size=32,
    device=device,
    pat=pat,
    flags=0,
    verbose=False,
)
# Apply the same normalisation to the predictions that the brands went through.
outputs = delete_special_tokens(outputs)
# Series.replace swaps values that are exactly "" for a single space before
# get_only_chars runs (presumably to avoid passing it an empty string; confirm).
outputs = outputs.replace("", " ")
outputs = outputs.apply(get_only_chars)
outputs = outputs.str.strip()

accuracy = get_accuracy(outputs, validation_set1["brand"], verbose=True)

## Interpreting Data

In [None]:
# Final evaluation: same procedure as on the validation split, but on the
# held-out test split.
tokenizer, model = get_model("../models/gpt2-tokenizer.pth", "../models/gpt2-5.pth", device)
# Named for what it holds: the tokenised input contexts of the test split.
test_data1 = get_dataset(tokenizer, test_set1["input context"])
# Raw strings keep the regex backslashes literal; "\[" and "\s" in a normal
# string are invalid escape sequences that newer Python versions warn about.
sep_token_re = sep_token.replace("[", r"\[").replace("]", r"\]")
eos_token_re = eos_token.replace("[", r"\[").replace("]", r"\]")
# Capture the text between the first SEP marker and the first EOS after it.
pat = rf"^.*?{sep_token_re}\s*(.*?)\s*{eos_token_re}.*$"

outputs = generate(
    test_data1,
    model,
    tokenizer,
    batch_size=32,
    device=device,
    pat=pat,
    flags=0,
    verbose=False,
)
# Apply the same normalisation to the predictions that the brands went through.
outputs = delete_special_tokens(outputs)
# Series.replace swaps values that are exactly "" for a single space before
# get_only_chars runs (presumably to avoid passing it an empty string; confirm).
outputs = outputs.replace("", " ")
outputs = outputs.apply(get_only_chars)
outputs = outputs.str.strip()

accuracy = get_accuracy(outputs, test_set1["brand"], verbose=True)

In [None]:
# Baseline for comparison: predict the first space-delimited word of the
# product name as the brand and score it on the same test split.
outputs = test_set1["product"].str.extract("^([^ ]*) ?.*$", expand=False)
accuracy = get_accuracy(outputs, test_set1["brand"], verbose=True)