# Importing Libraries

In [1]:
import os
import sys
import pickle
import argparse
import time
import datetime
import random
from pathlib import Path

from collections import OrderedDict

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, random_split, TensorDataset

from pytorch_lightning.loggers import WandbLogger

import lightning as L
import lightning.pytorch as pl
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.tuner.tuning import Tuner
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.strategies import DeepSpeedStrategy
from lightning.pytorch.plugins.precision import DeepSpeedPrecisionPlugin

# Custom library
sys.path.append('../process/')
from loadData import HTClassifierDataModule

sys.path.append('../architectures/')
from HTClassifier import HTClassifierModel
from HTTransferClassifier import HTTransferClassifierModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

import numpy as np
from itertools import product

from collections import Counter
from tqdm import tqdm
import numpy as np

import torch
import faiss

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Loading data

In [4]:
class Arguments():
    def __init__(self):
        self.model_name_or_path = 'johngiorgi/declutr-small'
        self.tokenizer_name_or_path = 'johngiorgi/declutr-small'
        self.data_dir = "../data/processed/TEXT/"
        self.demography = "merged"
        self.temp = 0.07 # Temperature for softmax
        self.max_seq_length = 512
        self.learning_rate = 3e-5 
        self.adam_epsilon = 1e-6
        self.warmup_steps = 0
        self.dropout = 0.3
        self.weight_decay = 0.01
        self.num_train_epochs = 1
        self.gradient_accumulation_steps = 4
        self.pad_to_max_length = True
        self.batch_size = 32
        self.output_dir = '../models/text-classifier-baselines/'
        self.overwrite = True
        self.local_rank = -1
        self.no_cuda = False

args = Arguments()

seed_everything(1111)

Global seed set to 1111


1111

In [5]:
dm = HTClassifierDataModule(args)
dm.setup()



  data_df = pd.read_csv(os.path.join(self.args.data_dir, self.args.demography + '.csv'), error_bad_lines=False, warn_bad_lines=False)


  data_df = pd.read_csv(os.path.join(self.args.data_dir, self.args.demography + '.csv'), error_bad_lines=False, warn_bad_lines=False)


In [6]:
args.num_classes = pd.read_csv(os.path.join(args.data_dir, args.demography + '.csv'), error_bad_lines=False, warn_bad_lines=False).VENDOR.nunique()

args.num_training_steps = len(dm.train_dataloader()) * 2
# Setting the warmup steps to 1/10th the size of training data
args.warmup_steps = int(len(dm.train_dataloader()) * 10/100)



  args.num_classes = pd.read_csv(os.path.join(args.data_dir, args.demography + '.csv'), error_bad_lines=False, warn_bad_lines=False).VENDOR.nunique()


  args.num_classes = pd.read_csv(os.path.join(args.data_dir, args.demography + '.csv'), error_bad_lines=False, warn_bad_lines=False).VENDOR.nunique()


# Model Architecture

In [7]:
"""
Python version: 3.9
Description: Contains model architecture of transformer-based NNs for performing multi-class classification on the Backpage advertisements.
"""

# %% Importing libraries
from sklearn.metrics import f1_score, balanced_accuracy_score

import torch
import lightning.pytorch as pl

from deepspeed.ops.adam import DeepSpeedCPUAdam

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

import t5_encoder

class HTClassifierModel(pl.LightningModule):
    def __init__(self, *args, **kwargs):
        super().__init__()
        
        self.save_hyperparameters()
        if isinstance(args, tuple) and len(args) > 0: 
            self.args = args[0]
            self.hparams.learning_rate = self.args.learning_rate
            self.hparams.eps = self.args.adam_epsilon
            self.hparams.weight_decay = self.args.weight_decay
            self.hparams.model_name_or_path = self.args.model_name_or_path
            self.hparams.num_classes = self.args.num_classes
            self.hparams.num_training_steps = self.args.num_training_steps
            self.hparams.warmup_steps = self.args.warmup_steps
        
        # freeze
        self._frozen = False

        # Handling the padding token in distilgpt2 by substituting it with eos_token_id
        if self.hparams.model_name_or_path == "distilgpt2":
            config = AutoConfig.from_pretrained(self.hparams.model_name_or_path, num_labels=self.hparams.num_classes, output_attentions=True, output_hidden_states=True)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.hparams.model_name_or_path, config=config)
            self.model.config.pad_token_id = self.model.config.eos_token_id
        else:
            config = AutoConfig.from_pretrained(self.hparams.model_name_or_path, num_labels=self.hparams.num_classes, output_attentions=True, output_hidden_states=True)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.hparams.model_name_or_path, config=config)

    def forward(self, batch):
        # The batch contains the input_ids, the input_put_mask and the labels (for training)
        input_ids = batch[0]
        input_mask = batch[1]
        labels = batch[2]

        outputs = self.model(input_ids, attention_mask=input_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]

        return outputs, loss, logits, labels

    def training_step(self, batch, batch_nb):
        # the training step is a (virtual) method,specified in the interface, that the pl.LightningModule
        # class stipulates you to overwrite. This we do here, by virtue of this definition
        outputs = self(batch)  # self refers to the model, which in turn acceses the forward method
        train_loss = outputs[0]
        self.log_dict({"train_loss": train_loss, "learning_rate":self.hparams.learning_rate}, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return train_loss
        # the training_step method expects a dictionary, which should at least contain the loss

    def validation_step(self, batch, batch_nb):
        # the training step is a (virtual) method,specified in the interface, that the pl.LightningModule
        # class  wants you to overwrite, in case you want to do validation. This we do here, by virtue of this definition.

        outputs = self(batch)
        # self refers to the model, which in turn accesses the forward method

        # Apart from the validation loss, we also want to track validation accuracy  to get an idea, what the
        # model training has achieved "in real terms".
        val_loss = outputs[0]
        logits = outputs[1]
        labels = batch[2]

        # Evaluating the performance
        predictions = torch.argmax(logits, dim=1)
        balanced_accuracy = balanced_accuracy_score(labels.detach().cpu().numpy(), predictions.detach().cpu().numpy(), adjusted=True)
        macro_accuracy = f1_score(labels.detach().cpu().numpy(), predictions.detach().cpu().numpy(), average='macro')
        micro_accuracy = f1_score(labels.detach().cpu().numpy(), predictions.detach().cpu().numpy(), average='micro')
        weighted_accuracy = f1_score(labels.detach().cpu().numpy(), predictions.detach().cpu().numpy(), average='weighted')
        
        self.log_dict({"val_loss": val_loss, 'accuracy': balanced_accuracy, 'macro-F1': macro_accuracy, 'micro-F1': micro_accuracy, 'weighted-F1':weighted_accuracy}, 
                      on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return val_loss
    
    def test_step(self, batch, batch_nb):
        # the training step is a (virtual) method,specified in the interface, that the pl.LightningModule
        # class  wants you to overwrite, in case you want to do test. This we do here, by virtue of this definition.

        outputs = self(batch)
        # self refers to the model, which in turn accesses the forward method

        # Apart from the validation loss, we also want to track validation accuracy  to get an idea, what the
        # model training has achieved "in real terms".
        test_loss = outputs[0]
        logits = outputs[1]
        labels = batch[2]

        # Evaluating the performance
        predictions = torch.argmax(logits, dim=1)
        balanced_accuracy = balanced_accuracy_score(labels.detach().cpu().numpy(), predictions.detach().cpu().numpy(), adjusted=True)
        macro_accuracy = f1_score(labels.detach().cpu().numpy(), predictions.detach().cpu().numpy(), average='macro')
        micro_accuracy = f1_score(labels.detach().cpu().numpy(), predictions.detach().cpu().numpy(), average='micro')
        weighted_accuracy = f1_score(labels.detach().cpu().numpy(), predictions.detach().cpu().numpy(), average='weighted')
        
        self.log_dict({"test_loss": test_loss, 'accuracy': balanced_accuracy, 'macro-F1': macro_accuracy, 'micro-F1': micro_accuracy, 'weighted-F1':weighted_accuracy}, 
                      on_step=True, on_epoch=True, prog_bar=True, logger=True)
    
    def predict_step(self, batch, batch_nb):
        # the training step is a (virtual) method,specified in the interface, that the pl.LightningModule
        # class  wants you to overwrite, in case you want to do validation. This we do here, by virtue of this definition.

        outputs = self(batch)
        # self refers to the model, which in turn accesses the forward method

        # Apart from the validation loss, we also want to track validation accuracy  to get an idea, what the
        # model training has achieved "in real terms".
        val_loss = outputs[0]
        logits = outputs[1]
        labels = batch[2]

        predictions = torch.argmax(logits, dim=1)
        return predictions.detach().cpu().numpy()

    def configure_optimizers(self):
        # The configure_optimizers is a (virtual) method, specified in the interface, that the
        # pl.LightningModule class wants you to overwrite.

        # In this case we define that some parameters are optimized in a different way than others. In
        # particular we single out parameters that have 'bias', 'LayerNorm.weight' in their names. For those
        # we do not use an optimization technique called weight decay.

        no_decay = ['bias', 'LayerNorm.weight']

        optimizer_grouped_parameters = [{'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay':self.hparams.weight_decay}, 
                                        {'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
        # optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.eps)
        optimizer = DeepSpeedCPUAdam(optimizer_grouped_parameters, adamw_mode=True, lr=self.hparams.learning_rate, betas=(0.9, 0.999), eps=self.hparams.eps)

        # We also use a scheduler that is supplied by transformers.
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.hparams.num_training_steps)
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}

        return [optimizer], [scheduler]

    def freeze(self) -> None:
        # freeze all layers, except the final classifier layers
        for name, param in self.model.named_parameters():
            if 'classifier' not in name:  # classifier layer
                param.requires_grad = False

        self._frozen = True

    def unfreeze(self) -> None:
        if self._frozen:
            for name, param in self.model.named_parameters():
                if 'classifier' not in name:  # classifier layer
                    param.requires_grad = True

        self._frozen = False

    def train_epoch_start(self):
        """pytorch lightning hook"""
        if self.current_epoch < self.hparams.nr_frozen_epochs:
            self.freeze()

        if self.current_epoch >= self.hparams.nr_frozen_epochs:
            self.unfreeze() 

In [8]:
model = HTClassifierModel(args).load_from_checkpoint("/workspace/persistent/human-trafficking/models/text-classifier-baselines/seed:1111/merged/Style-Embedding/final_model_new.pt").eval()

Some weights of the model checkpoint at johngiorgi/declutr-small were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at johngiorgi/declutr-small and are newly ini

In [9]:
model.model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

# Loading up the data

In [60]:
df = pd.read_csv("../data/processed/TEXT/merged.csv")

text = df.TEXT.values.tolist()
vendors = df.VENDOR.values.tolist()

# Since the vendor IDs are not the current representations of the class labels, we remap these label IDs to avoid falling into out-of-bounds problem
vendors_dict = {}
i = 0
for vendor in vendors:
    if vendor not in vendors_dict.keys():
        vendors_dict[vendor] = i
        i += 1
        
train_df, test_df = train_test_split(df, test_size=0.20, random_state=1111)

In [11]:
test_df = test_df[["TEXT", "VENDOR"]]

# Getting the predictions

In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path)

In [18]:
def retrieve_predictions(data_test, demo="north"):
    # data_test = test_df[test_df.DEMO==demo]
    data_test.replace({"VENDOR": vendors_dict},inplace=True)

    text = data_test.TEXT.values.tolist()
    vendors = data_test.VENDOR.values.tolist()

    # Tokenizing the data with padding and truncation
    encodings = tokenizer(text, add_special_tokens=True, max_length=512, padding='max_length', return_token_type_ids=False, truncation=True, 
                               return_attention_mask=True, return_tensors='pt') 

    # Convert the lists into tensors.
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']

    labels = torch.tensor(vendors)

    # Combine the inputs into a TensorDataset.
    dataset = TensorDataset(input_ids, attention_mask, labels)
    test_dataloader = DataLoader(dataset, batch_size=32)
    
    label_list, prediction_list  = ([] for i in range(2))

    pbar = tqdm(total=len(test_dataloader))
    for batch in test_dataloader:
        outputs, _, _, labels = model(batch)
        
        test_loss = outputs[0]
        logits = outputs[1]
        # labels = batch[2]
        predictions = torch.argmax(logits, dim=1)
        label_list.append(labels.tolist())
        prediction_list.append(predictions.tolist())
        pbar.update(1)
    pbar.close()

    label_list = [item for sublist in label_list for item in sublist]
    prediction_list = [item for sublist in prediction_list for item in sublist]
    
    return label_list, prediction_list

In [19]:
label_list, prediction_list = retrieve_predictions(test_df)

100%|██████████| 548/548 [38:32<00:00,  4.22s/it]


In [20]:
with open('../pickled/label_list.pkl', 'wb') as f:
    pickle.dump(label_list, f)
    
with open('../pickled/prediction_list.pkl', 'wb') as f:
    pickle.dump(prediction_list, f)

# Finding True Positives and False Positives

In [33]:
with open('../pickled/label_list.pkl', 'rb') as handle:
    label_list = pickle.load(handle)

with open('../pickled/prediction_list.pkl', 'rb') as handle:
    prediction_list = pickle.load(handle)

In [34]:
if_match = [True if label_list[index] == prediction_list[index] else False for index, value in enumerate(label_list)]

In [35]:
test_df["MATCH"] = if_match

In [74]:
train_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

In [36]:
tp = test_df[test_df.MATCH == True]
fp = test_df[test_df.MATCH != True]

In [170]:
print(list(fp.VENDOR.unique())[:50])

[2602, 285, 4298, 1352, 1258, 3299, 4712, 4808, 412, 309, 2164, 4734, 1543, 544, 5116, 4717, 351, 381, 3275, 3773, 459, 781, 1056, 1424, 2676, 424, 3108, 0, 1025, 3256, 1082, 425, 409, 3467, 3249, 827, 1579, 3737, 2533, 1199, 1690, 432, 3195, 1435, 3310, 3087, 3301, 4616, 431, 274]


# Loading trained embeddings

In [73]:
model = "declutr"
train_embedding_filename = "trained_traindata_" + model + "_mean.pt"    
train_label_filename = "trained_trainlabels_" + model + "_mean.pt"
    
test_embedding_filename = "trained_testdata_" + model + "_mean.pt"
test_label_filename = "trained_testlabels_" + model + "_mean.pt"

train_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_embedding_filename), map_location=torch.device('cpu'))
train_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_label_filename), map_location=torch.device('cpu')).detach().numpy()

test_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_embedding_filename), map_location=torch.device('cpu'))
test_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_label_filename), map_location=torch.device('cpu')).detach().numpy()

In [78]:
res = faiss.StandardGpuResources()  # use a single GPU

dim = train_embeddings.shape[1]
nb = train_embeddings.shape[0]
nq = train_embeddings.shape[0]

index = faiss.IndexFlatIP(dim)
# make it a flat GPU index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
gpu_index_flat.add(train_embeddings) # add vectors to the index

In [93]:
k = 25

# Generating Explainaitions

In [286]:
samples = tp[tp.VENDOR == 309].drop_duplicates()
samples

Unnamed: 0,TEXT,VENDOR,MATCH
33104,Available now NNNNNNNNNN Posters age: <AGE> • ...,309,True
33185,Your tasty caramel treat ???? Im the one and o...,309,True
33137,NNN NNN NNNN Michelle Posters age: <AGE> • Loc...,309,True
33158,NNN NNN NNNN Michelle Posters age: <AGE> • Loc...,309,True
4680,"?NNN-NNN-NNNN?""Good morning ""gentlemen come an...",309,True
...,...,...,...
33106,?Thick?Tight?Hazel Eyes?Long Brown Hair?NN DDs...,309,True
33124,Im not really one to talk you see baby I rathe...,309,True
32951,"????Bright, ambitious and highly sensual young...",309,True
32993,???? I AIM ??N PLEASE ??NEVER N TEASE???? ?? ?...,309,True


In [241]:
anchor = samples.TEXT.iloc[0]
anchor_index = test_df[test_df.TEXT == anchor].index[0]

D, I = gpu_index_flat.search(test_embeddings[anchor_index:anchor_index+1], k)

In [242]:
I

array([[12146, 57395, 25974, 24405, 12187, 48568,  5131, 26171, 46190,
        27582, 52929,  8896, 65446, 28292, 19409, 52334, 61431,  7818,
        65330,  3609, 56363,  1509,  1402, 32969, 38755]])

In [None]:
exp1 = anchor
exp2 = train_df.iloc[36442].TEXT

# Interpret transformers

In [13]:
from transformers_interpret import SequenceClassificationExplainer

In [14]:
cls_explainer = SequenceClassificationExplainer(
    model.model,
    tokenizer)

In [247]:
word_attributions = cls_explainer(exp1, class_name='LABEL_742')

In [248]:
cls_explainer.predicted_class_name

'LABEL_4310'

In [249]:
cls_explainer.visualize("../plots/vendor827_exp1.pdf")

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
742.0,LABEL_4310 (0.06),LABEL_742,0.34,#s [ SE P ] < L INK > [ SE P ] ? ? ? ? ? ? ? ? ? ? ? NEW Japanese - N uru ? ? ? ? ? ? ? ? ? ? ? N NN - NN N - NN NN ? ? ? ? ? ? ? ? ? ? ? White Plains ? ? ? ? ? ? ? ? ? ? ? In - Call - West chester esc orts - < L INK > #/s
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
742.0,LABEL_4310 (0.06),LABEL_742,0.34,#s [ SE P ] < L INK > [ SE P ] ? ? ? ? ? ? ? ? ? ? ? NEW Japanese - N uru ? ? ? ? ? ? ? ? ? ? ? N NN - NN N - NN NN ? ? ? ? ? ? ? ? ? ? ? White Plains ? ? ? ? ? ? ? ? ? ? ? In - Call - West chester esc orts - < L INK > #/s
,,,,


In [250]:
word_attributions = cls_explainer(exp2)

In [251]:
cls_explainer.visualize("../plots/vendor827_exp2.pdf")

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
4310.0,LABEL_4310 (0.65),LABEL_4310,1.89,#s [ SE P ] < L INK > [ SE P ] ? ? ? ? ? ? ? Japanese ? ? ? ? ? ? ? ? ? ? ? Hot ? ? ? ? ? ? ? ? ? ? ? Sexy ? ? ? ? ? ? ? ? ? ? Bust y ? ? ? ? ? ? ? ? ? ? Sweet ? ? ? ? ? ? ? N NN N NN N NN N ? ? ? - West chester esc orts - < L INK > #/s
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
4310.0,LABEL_4310 (0.65),LABEL_4310,1.89,#s [ SE P ] < L INK > [ SE P ] ? ? ? ? ? ? ? Japanese ? ? ? ? ? ? ? ? ? ? ? Hot ? ? ? ? ? ? ? ? ? ? ? Sexy ? ? ? ? ? ? ? ? ? ? Bust y ? ? ? ? ? ? ? ? ? ? Sweet ? ? ? ? ? ? ? N NN N NN N NN N ? ? ? - West chester esc orts - < L INK > #/s
,,,,


# Generating the POS plot

In [15]:
import re

In [16]:
def extract_word_attribution(df, vendor_id):
    attribution_list = []
    df_ = df[df.VENDOR == vendor_id]
    
    for text in df_.TEXT.to_list():
        encoded_sent = tokenizer.encode(text)
        decoded_sent = tokenizer.decode(encoded_sent)

        decoded_sent = decoded_sent.replace("<s>", "").replace("</s>", "").replace("[SEP]","")
        word_attributions = cls_explainer(decoded_sent)[1:-1]
        decoded_sent = re.findall(r'\w+|[^\w \t\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]', decoded_sent)

        returned_index = 0
        for index, token  in enumerate(decoded_sent):
            word = ""
            score = 0.0
            if token  == word_attributions[index][0]:
                word = token 
                returned_index += 1
            else:
                while len(token ) > len(word):
                    word += word_attributions[returned_index][0]
                    score += word_attributions[returned_index][1]
                    returned_index += 1
            attribution_list.append((word, score))
    
    return attribution_list

In [17]:
def convert_attribution_list_to_dict(attribution_list):
    attribution_dict = {}
    
    for index, (token, score) in enumerate(attribution_list):
        if token in attribution_dict.keys():
            attribution_dict[token] += score
        else:
            attribution_dict[token] = score
            
    attribution_dict = {k:v/np.sum(list(attribution_dict.values())) for k,v in attribution_dict.items() if v > 0}
            
    return attribution_dict

In [19]:
attribution_list_1 = extract_word_attribution(df, 11178)
attribution_list_2 = extract_word_attribution(df, 11189)

In [20]:
attribution_dict1 = convert_attribution_list_to_dict(attribution_list_1)
attribution_dict2 = convert_attribution_list_to_dict(attribution_list_2)

# Plotting data

In [21]:
import spacy
from collections import defaultdict

import heapq

import plotly
import plotly.express as px
import plotly.graph_objects as go

import plotly.io as pio
pio.renderers.default = 'iframe'

In [22]:
nlp = spacy.load("en_core_web_trf")

In [23]:
def assign_pos_to_tokens(attr_dict, k):
    value_index = 1

    pos_dict = defaultdict(list)
    for text in attr_dict.keys():
        doc = nlp(text)
        for token in doc:
            pos_dict[token.pos_].append((text, attr_dict[text]))
            
    # Sort the defaultdict and retrieve the top K entries based on the specified index
    top_k_entries = {
        key: sorted(values, key=lambda x: x[value_index], reverse=True)[:k]
        for key, values in pos_dict.items()
    }
    
    pos_dist = {}
    for key, value in pos_dict.items():
        pos_dist[key] = len(pos_dict[key])
        
    pos_dist = {k:v/np.sum(list(pos_dist.values())) for k,v in pos_dist.items()}
    
    return dict(top_k_entries), pos_dist

In [24]:
pos_dict1, pos_dist1 = assign_pos_to_tokens(attribution_dict1, k=2)
pos_dict2, pos_dist2 = assign_pos_to_tokens(attribution_dict2, k=2)

In [54]:
fig = go.Figure()
# Plotting the bar plot

fig.add_trace(go.Bar(x=list(pos_dist1.keys()), y=list(pos_dist1.values()), 
                     name="VENDOR 11178", marker_color="red", opacity=0.3))

pos_list, token_list, attr_list = ([] for i in range(3))
for pos, token_attr_pair in pos_dict1.items():
    for token, attr in token_attr_pair:
        pos_list.append(pos)
        token_list.append(token)
        attr_list.append(attr)
        
fig.add_trace(go.Scatter(x=pos_list, y=attr_list, text=token_list, 
                                 mode='text', marker_color="red", name="VENDOR 11178", 
                                 textfont={'color':"red", "size":18}))
fig.update_layout(showlegend=False)
# plotly.offline.plot(fig, filename = os.path.join("../plots/data_pos_dist.pdf"), auto_open=False)
fig.write_image("../plots/vendor_11178_pos_dist.pdf", engine="kaleido")

fig.show()

In [55]:
fig = go.Figure()
# Plotting the bar plot

fig.add_trace(go.Bar(x=list(pos_dist2.keys()), y=list(pos_dist2.values()), 
                     name="VENDOR 11189", marker_color="green", opacity=0.3))

pos_list, token_list, attr_list = ([] for i in range(3))
for pos, token_attr_pair in pos_dict2.items():
    for token, attr in token_attr_pair:
        pos_list.append(pos)
        token_list.append(token)
        attr_list.append(attr)
        
fig.add_trace(go.Scatter(x=pos_list, y=attr_list, text=token_list, 
                                 mode='text', marker_color="green", name="VENDOR 11189", 
                                 textfont={'color':"green", "size":18}))
fig.update_layout(showlegend=False)
# plotly.offline.plot(fig, filename = os.path.join("../plots/data_pos_dist.pdf"), auto_open=False)
fig.write_image("../plots/vendor_11189_pos_dist.pdf", engine="kaleido")

fig.show()

In [None]:
import plotly.graph_objects as go

# Example dictionary
my_dict = {
    'Noun': [('apple', 0.5), ('banana', 0.8), ('orange', 0.6)],
    'Verb': [('run', 0.9), ('jump', 0.7), ('swim', 0.5)],
    'Adjective': [('big', 0.3), ('small', 0.4), ('fast', 0.6)]
}

# Create the bar plot for POS density
bar_data = [
    go.Bar(
        x=list(my_dict.keys()),
        y=[len(values) for values in my_dict.values()],
        name='POS Density'
    )
]

# Create the scatter plot for word attribution
scatter_data = [
    go.Scatter(
        x=[pos_tag for pos_tag in my_dict.keys() for _ in range(len(my_dict[pos_tag]))],
        y=[attribution for values in my_dict.values() for _, attribution in values],
        mode='markers+text',
        marker=dict(size=10, color='red'),
        text=[word for values in my_dict.values() for word, _ in values],
        hovertemplate='%{text}<br>Attribution: %{y}',
        name='Word Attribution',
        textposition='top center',
        textfont=dict(size=14)
    )
]

# Create the layout for the combined plot
layout = go.Layout(
    title='POS Density and Word Attribution',
    xaxis=dict(title='POS Tags'),
    yaxis=dict(title='Attribution Score'),
    barmode='stack'
)

# Combine the bar plot and scatter plot
fig = go.Figure(data=bar_data + scatter_data, layout=layout)

# Add arrows to prevent scatter point overlap
annotations = []
for pos_tag, values in my_dict.items():
    for word, attribution in values:
        annotation = dict(
            x=pos_tag,
            y=attribution,
            xref='x',
            yref='y',
            text=word,
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            arrowcolor='black',
            ax=0,
            ay=-30
        )
        annotations.append(annotation)

fig.update_layout(annotations=annotations)

# Display the plot
fig.show()