In [1]:
import pandas as pd
from collections import Counter
from tqdm import tqdm
from sklearn.metrics import classification_report

import re
# import emoji
import unicodedata

In [4]:
# `os` is imported here because this cell sits above the main import cell;
# without it a fresh "Restart & Run All" stops with a NameError on `os`.
import os

# Must be set before any tokenizer is used; "false" turns off the tokenizers
# library's own thread pool (presumably to avoid its fork warning when
# DataLoader workers are spawned -- confirm against the tokenizers docs).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Importing libraries 

In [3]:
# %% Importing Libraries
import os
import sys
import pickle
import argparse
import time
import datetime
import random
from pathlib import Path

from collections import OrderedDict

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, balanced_accuracy_score

import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, random_split, TensorDataset


from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

from pytorch_lightning.loggers import WandbLogger

import lightning as L
import lightning.pytorch as pl
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.tuner.tuning import Tuner
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
# from lightning.pytorch.strategies import DeepSpeedStrategy
# from lightning.pytorch.plugins.precision import DeepSpeedPrecisionPlugin

# from deepspeed.ops.adam import DeepSpeedCPUAdam

import t5_encoder

# Custom library
sys.path.append('../process/')
# from loadData import HTClassifierDataModule
from loadData import HTContraDataModule

sys.path.append('../architectures/')
from HTClassifier import HTClassifierModel
from ContraLayer import HTContraClassifierModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class HTClassifierDataModule(pl.LightningDataModule):
    """Lightning data module that tokenizes a vendor-labelled text CSV.

    The first positional argument is a configuration object exposing
    ``tokenizer_name_or_path``, ``data_dir``, ``demography`` and ``batch_size``.
    ``max_seq_length`` is honoured when present and defaults to 512 otherwise.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()

        # The configuration object is expected as the first positional argument.
        if len(args) > 0:
            self.args = args[0]

        # Load the tokenizer once; only the pad-token handling is model specific.
        self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path, use_fast=True)
        if self.args.tokenizer_name_or_path == "distilgpt2":
            # distilgpt2 ships without a padding token, so reuse the EOS token.
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def setup(self, stage=None):
        """Read the CSV, tokenize it and build the train/val/test subsets.

        Populates ``self.train_dataset``, ``self.val_dataset``,
        ``self.test_dataset`` and ``self.vendors_dict`` (original vendor ID ->
        contiguous class index).
        """
        # Load the data from `<data_dir>/<demography>.csv`.
        csv_path = os.path.join(self.args.data_dir, self.args.demography + '.csv')
        data_df = pd.read_csv(csv_path)
        # Replacing all the digits in the text with the letter "N"
        data_df['TEXT'] = data_df['TEXT'].apply(lambda x: re.sub(r'\d', 'N', str(x)))
        text = data_df.TEXT.values.tolist()
        vendors = data_df.VENDOR.values.tolist()

        # Use the configured sequence length instead of a second hard-coded 512.
        max_length = getattr(self.args, "max_seq_length", 512)

        # Tokenizing the data with padding and truncation
        encodings = self.tokenizer(text, add_special_tokens=True, max_length=max_length, padding='max_length',
                                   return_token_type_ids=False, truncation=True,
                                   return_attention_mask=True, return_tensors='pt')
        input_ids = encodings['input_ids']
        attention_mask = encodings['attention_mask']

        # Vendor IDs are arbitrary, so remap them to 0..K-1 in order of first
        # appearance; otherwise the labels could index past the classifier head.
        vendors_dict = {}
        for vendor in vendors:
            vendors_dict.setdefault(vendor, len(vendors_dict))
        self.vendors_dict = vendors_dict
        labels = torch.tensor([vendors_dict[vendor] for vendor in vendors])

        # Combine the inputs into a TensorDataset.
        dataset = TensorDataset(input_ids, attention_mask, labels)

        # 80/20 train/test split, then 95/5 train/val on the training part,
        # i.e. roughly 0.76 / 0.04 / 0.20 overall. Both splits are seeded.
        train_dataset, test_dataset = random_split(dataset, [0.8, 0.2], generator=torch.Generator().manual_seed(42))
        train_dataset, val_dataset = random_split(train_dataset, [0.95, 0.05], generator=torch.Generator().manual_seed(42))

        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.val_dataset = val_dataset

    def train_dataloader(self):
        """Return the training DataLoader (randomly sampled each epoch)."""
        return DataLoader(self.train_dataset, sampler=RandomSampler(self.train_dataset), batch_size=self.args.batch_size)

    def val_dataloader(self):
        """Return the validation DataLoader (sequential order)."""
        return DataLoader(self.val_dataset, batch_size=self.args.batch_size)

    def test_dataloader(self):
        """Return the test DataLoader (sequential order)."""
        return DataLoader(self.test_dataset, batch_size=self.args.batch_size)

In [6]:
# Creating directories if they don't exist
# ('../pickled/embeddings').mkdir(parents=True, exist_ok=True)

# Loading data

In [7]:
class Arguments():
    """Plain container for the notebook's hyper-parameters and paths.

    Every field has the default used throughout this notebook. Any of them can
    be overridden, or extra fields added, by keyword:
    ``Arguments(batch_size=8, demography="west")``.
    """

    def __init__(self, **overrides):
        # Model / tokenizer checkpoints and data location
        self.model_name_or_path = 'johngiorgi/declutr-small'
        self.tokenizer_name_or_path = 'johngiorgi/declutr-small'
        self.data_dir = "../data/processed/"
        self.demography = "south"

        # Optimisation settings
        self.temp = 0.07 # Temperature for softmax
        self.max_seq_length = 512
        self.learning_rate = 3e-5 
        self.adam_epsilon = 1e-6
        self.warmup_steps = 0
        self.dropout = 0.3
        self.weight_decay = 0.01
        # NOTE(review): both `num_train_epochs` and `nb_epochs` exist; which one
        # the training code reads is not visible in this notebook.
        self.num_train_epochs = 1
        self.gradient_accumulation_steps = 4
        self.pad_to_max_length = True
        self.batch_size = 32

        # Output / runtime settings
        self.output_dir = '../models/text-classifier-baselines/'
        self.overwrite = True
        self.local_rank = -1
        self.no_cuda = False

        # Loss configuration
        self.loss1_type = "CE"
        self.loss2_type = "SupCon-negatives"
        self.num_hard_negatives = 5
        self.nb_epochs = 40
        self.coefficient = 1.0
        self.pooling = True

        # Apply caller overrides last so they win over the defaults. Unknown
        # names are accepted, since later cells attach extra fields
        # (e.g. num_classes) to the same object.
        for name, value in overrides.items():
            setattr(self, name, value)

# Shared configuration object; later cells attach more fields to it.
args = Arguments()

# Seed via Lightning's helper so splits and initialisation are repeatable.
seed_everything(1111)

Seed set to 1111


1111

In [8]:
# Build the contrastive data module for the south-region CSV and prepare its
# "fit" stage so train_dataloader() can be queried below.
# NOTE(review): the path, seed (1111) and batch sizes (32) repeat values that
# also live in `args` / seed_everything; keep them in sync when changing either.
dm = HTContraDataModule(file_dir="../data/processed/south.csv", tokenizer_name_or_path=args.tokenizer_name_or_path, seed=1111, train_batch_size=32, eval_batch_size=32)
dm.setup(stage="fit")

# Alternative: the plain classifier data module.
# dm = HTClassifierDataModule(args)
# dm.setup()



In [10]:
# Number of classes = number of distinct vendors in `<data_dir>/<demography>.csv`.
args.num_classes = pd.read_csv(os.path.join(args.data_dir, args.demography + '.csv')).VENDOR.nunique()

# Scheduler horizon: number of training batches times 32.
# NOTE(review): 32 equals the batch size used above; if this factor is meant to
# be the epoch count, it should come from the config instead -- confirm.
args.num_training_steps = len(dm.train_dataloader()) * 32
# Warm up for 10% of the number of training batches (per epoch).
args.warmup_steps = int(len(dm.train_dataloader()) * 10/100)

# Loading the model

In [11]:
class HTClassifierModel(pl.LightningModule):
    """LightningModule wrapping a Hugging Face sequence-classification model.

    The first positional argument is a configuration object providing
    ``learning_rate``, ``adam_epsilon``, ``weight_decay``, ``model_name_or_path``,
    ``num_classes``, ``num_training_steps`` and ``warmup_steps``; these are
    copied into ``self.hparams``. Batches are ``(input_ids, attention_mask,
    labels)`` tuples.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()

        self.save_hyperparameters()
        # Copy the training settings from the positional config object.
        if len(args) > 0:
            self.args = args[0]
            self.hparams.learning_rate = self.args.learning_rate
            self.hparams.eps = self.args.adam_epsilon
            self.hparams.weight_decay = self.args.weight_decay
            self.hparams.model_name_or_path = self.args.model_name_or_path
            self.hparams.num_classes = self.args.num_classes
            self.hparams.num_training_steps = self.args.num_training_steps
            self.hparams.warmup_steps = self.args.warmup_steps

        # Tracks whether the encoder is currently frozen (see freeze/unfreeze).
        self._frozen = False

        # Hidden states and attentions are requested so downstream code can
        # pool embeddings from the model output.
        config = AutoConfig.from_pretrained(self.hparams.model_name_or_path, num_labels=self.hparams.num_classes, output_attentions=True, output_hidden_states=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.hparams.model_name_or_path, config=config)
        if self.hparams.model_name_or_path == "distilgpt2":
            # distilgpt2 has no padding token; substitute the EOS token id.
            self.model.config.pad_token_id = self.model.config.eos_token_id

    def forward(self, batch):
        """Run the wrapped model on one batch.

        Returns:
            A 3-tuple ``(outputs, loss, logits)``: the raw Hugging Face output
            object followed by its first two entries.
        """
        input_ids = batch[0]
        input_mask = batch[1]
        labels = batch[2]

        outputs = self.model(input_ids, attention_mask=input_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]

        return outputs, loss, logits

    @staticmethod
    def _classification_metrics(logits, labels):
        """Compute adjusted balanced accuracy and macro/micro/weighted F1 for one batch."""
        predictions = torch.argmax(logits, dim=1).detach().cpu().numpy()
        targets = labels.detach().cpu().numpy()
        return {
            'accuracy': balanced_accuracy_score(targets, predictions, adjusted=True),
            'macro-F1': f1_score(targets, predictions, average='macro'),
            'micro-F1': f1_score(targets, predictions, average='micro'),
            'weighted-F1': f1_score(targets, predictions, average='weighted'),
        }

    def training_step(self, batch, batch_nb):
        """Compute, log and return the training loss for one batch."""
        # forward() returns (outputs, loss, logits); the loss is the SECOND item.
        _, train_loss, _ = self(batch)
        self.log_dict({"train_loss": train_loss, "learning_rate":self.hparams.learning_rate}, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return train_loss

    def validation_step(self, batch, batch_nb):
        """Log the validation loss and classification metrics; return the loss."""
        _, val_loss, logits = self(batch)
        metrics = {"val_loss": val_loss, **self._classification_metrics(logits, batch[2])}
        self.log_dict(metrics, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return val_loss

    def test_step(self, batch, batch_nb):
        """Log the test loss and classification metrics (per step and per epoch)."""
        _, test_loss, logits = self(batch)
        metrics = {"test_loss": test_loss, **self._classification_metrics(logits, batch[2])}
        self.log_dict(metrics, on_step=True, on_epoch=True, prog_bar=True, logger=True)

    def predict_step(self, batch, batch_nb):
        """Return the predicted class index per sample as a NumPy array."""
        _, _, logits = self(batch)
        predictions = torch.argmax(logits, dim=1)
        return predictions.detach().cpu().numpy()

    def configure_optimizers(self):
        """Create AdamW (no weight decay on biases / LayerNorm weights) with linear warmup."""
        # Parameters whose names contain these substrings are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']

        optimizer_grouped_parameters = [{'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay':self.hparams.weight_decay}, 
                                        {'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
        # DeepSpeedCPUAdam was referenced here, but its import is commented out
        # in this notebook, so this raised NameError. torch's AdamW matches the
        # `adamw_mode=True` configuration without the DeepSpeed dependency.
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, betas=(0.9, 0.999), eps=self.hparams.eps)

        # Linear warmup followed by linear decay, stepped once per optimizer step.
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.hparams.num_training_steps)
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}

        return [optimizer], [scheduler]

    def freeze(self) -> None:
        """Disable gradients for every parameter except the classifier head."""
        for name, param in self.model.named_parameters():
            if 'classifier' not in name:  # keep the classifier layer trainable
                param.requires_grad = False

        self._frozen = True

    def unfreeze(self) -> None:
        """Re-enable gradients for the parameters frozen by ``freeze``."""
        if self._frozen:
            for name, param in self.model.named_parameters():
                if 'classifier' not in name:  # classifier layer was never frozen
                    param.requires_grad = True

        self._frozen = False

    def train_epoch_start(self):
        """Freeze the encoder for the first ``nr_frozen_epochs`` epochs, then unfreeze it."""
        # `nr_frozen_epochs` is optional; when absent nothing is ever frozen.
        nr_frozen_epochs = getattr(self.hparams, "nr_frozen_epochs", 0)
        if self.current_epoch < nr_frozen_epochs:
            self.freeze()
        else:
            self.unfreeze()

    def on_train_epoch_start(self):
        """pytorch lightning hook"""
        # Lightning only invokes `on_train_epoch_start`; the original method name
        # is kept above so existing direct calls continue to work.
        self.train_epoch_start()

In [12]:
# Repeats the derived settings computed in an earlier cell, with the south CSV
# path written out literally instead of being built from `args`.
args.num_classes = pd.read_csv("../data/processed/south.csv").VENDOR.nunique()
# NOTE(review): 32 equals the batch size; confirm it is the intended multiplier.
args.num_training_steps = len(dm.train_dataloader()) * 32
# Warm up for 10% of the number of training batches (per epoch).
args.warmup_steps = int(len(dm.train_dataloader()) * 10/100)

In [13]:
# Extra fields attached to the shared config before the checkpoint is loaded.
# NOTE(review): presumably read by the contrastive model code -- confirm there.
args.emb_len = 768
args.hidden_dim = 512
# Same value as the default already set in Arguments.__init__.
args.max_seq_length = 512

In [14]:
# NOTE(review): not read anywhere in this notebook; presumably consumed by the
# imported contrastive model / data module -- confirm.
args.nb_triplets = 1

In [16]:
# Load the fine-tuned contrastive classifier and switch it to evaluation mode.
# NOTE(review): the checkpoint path is absolute and machine-specific; it has to
# be edited before this cell can run elsewhere.
model = HTContraClassifierModel.load_from_checkpoint("/workspace/persistent/HTClipper/models/grouped-and-masked/text-baselines/contra-learn/declutr-small/south/pooled/seed:1111/lr-0.0001/coeff-1.0/temp:0.1/CE-SupCon-negatives/final_model.ckpt").eval()

# Alternative: the plain (non-contrastive) classifier checkpoint.
# model = HTClassifierModel.load_from_checkpoint("/workspace/persistent/HTClipper/models/grouped-and-masked/text-baselines/declutr-small/south/seed:1111/lr-0.0001/final_model.ckpt").eval()

  return torch.load(checkpoint_file, map_location="cpu")


# Extracting Embeddings

In [65]:
# City-level datasets (one processed CSV per city, plus Canada).
chicago_df = pd.read_csv("../data/processed/chicago.csv")
atlanta_df = pd.read_csv("../data/processed/atlanta.csv")
dallas_df = pd.read_csv("../data/processed/dallas.csv")
detroit_df = pd.read_csv("../data/processed/detroit.csv")
houston_df = pd.read_csv("../data/processed/houston.csv")
ny_df = pd.read_csv("../data/processed/ny.csv")
sf_df = pd.read_csv("../data/processed/sf.csv")
canada_df = pd.read_csv("../data/processed/canada.csv")

In [10]:
# Region-level datasets (one processed CSV per region).
south_df = pd.read_csv("../data/processed/south.csv")
midwest_df = pd.read_csv("../data/processed/midwest.csv")
west_df = pd.read_csv("../data/processed/west.csv")
northeast_df = pd.read_csv("../data/processed/northeast.csv")

In [11]:
# Notebook-global tokenizer; extract_embeddings() below reads this name directly.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path)



In [12]:
def extract_embedding_of_trained_checkpoints(df, model, tokenizer, city, model_name):
    """Embed a dataset with a fine-tuned model and save train/test tensors to disk.

    Args:
        df: DataFrame with at least ``TEXT`` and ``VENDOR`` columns.
        model: Callable taking an ``(input_ids, attention_mask, labels)`` batch;
            see ``extract_embeddings`` for the expected output layout.
        tokenizer: Hugging Face tokenizer used to encode ``TEXT``.
        city: Dataset name used as the filename prefix.
        model_name: Model tag used in the output directory name.

    Writes ``<city>_data_{train,test}.pt`` and ``<city>_labels_{train,test}.pt``
    under ``../models/pickled/embeddings/grouped-and-masked/trained_<model_name>_all``
    (relative to the current working directory).
    """
    df = df[["TEXT", "VENDOR"]].drop_duplicates()

    # Vendor IDs are arbitrary, so remap them to 0..K-1 in order of first appearance.
    vendors_dict = {}
    for vendor in df.VENDOR.values.tolist():
        vendors_dict.setdefault(vendor, len(vendors_dict))

    # `.map` is a direct per-row lookup and yields a new frame, instead of
    # `replace(..., inplace=True)` on a frame derived from the caller's data.
    df = df.assign(VENDOR=df["VENDOR"].map(vendors_dict))
    train_df, test_df = train_test_split(df, test_size=0.20, random_state=1111)

    directory = os.path.join(os.getcwd(), "../models/pickled/embeddings/grouped-and-masked", "trained_" + model_name + "_all")
    Path(directory).mkdir(parents=True, exist_ok=True)

    for split_name, split_df in (("train", train_df), ("test", test_df)):
        # Pass the tokenizer explicitly so the argument given to this function
        # is the one actually used (it used to be ignored in favour of a global).
        embeddings, labels = extract_embeddings(split_df, model, vendors_dict, tokenizer=tokenizer)
        torch.save(embeddings, os.path.join(directory, city + "_data_" + split_name + ".pt"))
        torch.save(labels, os.path.join(directory, city + "_labels_" + split_name + ".pt"))


def _pool_hidden_states(hidden_states, attention_mask, pooling_type):
    """Pool token-level hidden states into one vector per sequence, ignoring padding."""
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    if pooling_type == "max":
        # Padding positions are set to -inf on a copy so they never win the max.
        return torch.max(hidden_states.masked_fill(mask == 0, float("-inf")), 1)[0]

    # Masked mean; the clamp guards against division by zero for all-padding rows.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = torch.sum(hidden_states * mask, 1) / token_counts
    if pooling_type == "mean":
        return mean_pooled

    # Any other value selects mean-max pooling: both vectors concatenated.
    max_pooled = torch.max(hidden_states.masked_fill(mask == 0, float("-inf")), 1)[0]
    return torch.cat((mean_pooled, max_pooled), 1)


def extract_embeddings(df, model, vendors_dict, device="cpu", pooling_type="mean", tokenizer=None):
    """Tokenize ``df.TEXT`` and return pooled last-layer embeddings with labels.

    Args:
        df: DataFrame with ``TEXT`` and integer ``VENDOR`` columns.
        model: Callable taking an ``(input_ids, attention_mask, labels)`` batch
            whose result satisfies ``result[0][2] == tuple of hidden states``
            (as ``HTClassifierModel.forward`` does). It is not moved to
            ``device`` here, so it must already be on it.
        vendors_dict: Unused; kept for backward compatibility with callers.
        device: Device the tokenized inputs are moved to.
        pooling_type: ``"mean"``, ``"max"``; any other value gives mean-max.
        tokenizer: Tokenizer to use. When ``None`` the notebook-global
            ``tokenizer`` is used, as in the original implementation.

    Returns:
        ``(pooled_outputs, labels)`` tensors with one row per input text.
    """
    if tokenizer is None:
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError("name 'tokenizer' is not defined") from None

    text = df.TEXT.values.tolist()
    vendors = df.VENDOR.values.tolist()

    # Tokenizing the data with padding and truncation
    encodings = tokenizer(text, add_special_tokens=True, max_length=512, padding='max_length', return_token_type_ids=False, truncation=True,
                          return_attention_mask=True, return_tensors='pt')

    # Move the encodings to the device
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    labels = torch.tensor(vendors).to(device)

    # Sequential (unshuffled) batches keep embeddings aligned with the rows of `df`.
    dataset = TensorDataset(input_ids, attention_mask, labels)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

    pooled_output_list, labels_list = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            outputs = model(batch)

            # Last entry of the hidden-state tuple = final layer; indexing it
            # directly avoids stacking (copying) every layer's activations.
            hidden_states = outputs[0][2][-1]

            pooled_output_list.append(_pool_hidden_states(hidden_states, batch[1], pooling_type))
            labels_list.append(batch[2])

    # Concatenate the pooled outputs and labels into tensors
    pooled_outputs = torch.cat(pooled_output_list)
    labels = torch.cat(labels_list)
    return pooled_outputs, labels

In [13]:
# Embed the four regional datasets with the fine-tuned checkpoint; each call
# writes a train and a test file pair (hence two progress bars per call below).
extract_embedding_of_trained_checkpoints(south_df, model, tokenizer, city="south", model_name="declutr")
extract_embedding_of_trained_checkpoints(midwest_df, model, tokenizer, city="midwest", model_name="declutr")
extract_embedding_of_trained_checkpoints(west_df, model, tokenizer, city="west", model_name="declutr")
extract_embedding_of_trained_checkpoints(northeast_df, model, tokenizer, city="northeast", model_name="declutr")

100%|██████████| 353/353 [15:58<00:00,  2.72s/it]
100%|██████████| 89/89 [04:03<00:00,  2.74s/it]
100%|██████████| 215/215 [09:51<00:00,  2.75s/it]
100%|██████████| 54/54 [02:18<00:00,  2.56s/it]
100%|██████████| 82/82 [03:36<00:00,  2.64s/it]
100%|██████████| 21/21 [00:55<00:00,  2.62s/it]
100%|██████████| 65/65 [02:52<00:00,  2.66s/it]
100%|██████████| 17/17 [00:43<00:00,  2.56s/it]


In [69]:
# Embed the city-level datasets with the fine-tuned declutr checkpoint.
extract_embedding_of_trained_checkpoints(chicago_df, model, tokenizer, city="chicago", model_name="declutr")
extract_embedding_of_trained_checkpoints(dallas_df, model, tokenizer, city="dallas", model_name="declutr")
extract_embedding_of_trained_checkpoints(detroit_df, model, tokenizer, city="detroit", model_name="declutr")
extract_embedding_of_trained_checkpoints(houston_df, model, tokenizer, city="houston", model_name="declutr")
extract_embedding_of_trained_checkpoints(atlanta_df, model, tokenizer, city="atlanta", model_name="declutr")
extract_embedding_of_trained_checkpoints(ny_df, model, tokenizer, city="ny", model_name="declutr")
# The SF files were previously saved under the prefix "df" (typo for "sf");
# anything that loads the old "df_*.pt" files must be pointed at "sf_*.pt".
extract_embedding_of_trained_checkpoints(sf_df, model, tokenizer, city="sf", model_name="declutr")
extract_embedding_of_trained_checkpoints(canada_df, model, tokenizer, city="canada", model_name="declutr")

100%|██████████| 176/176 [07:19<00:00,  2.50s/it]
100%|██████████| 44/44 [01:47<00:00,  2.45s/it]
100%|██████████| 102/102 [04:27<00:00,  2.62s/it]
100%|██████████| 26/26 [01:12<00:00,  2.80s/it]
100%|██████████| 39/39 [01:47<00:00,  2.75s/it]
100%|██████████| 10/10 [00:26<00:00,  2.62s/it]
100%|██████████| 128/128 [05:36<00:00,  2.63s/it]
100%|██████████| 32/32 [01:22<00:00,  2.58s/it]
100%|██████████| 124/124 [05:21<00:00,  2.60s/it]
100%|██████████| 31/31 [01:21<00:00,  2.63s/it]
100%|██████████| 65/65 [02:51<00:00,  2.64s/it]
100%|██████████| 17/17 [00:41<00:00,  2.45s/it]
100%|██████████| 82/82 [03:28<00:00,  2.54s/it]
100%|██████████| 21/21 [00:51<00:00,  2.45s/it]
100%|██████████| 29/29 [01:13<00:00,  2.53s/it]
100%|██████████| 8/8 [00:17<00:00,  2.15s/it]


In [67]:
# Stand-alone Canada run; it writes the same files as the Canada call in the
# city-level cell above, so only one of the two needs to be executed.
extract_embedding_of_trained_checkpoints(canada_df, model, tokenizer, city="canada", model_name="declutr")

100%|██████████| 29/29 [01:16<00:00,  2.63s/it]
100%|██████████| 8/8 [00:18<00:00,  2.27s/it]


In [56]:
# Embed the city-level datasets under the "styleEmbedding" tag.
# NOTE(review): no cell in this notebook loads a style-embedding checkpoint;
# `model` and `tokenizer` must already be bound to it when this cell runs.
extract_embedding_of_trained_checkpoints(chicago_df, model, tokenizer, city="chicago", model_name="styleEmbedding")
extract_embedding_of_trained_checkpoints(dallas_df, model, tokenizer, city="dallas", model_name="styleEmbedding")
extract_embedding_of_trained_checkpoints(detroit_df, model, tokenizer, city="detroit", model_name="styleEmbedding")
extract_embedding_of_trained_checkpoints(houston_df, model, tokenizer, city="houston", model_name="styleEmbedding")
extract_embedding_of_trained_checkpoints(atlanta_df, model, tokenizer, city="atlanta", model_name="styleEmbedding")
extract_embedding_of_trained_checkpoints(ny_df, model, tokenizer, city="ny", model_name="styleEmbedding")
# The SF files were previously saved under the prefix "df" (typo for "sf").
extract_embedding_of_trained_checkpoints(sf_df, model, tokenizer, city="sf", model_name="styleEmbedding")
# extract_embedding_of_trained_checkpoints(canada_df, model, tokenizer, city="canada", model_name="styleEmbedding")

100%|██████████| 176/176 [14:24<00:00,  4.91s/it]
100%|██████████| 44/44 [03:22<00:00,  4.60s/it]
100%|██████████| 102/102 [07:35<00:00,  4.47s/it]
100%|██████████| 26/26 [02:00<00:00,  4.65s/it]
100%|██████████| 39/39 [02:57<00:00,  4.56s/it]
100%|██████████| 10/10 [00:43<00:00,  4.39s/it]
100%|██████████| 128/128 [09:47<00:00,  4.59s/it]
100%|██████████| 32/32 [02:22<00:00,  4.45s/it]
100%|██████████| 124/124 [09:51<00:00,  4.77s/it]
100%|██████████| 31/31 [02:21<00:00,  4.55s/it]
100%|██████████| 65/65 [04:53<00:00,  4.51s/it]
100%|██████████| 17/17 [01:12<00:00,  4.25s/it]
100%|██████████| 82/82 [06:13<00:00,  4.55s/it]
100%|██████████| 21/21 [01:29<00:00,  4.25s/it]


# Extracting embeddings from the pretrained (not fine-tuned) checkpoint

In [16]:
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

# Load the model
# NOTE: this rebinds `model`; from here on it is the pretrained
# SentenceTransformer, no longer the fine-tuned checkpoint loaded earlier.
model = SentenceTransformer(args.model_name_or_path)

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/johngiorgi_declutr-small. Creating a new one with MEAN pooling.
  return torch.load(checkpoint_file, map_location="cpu")


In [18]:
# Display the checkpoint name the SentenceTransformer above was built from.
args.model_name_or_path

'johngiorgi/declutr-small'

In [19]:
# Switch to evaluation mode; the cell output lists the Transformer and
# mean-Pooling modules that make up the SentenceTransformer.
model.eval()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [20]:
def extract_embedding_of_pretrained_chechpoints(df, model, city, model_name, output_dir="/workspace/persistent/HTClipper/models/pickled/embeddings/grouped-and-masked/pretrained_declutr"):
    """Embed a dataset with a pretrained encoder and save train/test files to disk.

    Args:
        df: DataFrame with at least ``TEXT`` and ``VENDOR`` columns.
        model: Object with an ``encode(list_of_texts)`` method whose result has
            a ``.shape`` (e.g. a SentenceTransformer).
        city: Dataset name used in the filenames.
        model_name: Model tag used in the filenames.
        output_dir: Directory the files are written to; created if missing.
            Defaults to the location this notebook has always used.

    Writes ``pretrained_checkpoint_<model_name>_<city>_{data,labels}_{train,test}.pt``.
    Unlike the fine-tuned extraction, the original ``VENDOR`` values are stored
    as labels without remapping.
    """
    df = df[["TEXT", "VENDOR"]].drop_duplicates()
    train_df, test_df = train_test_split(df, test_size=0.20, random_state=1111)

    # torch.save does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)
    file_prefix = "pretrained_checkpoint_" + model_name + "_" + city

    for split_name, split_df in (("train", train_df), ("test", test_df)):
        embeddings = model.encode(split_df["TEXT"].to_list())
        labels = torch.tensor(split_df.VENDOR.to_list())
        # One embedding row per label, otherwise the saved files would be misaligned.
        assert embeddings.shape[0] == labels.shape[0]

        torch.save(embeddings, os.path.join(output_dir, file_prefix + "_data_" + split_name + ".pt"))
        torch.save(labels, os.path.join(output_dir, file_prefix + "_labels_" + split_name + ".pt"))

In [22]:
def preprocess_text(text):
    """Normalise a string to whitespace-collapsed ASCII text.

    Steps: NFKC normalisation, emojis rewritten as space-delimited names (when
    the ``emoji`` package is available), every run of non-ASCII characters
    replaced by a space, whitespace collapsed and stripped.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    # Imported here because the notebook's top-level `import emoji` is commented
    # out, which made this function raise NameError on a fresh kernel.
    try:
        import emoji
    except ImportError:
        import warnings
        warnings.warn("The 'emoji' package is not installed; emojis will be dropped instead of converted to text.")
        emoji = None

    # Normalize text
    text = unicodedata.normalize('NFKC', text)

    # Convert emojis to text descriptions
    if emoji is not None:
        text = emoji.demojize(text, delimiters=(" ", " "))

    # Replace every run of non-ASCII characters with a single space
    # (ASCII punctuation is left untouched).
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Replace sequences of whitespace with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [38]:
# Reload the city-level datasets from disk (same files as in the earlier
# extraction section), giving fresh, unmodified frames.
chicago_df = pd.read_csv("../data/processed/chicago.csv")
atlanta_df = pd.read_csv("../data/processed/atlanta.csv")
dallas_df = pd.read_csv("../data/processed/dallas.csv")
detroit_df = pd.read_csv("../data/processed/detroit.csv")
houston_df = pd.read_csv("../data/processed/houston.csv")
ny_df = pd.read_csv("../data/processed/ny.csv")
sf_df = pd.read_csv("../data/processed/sf.csv")
canada_df = pd.read_csv("../data/processed/canada.csv")

In [21]:
# Reload the region-level datasets from disk.
south_df = pd.read_csv("../data/processed/south.csv")
midwest_df = pd.read_csv("../data/processed/midwest.csv")
west_df = pd.read_csv("../data/processed/west.csv")
northeast_df = pd.read_csv("../data/processed/northeast.csv")

In [39]:
# Clean the Canada texts in place; only this dataset is passed through preprocess_text.
canada_df["TEXT"] = canada_df["TEXT"].apply(preprocess_text)

In [41]:
# Embed the city-level datasets under the "styleEmbedding" tag.
# NOTE(review): the files land in the function's `pretrained_declutr` directory
# despite the tag, and `model` is whatever encoder is currently bound.
# "NY"/"SF" are upper-case here but lower-case in the fine-tuned extraction.
extract_embedding_of_pretrained_chechpoints(chicago_df, model, "chicago", "styleEmbedding")
extract_embedding_of_pretrained_chechpoints(atlanta_df, model, "atlanta", "styleEmbedding")
extract_embedding_of_pretrained_chechpoints(dallas_df, model, "dallas", "styleEmbedding")
extract_embedding_of_pretrained_chechpoints(detroit_df, model, "detroit", "styleEmbedding")
extract_embedding_of_pretrained_chechpoints(houston_df, model, "houston", "styleEmbedding")
extract_embedding_of_pretrained_chechpoints(ny_df, model, "NY", "styleEmbedding")
extract_embedding_of_pretrained_chechpoints(sf_df, model, "SF", "styleEmbedding")

In [42]:
# Canada is embedded separately, after its TEXT column was cleaned with preprocess_text.
extract_embedding_of_pretrained_chechpoints(canada_df, model, "canada", "declutr")

In [22]:
# Embed the four regional datasets with the pretrained declutr encoder.
extract_embedding_of_pretrained_chechpoints(south_df, model, "south", "declutr")
extract_embedding_of_pretrained_chechpoints(midwest_df, model, "midwest", "declutr")
extract_embedding_of_pretrained_chechpoints(west_df, model, "west", "declutr")
extract_embedding_of_pretrained_chechpoints(northeast_df, model, "northeast", "declutr")