In [2]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [4]:
train_df = pd.read_csv("../data/processed/train.csv").head(100)
train_df.head()

Unnamed: 0,Clothing ID,Age,Positive Feedback Count,Division Name,Department Name,Class Name,text,labels
0,0,25,4,General,Bottoms,Skirts,"title: 3-season skirt! [SEP] review: Adorable,...",1
1,0,39,0,General,Bottoms,Skirts,title: Very cute [SEP] review: Love the asymme...,1
2,0,42,5,General,Bottoms,Skirts,title: Beautiful! fruns small for typical reta...,1
3,0,45,9,General,Bottoms,Skirts,title: none [SEP] review: I was really pleased...,1
4,0,57,1,General,Bottoms,Skirts,"title: Unique, pretty asymmetric skirt [SEP] r...",1


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained("../models/checkpoint-375").to(device)
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [6]:
model_name = "distilbert/distilbert-base-uncased"

class EmbedDataset(Dataset):
    def __init__(self, texts: pd.core.frame.DataFrame):
        self.texts = texts
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=False)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        token = self.tokenizer(
            self.texts[idx],
            max_length=192,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {key: value.squeeze(0).to(device) for key, value in token.items()}

In [7]:
from IPython.core.debugger import Pdb

def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    return exp_logits / exp_logits.sum(axis=-1, keepdims=True)

def predict_sentiment(dataloader):
    probabilities_list = []
    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        with torch.no_grad():
            outputs = model(**batch)
            logits = outputs.logits.cpu().numpy()
            # Pdb().set_trace()
            probabilities = softmax(logits)
            probabilities_list.append(probabilities[:, 1])

    probabilities_array = np.concatenate(probabilities_list, axis=0)
    train_df["probability"] = probabilities_array

In [8]:
train_dataset = EmbedDataset(train_df["text"])
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)

In [9]:
predict_sentiment(train_dataloader)

  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
train_df.head()

Unnamed: 0,Clothing ID,Age,Positive Feedback Count,Division Name,Department Name,Class Name,text,labels,probability
0,0,25,4,General,Bottoms,Skirts,"title: 3-season skirt! [SEP] review: Adorable,...",1,0.993447
1,0,39,0,General,Bottoms,Skirts,title: Very cute [SEP] review: Love the asymme...,1,0.99457
2,0,42,5,General,Bottoms,Skirts,title: Beautiful! fruns small for typical reta...,1,0.994842
3,0,45,9,General,Bottoms,Skirts,title: none [SEP] review: I was really pleased...,1,0.876926
4,0,57,1,General,Bottoms,Skirts,"title: Unique, pretty asymmetric skirt [SEP] r...",1,0.993074


In [11]:
train_df.to_csv("../data/processed/train.csv", header=True, index=False)

In [16]:
df = pd.read_csv("../data/processed/test.csv")
df.head()

Unnamed: 0,Clothing ID,Age,Positive Feedback Count,Division Name,Department Name,Class Name,text,probability
0,0,32,0,General,Bottoms,Skirts,title: So happy i bought this skirt! [SEP] rev...,0.993913
1,0,34,0,General,Bottoms,Skirts,title: Runs small [SEP] review: Beautiful patt...,0.896281
2,0,37,0,General,Bottoms,Skirts,title: Love the comfort of thi skirt [SEP] rev...,0.994328
3,0,39,10,General,Bottoms,Skirts,title: Way too small [SEP] review: This is a b...,0.907166
4,0,39,0,General,Bottoms,Skirts,title: none [SEP] review: I usually wear a siz...,0.994283
