In [None]:
!pip install transformers



### Necessary packages

In [None]:
import torch 
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader,SubsetRandomSampler
import torch.optim as optim

import os
import copy
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pylab import rcParams
import csv
import time
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from transformers import AutoTokenizer,AutoModel,AutoModelForSequenceClassification,AutoConfig,AdamW,get_linear_schedule_with_warmup

# Seed every random number generator the notebook imports so that the
# train/validation split and the training run are repeatable.
seed_val = 42
random.seed(seed_val)  # Python's own RNG: imported above but previously left unseeded
np.random.seed(seed_val)
torch.manual_seed(seed_val)
# Seed the current CUDA device and then all CUDA devices.
torch.cuda.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Pick the compute device once. Both branches produce a torch.device so that
# `device` has the same type whether or not a GPU is present.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Running on gpu",torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('No GPU found Running on cpu')

Running on gpu Tesla P100-PCIE-16GB


In [None]:
# Mount Google Drive at /content/drive; the dataset directory and the saved
# model / submission files used later in this notebook all live under that path.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory on the mounted Drive that holds the challenge CSV files.
dataset_dir = "/content/drive/MyDrive/AmazonMLChallenge/dataset"

# The three files are read with the same dialect: quoting disabled and a
# backslash as the escape character.
train_df, test_df, sample_df = (
    pd.read_csv(f"{dataset_dir}/{file_name}", escapechar="\\", quoting=csv.QUOTE_NONE)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
train_df['TITLE'].isnull().sum()

71

In [None]:
len(train_df)-len(train_df.drop_duplicates())

101494

In [None]:
train_df.shape

(2903024, 5)

In [None]:
# Remove rows that are exact duplicates across all columns, then display the
# resulting shape. `train_df` is rebound, so the raw frame is no longer available.
train_df = train_df.drop_duplicates()
train_df.shape

(2801530, 5)

In [None]:
# Keep only rows that have a TITLE; the title is the text fed to the model below.
train_df = train_df[train_df['TITLE'].notnull()]

In [None]:
train_df['TITLE'].isnull().sum()

0

In [None]:
# Replace the raw BROWSE_NODE_ID values with encoded class indices and show the
# largest index. `le` is kept because the submission cell uses
# le.inverse_transform to map predictions back to the original node ids.
le = LabelEncoder()
train_df['BROWSE_NODE_ID'] = le.fit_transform(train_df['BROWSE_NODE_ID'])
train_df['BROWSE_NODE_ID'].max()

9918

In [None]:
# Pull the model inputs (titles) and targets (encoded node ids) out as arrays.
sentences = train_df['TITLE'].values 
labels = train_df['BROWSE_NODE_ID'].values

In [None]:
print(sentences.shape,labels.shape)

(2801467,) (2801467,)


In [None]:
# Hold out 10% of the rows for validation; random_state makes the split repeatable.
train_sentences,val_sentences,train_labels,val_labels = train_test_split(sentences,labels,test_size = 0.1,random_state=seed_val)

In [None]:
print(f"No. of training sentences {len(train_sentences)}")
print(f"No. of validation sentences {len(val_sentences)}")

No. of training sentences 2521320
No. of validation sentences 280147


In [None]:
train_df.memory_usage(deep= True)*(1e-6)

Index               22.411736
TITLE              410.878601
DESCRIPTION       1593.625925
BULLET_POINTS     1398.700015
BRAND              183.974600
BROWSE_NODE_ID      22.411736
dtype: float64

In [None]:
# indices , cnts = np.unique(labels,return_counts=True)

In [None]:
# sns.countplot(y = cnts[ (cnts >=10) & (cnts <=100)] )

In [None]:
# Run configuration shared by the tokenizer, the datasets and the loaders:
#   model_name       - pretrained checkpoint name passed to the Auto* loaders
#   max_input_length - max_length handed to the tokenizer for every title
#   batch_size       - batch size of every DataLoader built below
model_name = 'xlm-roberta-base'
max_input_length = 128
batch_size = 64 

### Data Preprocessing

In [None]:
# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Sanity check: tokenize one arbitrary title and show its sub-word pieces and
# their vocabulary ids (no special tokens are added by these two calls).
idx = 1000
sample_text = sentences[idx]
tokens =tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print('Sample text {}'.format(sample_text))
print('Tokens {}'.format(tokens))
print('Token IDS {}'.format(token_ids))

Sample text Generic Ladies 3 x 4 Hooks Adjustable Bra Back Extenders Multicolor 5Pcs
Tokens ['▁Gener', 'ic', '▁Ladies', '▁3', '▁x', '▁4', '▁Hoo', 'ks', '▁Ad', 'just', 'able', '▁Bra', '▁Back', '▁Ex', 'tender', 's', '▁Multi', 'color', '▁5', 'P', 'cs']
Token IDS [88342, 1771, 190387, 138, 1022, 201, 39016, 1224, 3145, 20314, 2886, 6163, 26828, 5443, 132297, 7, 19335, 46133, 190, 683, 4439]


In [None]:
tokenizer.sep_token,tokenizer.sep_token_id

('</s>', 2)

In [None]:
tokenizer.cls_token,tokenizer.cls_token_id

('<s>', 0)

In [None]:
tokenizer.pad_token,tokenizer.pad_token_id

('<pad>', 1)

In [None]:
tokenizer.unk_token,tokenizer.unk_token_id

('<unk>', 3)

In [None]:
# Encode the sample title into fixed-length model inputs: special tokens added,
# padded up to max_input_length, anything longer cut off, PyTorch tensors returned.
encoding = tokenizer.encode_plus(
    sample_text,
    max_length = max_input_length,
    add_special_tokens = True,
    padding = 'max_length',   # replaces the deprecated pad_to_max_length=True
    truncation = True,        # explicit, so the tokenizer no longer has to guess and warn
    return_attention_mask = True,
    return_token_type_ids = False,
    return_tensors = 'pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
encoding

{'input_ids': tensor([[     0,  88342,   1771, 190387,    138,   1022,    201,  39016,   1224,
           3145,  20314,   2886,   6163,  26828,   5443, 132297,      7,  19335,
          46133,    190,    683,   4439,      2,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              

In [None]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
base_model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
base_model(**encoding)

BaseModelOutputWithPoolingAndCrossAttentions([('last_hidden_state',
                                               tensor([[[ 0.0558,  0.0823,  0.0636,  ..., -0.0622,  0.0629,  0.0133],
                                                        [-0.1108,  0.0240, -0.0389,  ..., -0.1025, -0.0662,  0.2437],
                                                        [-0.0655,  0.0155,  0.0229,  ...,  0.2816, -0.0189,  0.1501],
                                                        ...,
                                                        [-0.0231,  0.0338,  0.0545,  ..., -0.0503, -0.0094, -0.0327],
                                                        [-0.0231,  0.0338,  0.0545,  ..., -0.0503, -0.0094, -0.0327],
                                                        [-0.0231,  0.0338,  0.0545,  ..., -0.0503, -0.0094, -0.0327]]],
                                                      grad_fn=<NativeLayerNormBackward>)),
                                              ('pooler_output',
      

In [None]:
# Build the model configuration with one output label per distinct encoded class.
config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name,
                                    num_labels=len(np.unique(labels)))

print('Config type:', str(type(config)), '\n')



# Load the model from the transformers library using AutoModelForSequenceClassification.

# Load the pre-trained model for classification, passing in the `config` from above.
model = AutoModelForSequenceClassification.from_pretrained(
                                            pretrained_model_name_or_path=model_name,
                                            config = config
                                        )

Config type: <class 'transformers.models.xlm_roberta.configuration_xlm_roberta.XLMRobertaConfig'> 



Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

## Choosing token length

In [None]:
print(model(**encoding))

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0262, -0.0387,  0.4675,  ..., -0.1051, -0.1080,  0.0812]],
       grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)


In [None]:
# token_lens = []
# for txt in sentences:
#     tokens = tokenizer.encode(txt,max_length=512)
#     token_lens.append(len(tokens))

In [None]:
# sns.displot(token_lens)

In [None]:
class AmazonDataset(Dataset):
  """Map-style dataset that tokenizes one product title per item, on demand.

  Args:
    sentences: indexable collection of titles; each item is passed through str().
    labels: indexable collection of class indices aligned with `sentences`.
      Only read when `with_labels` is True, so it may be None otherwise.
    tokenizer: object exposing a Hugging Face style `encode_plus` method.
    max_length: fixed token length; shorter titles are padded up to it and
      longer ones are truncated down to it.
    with_labels: when True each item also carries a 'labels' tensor.
  """

  def __init__(self, sentences, labels, tokenizer, max_length,with_labels=True):
    self.sentences = sentences
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.with_labels = with_labels

  def __len__(self):
    return len(self.sentences)

  def __getitem__(self, idx):
    """Return a dict with 'sentence', 'input_ids', 'attention_mask' and, if enabled, 'labels'."""
    sentence = str(self.sentences[idx])
    encoding = self.tokenizer.encode_plus(
      sentence,
      add_special_tokens=True,
      max_length=self.max_length,
      return_token_type_ids=False,
      padding='max_length',  # replaces the deprecated pad_to_max_length=True
      truncation=True,       # explicit: guarantees every item has exactly max_length tokens
      return_attention_mask=True,
      return_tensors='pt',
    )

    # The tokenizer returns tensors with a leading batch axis of size 1;
    # flatten() drops it so the DataLoader can stack items into a batch.
    item = {
      'sentence': sentence,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
    }
    if self.with_labels:
      item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
    return item

In [None]:
def create_data_loaders(sentences,labels,tokenizer,max_input_length,batch_size,with_labels,shuffle=False):
    """Wrap titles (and optionally labels) in an AmazonDataset and return a batched DataLoader.

    Args:
        sentences: indexable collection of titles.
        labels: indexable collection of class indices aligned with `sentences`
            (only read by the dataset when `with_labels` is True).
        tokenizer: tokenizer handed to the dataset.
        max_input_length: fixed token length handed to the dataset.
        batch_size: number of items per batch.
        with_labels: whether each batch should contain a 'labels' entry.
        shuffle: reshuffle the data at the start of every epoch. Defaults to
            False, which keeps the loader in dataset order; pass True for a
            training loader.

    Returns:
        A torch DataLoader over the dataset.
    """
    ds = AmazonDataset(
        sentences =sentences,
        labels=labels,
        tokenizer=tokenizer,
        max_length=max_input_length,
        with_labels = with_labels
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle
    )

In [None]:
# Batched loader over the training split.
# NOTE(review): this call does not request shuffling, so the loader yields the
# training rows in the same order on every epoch - confirm that is intended.
train_loader = create_data_loaders(
    train_sentences,
    train_labels,
    tokenizer,
    max_input_length=max_input_length,
    batch_size=batch_size,
    with_labels = True
)

# Batched loader over the held-out validation split.
val_loader = create_data_loaders(
    val_sentences,
    val_labels,
    tokenizer,
    max_input_length=max_input_length,
    batch_size=batch_size,
    with_labels = True
)

In [None]:
class AmazonClassifier(nn.Module):
  """Pretrained encoder followed by dropout and a linear classification head.

  Args:
    base_model_name: checkpoint name passed to AutoModel.from_pretrained.
    n_classes: number of output classes.
  """

  def __init__(self,base_model_name, n_classes):
    super().__init__()
    self.base_model = AutoModel.from_pretrained(base_model_name)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.base_model.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return one row of class logits per input sequence, shape (batch, n_classes)."""
    hidden_states = self.base_model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )['last_hidden_state']
    # last_hidden_state holds one vector per token: (batch, seq_len, hidden).
    # Feeding it to the head unchanged would give per-token logits, so keep only
    # the vector of the first token (the <s> token the tokenizer puts in front)
    # as the representation of the whole sequence.
    sentence_embedding = hidden_states[:, 0]
    return self.out(self.drop(sentence_embedding))

In [None]:
# from transformers import AutoConfig
# config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name,num_labels = len(np.unique(labels)))

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name,config = config)

In [None]:
# model = AmazonClassifier(base_model_name=model_name,n_classes=len(np.unique(labels)))

In [None]:
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [None]:
# Number of full passes over the training loader.
num_epochs = 4

# NOTE(review): this is the AdamW imported from transformers in the import cell.
# It is believed to be deprecated upstream in favour of torch.optim.AdamW -
# confirm against the installed transformers version before re-running.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# train_epoch steps the scheduler once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_loader) * num_epochs

# Linear learning-rate schedule over total_steps with zero warmup steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)


In [None]:
import numpy as np
def check_accuracy(predictions,true_labels):
    """
    Flat accuracy over a whole epoch, computed from per-batch outputs.

    predictions: sequence of 2-D score arrays, one per batch (rows = examples).
    true_labels: sequence of 1-D label arrays, one per batch.
    Returns the fraction of examples whose highest-scoring class equals the label.
    """
    # Stitch the per-batch arrays into one array each.
    stacked_scores = np.concatenate(predictions, axis=0)
    stacked_targets = np.concatenate(true_labels, axis=0)

    # The predicted class of a row is the column with the largest score.
    predicted_classes = stacked_scores.argmax(axis=1).flatten()

    return np.mean(predicted_classes == stacked_targets)

In [None]:
def train_epoch(
  model,
  data_loader,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over `data_loader`.

  Args:
    model: module called as model(input_ids=..., attention_mask=..., labels=...)
      whose output exposes `.loss` and `.logits`.
    data_loader: iterable of batches with 'input_ids', 'attention_mask' and
      'labels' tensors; must support len() for the progress message.
    optimizer: optimizer over the model's parameters.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler, stepped once per batch.
    n_examples: number of examples in the loader, used as the accuracy denominator.

  Returns:
    (accuracy, total_loss): fraction of correctly classified examples and the
    SUM of the per-batch losses (not their average).
  """
  model = model.train()

  total_train_loss = 0
  correct_predictions = 0

  # Start from clean gradients: an earlier run that was interrupted between
  # backward() and zero_grad() would otherwise leak its gradients into the
  # first update of this epoch.
  optimizer.zero_grad()

  for i,d in enumerate(data_loader):
    if i%100 == 0:
        print(f"Processing batch {i+1}/{len(data_loader)}")
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    labels = d["labels"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask,
      labels = labels
    )
    loss = outputs.loss

    # Pick the highest-scoring class on the device itself, so only a single
    # count (not the full logits matrix) is copied back to the host.
    preds = outputs.logits.argmax(dim=1)
    correct_predictions += (preds == labels).sum().item()

    # Accumulate the summed loss over all batches.
    total_train_loss += loss.item()

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions/n_examples, total_train_loss

In [None]:
def eval_model(model, data_loader, device, n_examples):
  """Score `model` on every batch of `data_loader` without tracking gradients.

  Each batch must provide 'input_ids', 'attention_mask' and 'labels'; the model
  output must expose `.loss` and `.logits`.

  Returns (accuracy, total_loss): correct predictions divided by `n_examples`,
  and the sum of the per-batch losses.
  """
  model = model.eval()

  loss_sum = 0
  n_correct = 0

  with torch.no_grad():
    for step, batch in enumerate(data_loader):
      if step%100 == 0:
          print(f"Processing batch {step+1}/{len(data_loader)}")

      ids_on_device = batch["input_ids"].to(device)
      mask_on_device = batch["attention_mask"].to(device)
      targets = batch["labels"].to(device)

      result = model(
        input_ids=ids_on_device,
        attention_mask=mask_on_device,
        labels = targets
      )

      # Compare on the host: the arg-max class of each logits row vs. the label.
      scores = result.logits.detach().cpu().numpy()
      predicted = np.argmax(scores, axis=1).flatten()
      n_correct += (predicted == targets.to('cpu').numpy()).sum()

      loss_sum += result.loss.item()

  return n_correct/n_examples, loss_sum

In [None]:
# Baseline: score the model on the validation split. In notebook order this cell
# runs before the training loop, i.e. with the classification head still untrained.
val_acc, val_loss = eval_model(
    model,
    val_loader,
    device, 
    len(val_labels)
)

print(f'Val   loss {val_loss} accuracy {val_acc}')
print()




Processing batch 1/4378
Processing batch 101/4378
Processing batch 201/4378
Processing batch 301/4378
Processing batch 401/4378
Processing batch 501/4378
Processing batch 601/4378
Processing batch 701/4378
Processing batch 801/4378
Processing batch 901/4378
Processing batch 1001/4378
Processing batch 1101/4378
Processing batch 1201/4378
Processing batch 1301/4378
Processing batch 1401/4378
Processing batch 1501/4378
Processing batch 1601/4378
Processing batch 1701/4378
Processing batch 1801/4378
Processing batch 1901/4378
Processing batch 2001/4378
Processing batch 2101/4378
Processing batch 2201/4378
Processing batch 2301/4378
Processing batch 2401/4378
Processing batch 2501/4378
Processing batch 2601/4378
Processing batch 2701/4378
Processing batch 2801/4378
Processing batch 2901/4378
Processing batch 3001/4378
Processing batch 3101/4378
Processing batch 3201/4378
Processing batch 3301/4378
Processing batch 3401/4378
Processing batch 3501/4378
Processing batch 3601/4378
Processing ba

In [None]:
%%time

# Per-epoch metrics, keyed 'train_acc' / 'train_loss' / 'val_acc' / 'val_loss'.
# The loss values are the summed per-batch losses returned by train_epoch / eval_model.
history = defaultdict(list)
best_accuracy = 0

for epoch in tqdm(range(num_epochs)):

  print(f'Epoch {epoch + 1}/{num_epochs}')
  print('-' * 10)

  # One optimisation pass over the training loader.
  train_acc, train_loss = train_epoch(
    model,
    train_loader,     
    optimizer, 
    device, 
    scheduler, 
    len(train_labels)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Score the updated model on the held-out split.
  val_acc, val_loss = eval_model(
    model,
    val_loader,
    device, 
    len(val_labels)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # Checkpoint only when validation accuracy beats the best value seen so far.
  # The file is written to the current working directory, not to Drive.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc




Epoch 1/4
----------
Processing batch 1/39396
Processing batch 101/39396
Processing batch 201/39396
Processing batch 301/39396
Processing batch 401/39396
Processing batch 501/39396
Processing batch 601/39396
Processing batch 701/39396
Processing batch 801/39396
Processing batch 901/39396
Processing batch 1001/39396
Processing batch 1101/39396
Processing batch 1201/39396
Processing batch 1301/39396
Processing batch 1401/39396
Processing batch 1501/39396
Processing batch 1601/39396
Processing batch 1701/39396
Processing batch 1801/39396
Processing batch 1901/39396
Processing batch 2001/39396
Processing batch 2101/39396
Processing batch 2201/39396
Processing batch 2301/39396
Processing batch 2401/39396
Processing batch 2501/39396
Processing batch 2601/39396
Processing batch 2701/39396
Processing batch 2801/39396
Processing batch 2901/39396
Processing batch 3001/39396
Processing batch 3101/39396
Processing batch 3201/39396
Processing batch 3301/39396
Processing batch 3401/39396
Processing 

KeyboardInterrupt: ignored

In [None]:
# Score the model, with whatever weights it currently holds, on the validation split.
val_acc, val_loss = eval_model(model, val_loader, device, len(val_labels))

print(f'Val   loss {val_loss} accuracy {val_acc}')



Processing batch 1/4378
Processing batch 101/4378
Processing batch 201/4378
Processing batch 301/4378
Processing batch 401/4378
Processing batch 501/4378
Processing batch 601/4378
Processing batch 701/4378
Processing batch 801/4378
Processing batch 901/4378
Processing batch 1001/4378
Processing batch 1101/4378
Processing batch 1201/4378
Processing batch 1301/4378
Processing batch 1401/4378
Processing batch 1501/4378
Processing batch 1601/4378
Processing batch 1701/4378
Processing batch 1801/4378
Processing batch 1901/4378
Processing batch 2001/4378
Processing batch 2101/4378
Processing batch 2201/4378
Processing batch 2301/4378
Processing batch 2401/4378
Processing batch 2501/4378
Processing batch 2601/4378
Processing batch 2701/4378
Processing batch 2801/4378
Processing batch 2901/4378
Processing batch 3001/4378
Processing batch 3101/4378
Processing batch 3201/4378
Processing batch 3301/4378
Processing batch 3401/4378
Processing batch 3501/4378
Processing batch 3601/4378
Processing ba

In [None]:
# Accuracy per epoch for both splits, drawn on a fixed 0-1 y-axis.
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

plt.gca().set(title='Training history', ylabel='Accuracy', xlabel='Epoch', ylim=[0, 1])
plt.legend();

In [None]:
# Persist the current weights (state_dict only, not the full module) to Drive.
torch.save(model.state_dict(),'/content/drive/MyDrive/AmazonMLChallenge/xlm_roberta_model_2.pth')

In [None]:
class AmazonDataset_test(Dataset):
  """Unlabeled dataset for inference: tokenizes one title per item and carries its id.

  Args:
    ids: indexable collection of row identifiers aligned with `sentences`.
    sentences: indexable collection of titles; each item is passed through str().
    tokenizer: object exposing a Hugging Face style `encode_plus` method.
    max_length: fixed token length; shorter titles are padded up to it and
      longer ones are truncated down to it.
  """

  def __init__(self,ids, sentences, tokenizer, max_length):
    self.sentences = sentences
    self.ids = ids
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.sentences)

  def __getitem__(self, idx):
    """Return a dict with 'sentence', 'id', 'input_ids' and 'attention_mask'."""
    sentence = str(self.sentences[idx])
    encoding = self.tokenizer.encode_plus(
      sentence,
      add_special_tokens=True,
      max_length=self.max_length,
      return_token_type_ids=False,
      padding='max_length',  # replaces the deprecated pad_to_max_length=True
      truncation=True,       # explicit: guarantees every item has exactly max_length tokens
      return_attention_mask=True,
      return_tensors='pt',
    )

    # flatten() drops the leading batch axis of size 1 added by return_tensors='pt'.
    return {
        'sentence': sentence,
        'id':self.ids[idx],
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
    }

In [None]:
# Wrap the test titles together with their PRODUCT_IDs, then batch them in
# dataset order so predictions line up with the ids.
ds_test = AmazonDataset_test(
        sentences =test_df['TITLE'].values,
        ids = test_df['PRODUCT_ID'].values,
        tokenizer=tokenizer,
        max_length=max_input_length,
    )
test_loader = DataLoader(
        ds_test,
        batch_size=batch_size
    )

In [None]:
def get_predictions(model, data_loader, device=None):
  """Predict a class index for every item of an unlabeled loader.

  Args:
    model: module called as model(input_ids=..., attention_mask=...) whose
      output exposes `.logits`.
    data_loader: iterable of batches with 'sentence', 'id', 'input_ids' and
      'attention_mask' entries; must support len() for the progress message.
      'id' is expected to be a tensor - TODO confirm for non-numeric ids.
    device: device the batch tensors are moved to. When omitted, the device of
      the model's first parameter is used (CPU if the model has no parameters),
      so the function no longer relies on a notebook-level `device` variable.

  Returns:
    (ids, sentences, predictions): three lists in loader order holding the item
    ids, the raw titles and the predicted class indices.
  """
  model = model.eval()
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

  ids = []
  sentences = []
  predictions = []

  with torch.no_grad():
    for i,d in enumerate(data_loader):
      if i%100 == 0:
        print(f"Processing batch {i+1}/{len(data_loader)}")

      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      # Take the arg-max on the device so that only one index per row, not the
      # whole logits matrix, is copied back to the host.
      preds = outputs.logits.argmax(dim=1).cpu().numpy()
      batch_ids = d['id'].detach().cpu().numpy()

      sentences.extend(d["sentence"])
      predictions.extend(preds)
      ids.extend(batch_ids)

  return ids,sentences, predictions

In [None]:
ids,sents,predictions = get_predictions(model,test_loader)



Processing batch 1/1731
Processing batch 101/1731
Processing batch 201/1731
Processing batch 301/1731
Processing batch 401/1731
Processing batch 501/1731
Processing batch 601/1731
Processing batch 701/1731
Processing batch 801/1731
Processing batch 901/1731
Processing batch 1001/1731
Processing batch 1101/1731
Processing batch 1201/1731
Processing batch 1301/1731
Processing batch 1401/1731
Processing batch 1501/1731
Processing batch 1601/1731
Processing batch 1701/1731


In [None]:
# Assemble the submission table: one row per test product, with the predicted
# class index mapped back to its original BROWSE_NODE_ID by the fitted encoder.
df_sub = pd.DataFrame({
    'PRODUCT_ID':ids,
    'BROWSE_NODE_ID':le.inverse_transform(predictions)
})
df_sub

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,1140
1,2,15772
2,3,113
3,4,125
4,5,8915
...,...,...
110770,110771,4368
110771,110772,13568
110772,110773,13520
110773,110774,800


In [None]:
# Write the submission to Drive without the DataFrame index column.
df_sub.to_csv('/content/drive/MyDrive/AmazonMLChallenge/submission_xlm_roberta.csv',index=False)