# Inference

In [1]:
!pip install huggingface_hub
from huggingface_hub import login

api_token = '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'
login(api_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
!pip install nltk



In [3]:
import random
import nltk
from nltk.corpus import wordnet as wn
import re
# Fetch the NLTK corpora required by the WordNet lookups below
# (a no-op when the packages are already up to date).
for nltk_package in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

def get_synonym(word):
    """Return a random WordNet synonym for ``word``.

    Every lemma name of every synset found for ``word`` is a candidate,
    except ``word`` itself.

    Args:
        word: The token to look up.

    Returns:
        One candidate chosen with ``random.choice``, or ``word`` unchanged
        when WordNet has no synsets for it or no lemma other than the word
        itself. Lemma names are returned exactly as WordNet stores them
        (multi-word entries are joined with underscores).
    """
    # A set comprehension flattens the per-synset lemma lists and removes
    # duplicates without needing itertools.chain, which this notebook never
    # imported (the previous implementation raised NameError here).
    candidates = {
        lemma
        for synset in wn.synsets(word)
        for lemma in synset.lemma_names()
    }
    candidates.discard(word)  # Avoid returning the same word
    if not candidates:
        return word
    # Sort before choosing: set iteration order for strings varies between
    # interpreter runs, which would make the pick irreproducible even with
    # a fixed random seed.
    return random.choice(sorted(candidates))

def augment_text(text):
    """Randomly swap words of ``text`` for synonyms.

    The text is split on whitespace; each token independently has a 30%
    chance of being replaced by ``get_synonym(token)``. The tokens are then
    re-joined with single spaces.

    Args:
        text: The string to augment.

    Returns:
        The augmented string.
    """
    return ' '.join(
        get_synonym(token) if random.random() < 0.3 else token
        for token in text.split()
    )

def augment_dataframe(df, fraction):
    """Apply ``augment_text`` to a random fraction of the rows of ``df``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``'text'`` column.
        fraction: Share of rows (0.0 - 1.0) to augment, passed to
            ``DataFrame.sample(frac=...)``.

    Returns:
        The same ``df`` object, with the sampled rows' ``'text'`` replaced.
    """
    to_augment = df.sample(frac=fraction).index
    # Select rows by index label. The previous implementation tested the text
    # *value* against the sampled index labels, which never matched, so no row
    # was ever augmented.
    selected = df.index.isin(to_augment)
    if selected.any():
        df.loc[selected, 'text'] = df.loc[selected, 'text'].apply(augment_text)
    return df

class TextCleaner():
    """Stateless helper that normalises raw text before tokenisation."""

    def __init__(self):
        """Nothing to configure; the cleaner keeps no state."""

    def clean_text(self, text):
        """Return a lower-cased, stripped-down version of ``text``.

        ``text`` is converted with ``str()`` first, so non-string values are
        accepted. The substitutions below are then applied in order and
        leading/trailing whitespace is removed.
        """
        substitutions = (
            (r'<.*?>', ''),             # HTML-like tags
            (r'http\S+', ''),           # URLs
            (r"[^a-zA-Z0-9\s]", ""),    # anything but letters, digits, whitespace
            (r"\s+", " "),              # collapse whitespace runs
        )
        cleaned = str(text).lower()
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()


cleaner = TextCleaner()

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [6]:
import pandas as pd
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification
from datasets import Dataset

# Hugging Face Hub repository that holds the fine-tuned classifier
model_name = "pilotj/roberta-base-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Use the first GPU when one is visible; otherwise stay on the CPU so the
# cell does not crash on a machine without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)

# Set the model to evaluation mode
model.eval()

# Load the test data and normalise the text column
test_data = pd.read_csv("/kaggle/input/fibe-dataset-v2/dataset/test.csv", encoding='latin-1')
test_data['text'] = test_data['text'].apply(cleaner.clean_text)
# NOTE(review): this applies random synonym replacement to the *test* texts
# (in place) before they are scored, and no random seed is set, so the inputs
# differ between runs. Confirm that test-time augmentation is intended.
augment_dataframe(test_data, fraction=0.30)

test_dataset = Dataset.from_pandas(test_data[['text']])

def preprocess_function(examples):
    """Tokenise a batch of texts, padded/truncated to exactly 512 tokens."""
    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=512)

test_inputs = test_dataset.map(preprocess_function, batched=True, batch_size=64)

# Expose only the model inputs, as torch tensors
test_inputs.set_format(type='torch', columns=['input_ids', 'attention_mask'])
test_inputs

Map:   0%|          | 0/174382 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 174382
})

In [7]:
# Class names taken from the training file. unique() keeps the order of first
# appearance, and a label's position in this list is used as its class id
# when predictions are decoded.
train_set = pd.read_csv(
    "/kaggle/input/fibe-dataset-v2/dataset/train.csv",
    encoding='latin-1',
)
unique_targets = train_set["target"].unique()
targets_list = unique_targets.tolist()

In [9]:
# Reference label order. targets_list is indexed by predicted class id, so
# its order must match this list exactly.
# NOTE(review): presumably this is the order used when the model was trained;
# confirm against the training notebook.
targets_list_dum = [
    'academic interests',
    'arts and culture',
    'automotives',
    'books and literature',
    'business and finance',
    'careers',
    'family and relationships',
    'food and drinks',
    'health',
    'healthy living',
    'hobbies and interests',
    'home and garden',
    'movies',
    'music and audio',
    'news and politics',
    'personal finance',
    'pets',
    'pharmaceuticals, conditions, and symptoms',
    'real estate',
    'shopping',
    'sports',
    'style and fashion',
    'technology and computing',
    'television',
    'travel',
    'video gaming',
]
# Fail fast: a mismatch would otherwise only be displayed and every prediction
# would silently be decoded to the wrong label.
assert targets_list_dum == targets_list, (
    "Label order derived from train.csv differs from the expected order"
)
targets_list_dum == targets_list

True

In [10]:
import pandas as pd
import torch

# Empty frame with the two output columns; it is passed to inference_fn as the
# starting `result` that the predictions are concatenated onto.
result = pd.DataFrame(columns=["target", "Index"])

def inference_fn(test_inputs, test_data, model, result, batch):
    """Classify ``test_data`` in mini-batches and append the predictions to ``result``.

    Args:
        test_inputs: Mapping-like object whose ``'input_ids'`` and
            ``'attention_mask'`` entries can be sliced by row and moved to a
            device with ``.to(...)``. Rows must be in the same order as
            ``test_data``.
        test_data: DataFrame with an ``'Index'`` column identifying each row.
        model: Classifier called as
            ``model(input_ids=..., attention_mask=...)`` that returns an
            object with a ``logits`` attribute of shape (rows, classes).
        result: DataFrame the new rows are appended to (not modified).
        batch: Number of rows scored per forward pass.

    Returns:
        A new DataFrame: ``result`` followed by one row per test example with
        columns ``'target'`` (label looked up in the module-level
        ``targets_list``) and ``'Index'``.
    """
    # Same device as before on GPU machines; fall back to the CPU instead of
    # crashing when CUDA is unavailable.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # DataParallel splits each batch across the visible GPUs and simply
    # forwards to the wrapped module when there are none.
    model = torch.nn.DataParallel(model).to(device)

    # Look the columns up once instead of on every iteration; column access
    # on a `datasets.Dataset` can materialise the whole column each time.
    all_input_ids = test_inputs['input_ids']
    all_attention_mask = test_inputs['attention_mask']
    all_indexes = test_data['Index'].tolist()

    predicted_targets = []
    for start in range(0, len(test_data), batch):
        stop = start + batch
        input_ids = all_input_ids[start:stop].to(device)
        attention_mask = all_attention_mask[start:stop].to(device)

        # Perform inference without tracking gradients
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Highest-scoring class per row, decoded to its label
        class_ids = torch.argmax(outputs.logits, dim=1).cpu().tolist()
        predicted_targets.extend(targets_list[class_id] for class_id in class_ids)

        if start % (batch * 4) == 0:
            print(f"{start} done.")

    # Build the frame once at the end; concatenating inside the loop re-copies
    # all previous rows on every batch.
    new_rows = pd.DataFrame({
        'target': predicted_targets,
        'Index': all_indexes,
    })
    return pd.concat([result, new_rows], ignore_index=True)


In [11]:
# Score the whole test set, 512 rows per forward pass
result_df = inference_fn(
    test_inputs=test_inputs,
    test_data=test_data,
    model=model,
    result=result,
    batch=512,
)

0 done.
2048 done.
4096 done.
6144 done.
8192 done.
10240 done.
12288 done.
14336 done.
16384 done.
18432 done.
20480 done.
22528 done.
24576 done.
26624 done.
28672 done.
30720 done.
32768 done.
34816 done.
36864 done.
38912 done.
40960 done.
43008 done.
45056 done.
47104 done.
49152 done.
51200 done.
53248 done.
55296 done.
57344 done.
59392 done.
61440 done.
63488 done.
65536 done.
67584 done.
69632 done.
71680 done.
73728 done.
75776 done.
77824 done.
79872 done.
81920 done.
83968 done.
86016 done.
88064 done.
90112 done.
92160 done.
94208 done.
96256 done.
98304 done.
100352 done.
102400 done.
104448 done.
106496 done.
108544 done.
110592 done.
112640 done.
114688 done.
116736 done.
118784 done.
120832 done.
122880 done.
124928 done.
126976 done.
129024 done.
131072 done.
133120 done.
135168 done.
137216 done.
139264 done.
141312 done.
143360 done.
145408 done.
147456 done.
149504 done.
151552 done.
153600 done.
155648 done.
157696 done.
159744 done.
161792 done.
163840 done.
1658

In [12]:
# Write the predictions to the working directory, without the row index
submission_path = "submission_final_fibe.csv"
result_df.to_csv(submission_path, index=False)