In [1]:
import numpy as np
import pandas as pd
import json
import warnings

warnings.filterwarnings("ignore")
train_set = pd.read_csv("datasets/train.csv", encoding = 'latin-1')
targets_list = (train_set["target"].unique()).tolist()
train_set['label'] = train_set['target'].apply(lambda t: targets_list.index(t) if t in targets_list else -1)

In [2]:
print(targets_list)

['academic interests', 'arts and culture', 'automotives', 'books and literature', 'business and finance', 'careers', 'family and relationships', 'food and drinks', 'health', 'healthy living', 'hobbies and interests', 'home and garden', 'movies', 'music and audio', 'news and politics', 'personal finance', 'pets', 'pharmaceuticals, conditions, and symptoms', 'real estate', 'shopping', 'sports', 'style and fashion', 'technology and computing', 'television', 'travel', 'video gaming']


In [3]:
print(sorted(targets_list))

['academic interests', 'arts and culture', 'automotives', 'books and literature', 'business and finance', 'careers', 'family and relationships', 'food and drinks', 'health', 'healthy living', 'hobbies and interests', 'home and garden', 'movies', 'music and audio', 'news and politics', 'personal finance', 'pets', 'pharmaceuticals, conditions, and symptoms', 'real estate', 'shopping', 'sports', 'style and fashion', 'technology and computing', 'television', 'travel', 'video gaming']


In [4]:
len(targets_list)

26

In [5]:
!pip install nltk







In [6]:
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [7]:
import random
import nltk
from nltk.corpus import wordnet as wn
import re
# Make sure to download the required NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')

def get_synonym(word):
    synonyms = wn.synsets(word)
    if synonyms:
        words = set(chain.from_iterable([syn.lemma_names() for syn in synonyms]))
        words.discard(word)  # Avoid returning the same word
        if words:
            return random.choice(list(words))
    return word

def augment_text(text):
    words = text.split()
    augmented_text = []
    for word in words:
        if random.random() < 0.3:  # 30% chance of replacing a word
            augmented_text.append(get_synonym(word))
        else:
            augmented_text.append(word)
    return ' '.join(augmented_text)

def augment_dataframe(df, fraction):
    to_augment = df.sample(frac=fraction).index
    df['text'] = df['text'].apply(lambda x: augment_text(x) if x in to_augment else x)
    return df


class TextCleaner():
    def __init__(self):
        pass
    
    def clean_text(self, text):
        text = (str(text)).lower()
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

cleaner = TextCleaner()

[nltk_data] Downloading package wordnet to

[nltk_data]     /teamspace/studios/this_studio/nltk_data...

[nltk_data]   Package wordnet is already up-to-date!

[nltk_data] Downloading package omw-1.4 to

[nltk_data]     /teamspace/studios/this_studio/nltk_data...

[nltk_data]   Package omw-1.4 is already up-to-date!


# Preparing Data

In [8]:
train_set['text'] = train_set['text'].apply(cleaner.clean_text)

In [9]:
df_shuffled = train_set.sample(frac=1, random_state=42).reset_index(drop=True)

selected_samples = pd.DataFrame()

# Get unique classes
unique_classes = df_shuffled['target'].unique()

# Step 3: Ensure each class has at least 15,000 samples
for class_name in unique_classes:
    class_samples = df_shuffled[df_shuffled['target'] == class_name]
    
    # Check if there are enough samples for the class
    if len(class_samples) >= 16000:
        selected_samples = pd.concat([selected_samples, class_samples.sample(n=16000, random_state=42)])

# Step 4: Calculate remaining samples needed
remaining_samples_needed = 675000 - len(selected_samples)

# Step 5: Select the remaining samples randomly from the rest of the dataframe
remaining_df = df_shuffled[~df_shuffled.index.isin(selected_samples.index)]
additional_samples = remaining_df.sample(n=remaining_samples_needed, random_state=42)

# Step 6: Combine selected samples and additional samples
train_df = pd.concat([selected_samples, additional_samples]).sample(frac=1, random_state=42)

# Step 7: Split the remaining data into validation and evaluation datasets
remaining_df = remaining_df[~remaining_df.index.isin(train_df .index)]
val_df = remaining_df.sample(frac=0.95, random_state=42)
eval_df = remaining_df.drop(val_df.index)

# Optional: Reset indices if needed
train_df .reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
eval_df.reset_index(drop=True, inplace=True)

len(train_df)

675000

In [10]:
len(train_df), len(val_df), len(eval_df)

(675000, 21401, 1126)

In [11]:
train_df = augment_dataframe(train_df, fraction = 0.30)

In [12]:
train_df = train_df.sample(frac=1, random_state=43).reset_index(drop=True)

In [13]:
df = pd.DataFrame(train_df.groupby(["target"]).count())
df

Unnamed: 0_level_0,text,Word Count,label
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
academic interests,55208,55208,55208
arts and culture,25516,25516,25516
automotives,27074,27074,27074
books and literature,38493,38493,38493
business and finance,27088,27088,27088
careers,29322,29322,29322
family and relationships,27928,27928,27928
food and drinks,22848,22848,22848
health,18706,18706,18706
healthy living,29744,29744,29744


In [14]:
train_data = Dataset.from_pandas(train_df[['text', 'label']])
val_data = Dataset.from_pandas(val_df[['text', 'label']])
train_data, val_data

(Dataset({
     features: ['text', 'label'],
     num_rows: 675000
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 21401
 }))

## Training

In [15]:
#<your wandb key
!pip install huggingface_hub
from huggingface_hub import login

api_token = '<API_Token'
login(api_token)













The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.

Token is valid (permission: fineGrained).

Your token has been saved to /teamspace/studios/this_studio/.cache/huggingface/token

Login successful


In [16]:
from transformers import AutoTokenizer, RobertaForSequenceClassification

model_name = "pilotj/roberta-base-pretrained-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

In [18]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=512)

encoded_train_data = train_data.map(preprocess_function, batched=True, batch_size=256)
encoded_val_data = val_data.map(preprocess_function, batched=True, batch_size=256)
encoded_train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
encoded_val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/675000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21401 [00:00<?, ? examples/s]

In [19]:
len(encoded_train_data), len(encoded_val_data)

(675000, 21401)

In [29]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def compute_metrics(p):
    # Get predictions and true labels
    preds = np.argmax(p.predictions, axis=1)  # p.predictions are logits, take argmax to get class predictions
    labels = p.label_ids  # True labels

    f1_macro = f1_score(labels, preds, average='macro')  # or 'macro' / 'micro'
    f1_w = f1_score(labels, preds, average='weighted')
    # Calculate other optional metrics if needed
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')

    # Return metrics as a dictionary
    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_w': f1_w,
        'precision': precision,
        'recall': recall
    }


In [33]:
# import torch
# torch.cuda.empty_cache()

In [35]:
training_args = TrainingArguments(
    output_dir='results',
    overwrite_output_dir=True,
    learning_rate=2e-5,
    save_total_limit=3,
    push_to_hub=True,
    hub_model_id="pilotj/roberta-base-v1",
    hub_strategy="checkpoint",
    save_steps=1000,
    eval_steps=500,
    save_strategy="steps",
    eval_strategy="steps",
    load_best_model_at_end=True,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16 = True
)

In [36]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_data,
    eval_dataset=encoded_val_data,
    compute_metrics = compute_metrics
)

In [37]:
warnings.filterwarnings("ignore")
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 W,Precision,Recall
500,0.3932,0.413825,0.880286,0.850493,0.881589,0.884721,0.880286
1000,0.3997,0.409719,0.88094,0.849948,0.882442,0.886069,0.88094
1500,0.3997,0.412584,0.881781,0.851393,0.883361,0.887383,0.881781
2000,0.3907,0.39875,0.884351,0.85442,0.885605,0.888745,0.884351
2500,0.3881,0.395556,0.886174,0.854916,0.88714,0.890059,0.886174
3000,0.3558,0.397092,0.886267,0.857005,0.887421,0.890191,0.886267
3500,0.3526,0.399867,0.885239,0.855845,0.886699,0.890165,0.885239
4000,0.3435,0.399148,0.885753,0.856547,0.887043,0.89026,0.885753
4500,0.3428,0.39292,0.88594,0.857234,0.887125,0.890092,0.88594
5000,0.3392,0.392029,0.886734,0.857584,0.887973,0.890896,0.886734


TrainOutput(global_step=5274, training_loss=0.36853346320142516, metrics={'train_runtime': 5892.0295, 'train_samples_per_second': 229.123, 'train_steps_per_second': 0.895, 'total_flos': 3.552764654592e+17, 'train_loss': 0.36853346320142516, 'epoch': 2.0})

In [38]:
model.push_to_hub("pilotj/roberta-base-v1")
tokenizer.push_to_hub("pilotj/roberta-base-v1")
trainer.push_to_hub("pilotj/roberta-base-v1")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

events.out.tfevents.1727893591.ip-10-192-11-81.1134.3:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pilotj/roberta-base-v1/commit/dcf1cdea07d3935847d86eef5efb2fb5a9431532', commit_message='pilotj/roberta-base-v1', commit_description='', oid='dcf1cdea07d3935847d86eef5efb2fb5a9431532', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pilotj/roberta-base-v1', endpoint='https://huggingface.co', repo_type='model', repo_id='pilotj/roberta-base-v1'), pr_revision=None, pr_num=None)

In [39]:
# Evaluate the model
eval_data = Dataset.from_pandas(eval_df[['text', 'label']])
encoded_eval_data = eval_data.map(preprocess_function, batched=True, batch_size=256)
encoded_eval_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
trainer.evaluate(encoded_eval_data)

Map:   0%|          | 0/1126 [00:00<?, ? examples/s]

{'eval_loss': 0.3762845993041992,
 'eval_accuracy': 0.8863232682060391,
 'eval_f1_macro': 0.8644397972414607,
 'eval_f1_w': 0.8869261992122593,
 'eval_precision': 0.8908920626537057,
 'eval_recall': 0.8863232682060391,
 'eval_runtime': 1.3749,
 'eval_samples_per_second': 818.941,
 'eval_steps_per_second': 13.091,
 'epoch': 2.0}