<div class="alert alert-block alert-success">

# **1.** **Setup**

<div>

In [1]:
# Import necessary libraries
from utils import *

In [2]:
# Set random seeds for reproducibility
tf.random.set_seed(221)
random.seed(221)
np.random.seed(221)
tf.random.set_seed(221)

In [3]:
# Import data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

<div class="alert alert-block alert-success">

# **2.** **Retrain Best Model**
Twitter RoBERTa Transformer Encoder

<div>

## 2.1 Pre processing

In [4]:
# Extract text and labels from the DataFrame
x_train = train_df['text']
x_test = test_df['text']
y_train = train_df['label']

In [5]:
# Clean the text data
x_train_cleaned = clean_text(x_train, lemmatize = True, stem = False)
x_test_cleaned = clean_text(x_test, lemmatize = True, stem = False)

100%|██████████| 9543/9543 [00:02<00:00, 4461.60it/s]
100%|██████████| 2388/2388 [00:00<00:00, 8852.42it/s] 


In [6]:
# Remove empty tweets from x_train_cleaned
non_empty_indices = [i for i, tweet in enumerate(x_train_cleaned) if tweet.strip() != '']

# Filter x_train, x_train_cleaned and y_train based on these indices 
x_train = x_train.iloc[non_empty_indices].reset_index(drop=True)
x_train_cleaned = [x_train_cleaned[i] for i in non_empty_indices]
y_train = y_train.iloc[non_empty_indices].reset_index(drop=True)

In [7]:
# Convert the list to a DataFrame
x_train_cleaned_df = pd.DataFrame(x_train_cleaned, columns=['text'], index=x_train.index)
x_train_cleaned_df = pd.DataFrame(x_test_cleaned, columns=['text'], index=x_test.index)

In [8]:
# Create a DataFrame with the cleaned text and labels
train_cleaned_df = pd.DataFrame({
    'text': x_train_cleaned,
    'label': y_train
}, index=x_train.index)

train_cleaned_df = pd.DataFrame({
    'text': x_train_cleaned,
    'label': y_train
}, index=x_train.index)

In [9]:
# Convert DataFrames to list
train_texts = train_cleaned_df['text'].tolist()
test_texts = x_test_cleaned

# Convert Series to list
train_labels = train_cleaned_df['label'].tolist()

## 2.2 Model

In [10]:
# Tokenizer & Model
checkpoint = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# droupout
config = AutoConfig.from_pretrained(checkpoint, num_labels=3, hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.3)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config, ignore_mismatched_sizes=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Tokenization function
def tokenize(example):
    return tokenizer(example["text"],truncation=True)

In [12]:
# Oversample
train_texts_over, train_labels_over = oversample_data(train_texts, train_labels)

In [13]:
train_ds = Dataset.from_dict({"text": train_texts_over, "label": train_labels_over}).map(tokenize, batched=True)
test_ds = Dataset.from_dict({"text": test_texts}).map(tokenize, batched=True)

dataset = DatasetDict({"train": train_ds, "test": test_ds})

Map:   0%|          | 0/18522 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [14]:
# Weighted loss
class_weights = torch.tensor(
    np.bincount(train_labels, minlength=3) / len(train_labels),
    dtype=torch.float
)
class_weights = 1.0 / class_weights
model.classifier.loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)

In [None]:
training_args = TrainingArguments(
    output_dir="./results/roberta_final",
    learning_rate=1.78e-05,
    num_train_epochs=6,
    weight_decay=0.01,
    warmup_ratio=0.14,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="no",         
    save_strategy="no",          
    load_best_model_at_end=False,  
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer)
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
100,1.1068
200,1.1122
300,1.1037
400,1.0928
500,1.0085
600,0.836
700,0.7694
800,0.7427
900,0.6833
1000,0.6995


In [None]:
train_preds = trainer.predict(dataset["train"])

train_pred_labels = np.argmax(train_preds.predictions, axis=1)

In [None]:
metrics_df = get_metrics_df("metrics", train_labels, train_pred_labels,  train_labels, train_pred_labels)
display(metrics_df)

plot_metrics(train_labels, train_pred_labels,  train_labels, train_pred_labels, title="metrics")

In [None]:
test_preds = trainer.predict(dataset["test"])

test_pred = np.argmax(test_preds.predictions, axis=1)

<div class="alert alert-block alert-success">

# **3.** **Final Predictions**

<div>

In [None]:
# Create submission DataFrame
submission = pd.DataFrame({
    "id": test_df['id'],
    "label": test_pred
})

# Save submission to CSV
submission.to_csv("pred_25.csv", index=False)
print("Submission file saved as pred_25.csv")

In [None]:
# Load the predictions from the saved CSV file
pred_25 = pd.read_csv("pred_25.csv")
pred_25.head()

In [None]:
# Count the frequency of each label
label_counts = pd.Series(test_pred).value_counts(normalize=True)

# Plot a simple stacked bar chart (just one bar stacked by label proportions)
plt.figure(figsize=(6, 4))
plt.bar(['Predictions'], [1], color='white')  # invisible base bar

bottom = 0
for label, proportion in label_counts.items():
    plt.bar(['Predictions'], [proportion], bottom=bottom, label=label)
    bottom += proportion

plt.ylabel('Proportion')
plt.title('Proportion of Predicted Labels (RoBERTa)')
plt.legend(title='Labels')
plt.ylim(0, 1)
plt.show()