<div class="alert alert-block alert-success">

# **1.** **Setup**

<div>

In [1]:
#!pip install -r requirements.txt

In [2]:
# Import necessary libraries
from utils import *

In [3]:
# Set random seeds for reproducibility
tf.random.set_seed(221)
random.seed(221)
np.random.seed(221)
tf.random.set_seed(221)

In [4]:
# Import data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

<div class="alert alert-block alert-success">

# **2.** **Retrain Best Model**
Twitter RoBERTa Transformer Encoder

<div>

## 2.1 Pre processing

In [5]:
# Extract text and labels from the DataFrame
x_train = train_df['text']
x_test = test_df['text']
y_train = train_df['label']

In [8]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [9]:
# Clean the text data
x_train_cleaned = clean_text(x_train, lemmatize = True, stem = False)
x_test_cleaned = clean_text(x_test, lemmatize = True, stem = False)

100%|██████████| 9543/9543 [00:06<00:00, 1476.75it/s]
100%|██████████| 2388/2388 [00:00<00:00, 3983.81it/s]


In [10]:
# Remove empty tweets from x_train_cleaned
non_empty_indices = [i for i, tweet in enumerate(x_train_cleaned) if tweet.strip() != '']

# Filter x_train, x_train_cleaned and y_train based on these indices
x_train = x_train.iloc[non_empty_indices].reset_index(drop=True)
x_train_cleaned = [x_train_cleaned[i] for i in non_empty_indices]
y_train = y_train.iloc[non_empty_indices].reset_index(drop=True)

In [11]:
# Convert the list to a DataFrame
x_train_cleaned_df = pd.DataFrame(x_train_cleaned, columns=['text'], index=x_train.index)
x_train_cleaned_df = pd.DataFrame(x_test_cleaned, columns=['text'], index=x_test.index)

In [12]:
# Create a DataFrame with the cleaned text and labels
train_cleaned_df = pd.DataFrame({
    'text': x_train_cleaned,
    'label': y_train
}, index=x_train.index)

train_cleaned_df = pd.DataFrame({
    'text': x_train_cleaned,
    'label': y_train
}, index=x_train.index)

In [13]:
# Convert DataFrames to list
train_texts = train_cleaned_df['text'].tolist()
test_texts = x_test_cleaned

# Convert Series to list
train_labels = train_cleaned_df['label'].tolist()

## 2.2 Model

In [14]:
# Tokenizer & Model
checkpoint = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# droupout
config = AutoConfig.from_pretrained(checkpoint, num_labels=3, hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.3)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config, ignore_mismatched_sizes=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Tokenization function
def tokenize(example):
    return tokenizer(example["text"],truncation=True)

In [16]:
# Oversample
train_texts_over, train_labels_over = oversample_data(train_texts, train_labels)

In [17]:
train_ds = Dataset.from_dict({"text": train_texts_over, "label": train_labels_over}).map(tokenize, batched=True)
test_ds = Dataset.from_dict({"text": test_texts}).map(tokenize, batched=True)

dataset = DatasetDict({"train": train_ds, "test": test_ds})

Map:   0%|          | 0/18522 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [18]:
# Weighted loss
class_weights = torch.tensor(
    np.bincount(train_labels, minlength=3) / len(train_labels),
    dtype=torch.float
)
class_weights = 1.0 / class_weights
model.classifier.loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)

In [19]:
training_args = TrainingArguments(
    output_dir="./results/roberta_final",
    learning_rate=1.78e-05,
    num_train_epochs=6,
    weight_decay=0.01,
    warmup_ratio=0.14,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="no",
    save_strategy="no",
    load_best_model_at_end=False,
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer)
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
100,1.1023
200,1.1004
300,1.0944
400,1.0778
500,0.896
600,0.7735
700,0.7675
800,0.7147
900,0.6832
1000,0.6771


TrainOutput(global_step=6948, training_loss=0.45579510525718253, metrics={'train_runtime': 880.3512, 'train_samples_per_second': 126.236, 'train_steps_per_second': 7.892, 'total_flos': 1682393443358292.0, 'train_loss': 0.45579510525718253, 'epoch': 6.0})

In [32]:
train_ds = Dataset.from_dict({"text": train_texts, "label": train_labels}).map(tokenize, batched=True)
test_ds = Dataset.from_dict({"text": test_texts}).map(tokenize, batched=True)

dataset = DatasetDict({"train": train_ds, "test": test_ds})

Map:   0%|          | 0/9539 [00:00<?, ? examples/s]

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [33]:
test_preds = trainer.predict(dataset["test"])

test_pred = np.argmax(test_preds.predictions, axis=1)

<div class="alert alert-block alert-success">

# **3.** **Final Predictions**

<div>

In [34]:
# Create submission DataFrame
submission = pd.DataFrame({
    "id": test_df['id'],
    "label": test_pred
})

# Save submission to CSV
submission.to_csv("pred_25.csv", index=False)
print("Submission file saved as pred_25.csv")

Submission file saved as pred_25.csv


In [35]:
# Load the predictions from the saved CSV file
pred_25 = pd.read_csv("pred_25.csv")
pred_25.head()

Unnamed: 0,id,label
0,0,1
1,1,2
2,2,2
3,3,1
4,4,2


In [36]:
# Count the frequency of each label
label_counts = pd.Series(test_pred).value_counts(normalize=True) * 100
label_counts

Unnamed: 0,proportion
2,53.140704
1,25.041876
0,21.81742
