# Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd './drive/MyDrive/hasoc/task1'

/content/drive/MyDrive/hasoc/task1


# Installing Libraries

In [3]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install googletrans==4.0.0-rc1
# !pip install googletrans
!pip install accelerate -U
!pip install evaluate

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

# Imports and Set Seed

In [4]:
import random
import torch
import numpy as np

GLOBAL_SEED = 10

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.use_deterministic_algorithms(True)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


# Reading Original Data

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv('train_v2.csv', sep = '\t', index_col = 0)

In [None]:
df

In [None]:
df.iloc[0]['text']

# Translate Original Text To English

In [None]:
from googletrans import Translator

def translate_sinhala_to_english(text):
    translator = Translator()
    translated = translator.translate(text, src='si', dest='en')
    return translated.text



In [None]:
# def train_val_test_split(df, train_portion = 0.98, val_portion = 0.01, test_portion = 0.01):
# df_train, df_val, df_test, _ = np.split(df.sample(frac=1, random_state=42), [int(train_portion * len(df)), int((train_portion + val_portion) * len(df)), int((train_portion + val_portion + test_portion) * len(df))])

df1, df2, df3, df4 = np.split(df.sample(frac=1, random_state=42), [int(len(df)/4), int(len(df)/2), int(3/4 * len(df))])

In [None]:
df1.to_csv("Task1-1.csv")
df2.to_csv("Task1-2.csv")
df3.to_csv("Task1-3.csv")
df4.to_csv("Task1-4.csv")

In [None]:
import numpy as np
import pandas as pd

df1 = pd.read_csv('Task1-1.csv', index_col = 0)

In [None]:
df1

Unnamed: 0_level_0,text,label
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1341732273364869121,බයික් එකේ...ආපු දාට.. පන කඩාගෙන වහිනව... රෙද්ද...,HOF
882766002684952580,""" නාකි විසේ ගෙටත් උසේ..! "" සැමියාට නිදිපෙති පො...",HOF
1132325746016305153,@USER @USER ගොනා බල්ලා බූරුව එක්කො එපා මේ සත...,NOT
1237345250773008384,@USER @USER අයියා දන්නවා හොදටම මං හිතන්නේ ඔය ...,NOT
984986753973940226,වේස පකයලා උදේ පාන්දර රතිඤ්ඤා දානවා හතර වටේ නිද...,HOF
...,...,...
1178394144374325248,ගිය සතියේ පාටියකට ගියා දේශපාලන උනුසුම එතනත් සැ...,NOT
1326206605742911494,#MI දිනයි කියලා දැනන් හිටියට ගේමක් නැතුව ගහලා ...,HOF
601936668099223553,අමු සොහොනක නිදිවරණ මම වෙමි හොල්මන... නැති...,NOT
1199622565330853888,උන් පණදාගෙන අඳිනවා. . .මුන් ඊට උඩින් බැනර් පෝස...,HOF


In [None]:
from tqdm import tqdm
tqdm.pandas()
df1['english_translation'] = df1['text'].progress_apply(translate_sinhala_to_english)

100%|██████████| 1875/1875 [21:40<00:00,  1.44it/s]


In [None]:
df1.to_csv("df1.csv")

In [None]:
# df['english_translation'] = df['text'].apply(translate_sinhala_to_english)
# from tqdm import tqdm
# tqdm.pandas()
# df['english_translation'] = df['text'].progress_apply(translate_sinhala_to_english)

In [None]:
# df.to_csv('Task1A.csv')

# Reading Translated Data

In [5]:
import pandas as pd
csv_files = ['df1.csv', 'df2.csv', 'df3.csv', 'df4.csv']
tmp_df = []
for csv_file in csv_files:
    tmp_df.append(pd.read_csv(csv_file, index_col = 0))
df = pd.concat(tmp_df)

In [6]:
df

Unnamed: 0_level_0,text,label,english_translation
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1341732273364869121,බයික් එකේ...ආපු දාට.. පන කඩාගෙන වහිනව... රෙද්ද...,HOF,On the bike ... when you came and break the br...
882766002684952580,""" නාකි විසේ ගෙටත් උසේ..! "" සැමියාට නිදිපෙති පො...",HOF,"""Nakaye's home!Medapararadiga URL"
1132325746016305153,@USER @USER ගොනා බල්ලා බූරුව එක්කො එපා මේ සත...,NOT,@Ser @user bull dog donkey don't have the donk...
1237345250773008384,@USER @USER අයියා දන්නවා හොදටම මං හිතන්නේ ඔය ...,NOT,@Ser @user brother knows I think you think of ...
984986753973940226,වේස පකයලා උදේ පාන්දර රතිඤ්ඤා දානවා හතර වටේ නිද...,HOF,"The next morning, firecrackers will not sleep ..."
...,...,...,...
678135651368243200,රනිල්ගේ හුජ්ජ කොල්ලාට වෘත්තීය සමිති නායකයෝ පයි...,HOF,Ven.
1213423038663806976,ලිංගික අවයව හිතුනු වෙලාවට අප්ග්‍රේඩ් කරන්න මාර...,HOF,If you could be able to get to the change at t...
1126797643320053760,@USER මහ ජනතාවට තොරතුරු හන්ගලා මොනවද බලාපොරොත්...,NOT,What do you want to information info?What a go...
986305964834865152,ඔය best life කියල තියන මගුල් කරන්න ගිහින් life...,NOT,Don't be stupid to wolf the best Life.


In [7]:
unique_labels = df['label'].unique()
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for id, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

In [8]:
id2label

{0: 'HOF', 1: 'NOT'}

In [9]:
df.rename(columns = {'label': 'hate_label'}, inplace = True)
df['label'] = df['hate_label'].apply(lambda x: label2id[x])

In [10]:
df

Unnamed: 0_level_0,text,hate_label,english_translation,label
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1341732273364869121,බයික් එකේ...ආපු දාට.. පන කඩාගෙන වහිනව... රෙද්ද...,HOF,On the bike ... when you came and break the br...,0
882766002684952580,""" නාකි විසේ ගෙටත් උසේ..! "" සැමියාට නිදිපෙති පො...",HOF,"""Nakaye's home!Medapararadiga URL",0
1132325746016305153,@USER @USER ගොනා බල්ලා බූරුව එක්කො එපා මේ සත...,NOT,@Ser @user bull dog donkey don't have the donk...,1
1237345250773008384,@USER @USER අයියා දන්නවා හොදටම මං හිතන්නේ ඔය ...,NOT,@Ser @user brother knows I think you think of ...,1
984986753973940226,වේස පකයලා උදේ පාන්දර රතිඤ්ඤා දානවා හතර වටේ නිද...,HOF,"The next morning, firecrackers will not sleep ...",0
...,...,...,...,...
678135651368243200,රනිල්ගේ හුජ්ජ කොල්ලාට වෘත්තීය සමිති නායකයෝ පයි...,HOF,Ven.,0
1213423038663806976,ලිංගික අවයව හිතුනු වෙලාවට අප්ග්‍රේඩ් කරන්න මාර...,HOF,If you could be able to get to the change at t...,0
1126797643320053760,@USER මහ ජනතාවට තොරතුරු හන්ගලා මොනවද බලාපොරොත්...,NOT,What do you want to information info?What a go...,1
986305964834865152,ඔය best life කියල තියන මගුල් කරන්න ගිහින් life...,NOT,Don't be stupid to wolf the best Life.,1


# Defining Model and Tokenize Dataset

In [11]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_checkpoint = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")
# model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels, label2id=label2id, id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
model

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [27]:
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# dataset = load_dataset("your_dataset_name")  # Replace with your dataset name
# dataset = ("your_dataset_name")
from datasets import Dataset
dataset = Dataset.from_pandas(df)
# processed_dataset
raw_datasets = dataset.train_test_split(test_size=0.1, seed=GLOBAL_SEED, shuffle=True)
raw_datasets['validation'] = raw_datasets.pop('test')


def tokenize_function(examples):
    # return tokenizer(examples["text"], padding="max_length", truncation=True)
    return tokenizer(examples["english_translation"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)#, remove_columns=['text', 'hate_label'])

Map:   0%|          | 0/6750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [14]:
# from datasets import load_dataset
# from transformers import AutoTokenizer
# from sklearn.model_selection import train_test_split
# from transformers import DataCollatorForLanguageModeling
# from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
# from datasets import Dataset, DatasetDict

# # Load the dataset using the datasets library
# # dataset = load_dataset("your_dataset_name")  # Replace with your dataset name

# # Convert the dataset to a pandas DataFrame
# # df = dataset.to_pandas()


# # Split the dataset into train and test sets while maintaining class balance
# X = df.drop('label', axis=1)  # Replace 'target_column' with your actual target column name
# y = df['label']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y, random_state=GLOBAL_SEED)

# # Convert the training and testing sets back to datasets
# train_dataset = Dataset.from_pandas(pd.concat([X_train, y_train], axis=1))
# val_dataset = Dataset.from_pandas(pd.concat([X_val, y_val], axis=1))

# # Tokenization
# # tokenizer = AutoTokenizer.from_pretrained("your_pretrained_model")  # Replace with your tokenizer name
# def tokenize_function(examples):
#     return tokenizer(examples["english_translation"], padding="max_length", truncation=True)

# tokenized_train_datasets = train_dataset.map(tokenize_function, batched=True)
# tokenized_val_datasets = val_dataset.map(tokenize_function, batched=True)


# tokenized_datasets = DatasetDict({"train": tokenized_train_datasets, "validation": tokenized_val_datasets})

Map:   0%|          | 0/6750 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [36]:
# Convert the train and validation datasets to pandas DataFrames
train_df = tokenized_datasets["train"].to_pandas()
val_df = tokenized_datasets["validation"].to_pandas()

# Save train and validation DataFrames to CSV files
train_df.to_csv("sinhala_train_dataset.csv", index=False)
val_df.to_csv("sinhala_val_dataset.csv", index=False)

In [28]:
tokenized_datasets
# tokenized_train_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 6750
    })
    validation: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 750
    })
})

In [29]:
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False  # Set to True if your task is Masked Language Modeling
# )
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train Model

In [30]:
training_args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    learning_rate=2e-5,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    # train_dataset=processed_dataset["train"],
    # eval_dataset=processed_dataset["validation"],
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator
)

# trainer.train(resume_from_checkpoint="last-checkpoint")
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.588,0.548405
2,0.4313,0.570433
3,0.3158,1.090361
4,0.197,1.668435
5,0.1291,1.740947


TrainOutput(global_step=4220, training_loss=0.33007479256363276, metrics={'train_runtime': 1189.5193, 'train_samples_per_second': 28.373, 'train_steps_per_second': 3.548, 'total_flos': 1018302478674360.0, 'train_loss': 0.33007479256363276, 'epoch': 5.0})

In [32]:
predictions = trainer.predict(tokenized_datasets["validation"])
# print(predictions.predictions.shape, predictions.label_ids.shape)
preds = np.argmax(predictions.predictions, axis=-1)

In [33]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Evaluate on validation data
results = trainer.evaluate()

# Calculate metrics
validation_loss = results["eval_loss"]
accuracy = accuracy_score(tokenized_datasets["validation"]["label"], preds)
precision, recall, f1, _ = precision_recall_fscore_support(tokenized_datasets["validation"]["label"], preds, average="weighted")

# Print the metrics
print(f"Validation Loss: {validation_loss}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Validation Loss: 0.5484048128128052
Accuracy: 0.7186666666666667
Precision: 0.7164923641173641
Recall: 0.7186666666666667
F1 Score: 0.7164818812810687


In [37]:
test_df = pd.read_csv('sinhala_test_translated.csv', index_col = 0)


from datasets import load_dataset
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, DatasetDict

# Convert the training and testing sets back to datasets
test_dataset = Dataset.from_pandas(test_df)


tokenized_test_datasets = test_dataset.map(tokenize_function, batched=True)


tokenized_datasets["test"] = tokenized_test_datasets

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [49]:
# Save model predictions on the test dataset
test_predictions = trainer.predict(tokenized_datasets["test"])
test_preds = np.argmax(test_predictions.predictions, axis=-1)

# Save the predictions to a file (e.g., CSV)
# test_predictions_df = pd.DataFrame({"predictions": test_preds})
test_predictions_df = test_df.copy()
test_predictions_df['predictions'] = test_preds
test_predictions_df.to_csv("test_predictions_xlmt.csv")

In [40]:
# tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 6750
    })
    validation: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 750
    })
    test: Dataset({
        features: ['text', 'english_translation', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 2500
    })
})

In [39]:
# raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id'],
        num_rows: 6750
    })
    validation: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id'],
        num_rows: 750
    })
})

In [41]:
# # Convert the train and validation datasets to pandas DataFrames
# train_df = raw_datasets["train"].to_pandas()
# val_df = raw_datasets["validation"].to_pandas()

# # Save train and validation DataFrames to CSV files
# train_df.to_csv("sinhala_train_dataset.csv", index=False)
# val_df.to_csv("sinhala_val_dataset.csv", index=False)

In [43]:
# # Convert the train and validation datasets to pandas DataFrames
# train_df = raw_datasets["train"].to_pandas()
# val_df = raw_datasets["validation"].to_pandas()

# # Save train and validation DataFrames to CSV files
# train_df.to_csv("sinhala_train_dataset.csv", index=False)
# val_df.to_csv("sinhala_val_dataset.csv", index=False)