# Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd './drive/MyDrive/hasoc/task1'

/content/drive/MyDrive/hasoc/task1


# Installing Libraries

In [3]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install googletrans==4.0.0-rc1
# !pip install googletrans
!pip install accelerate -U
!pip install evaluate

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

# Imports and Set Seed

In [4]:
import random
import torch
import numpy as np

GLOBAL_SEED = 10

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.use_deterministic_algorithms(True)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


# Reading Original Data

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv('train_v2.csv', sep = '\t', index_col = 0)

In [None]:
df

In [None]:
df.iloc[0]['text']

# Translate Original Text To English

In [None]:
from googletrans import Translator

def translate_sinhala_to_english(text):
    translator = Translator()
    translated = translator.translate(text, src='si', dest='en')
    return translated.text



In [None]:
# def train_val_test_split(df, train_portion = 0.98, val_portion = 0.01, test_portion = 0.01):
# df_train, df_val, df_test, _ = np.split(df.sample(frac=1, random_state=42), [int(train_portion * len(df)), int((train_portion + val_portion) * len(df)), int((train_portion + val_portion + test_portion) * len(df))])

df1, df2, df3, df4 = np.split(df.sample(frac=1, random_state=42), [int(len(df)/4), int(len(df)/2), int(3/4 * len(df))])

In [None]:
df1.to_csv("Task1-1.csv")
df2.to_csv("Task1-2.csv")
df3.to_csv("Task1-3.csv")
df4.to_csv("Task1-4.csv")

In [None]:
import numpy as np
import pandas as pd

df1 = pd.read_csv('Task1-1.csv', index_col = 0)

In [None]:
df1

Unnamed: 0_level_0,text,label
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1341732273364869121,බයික් එකේ...ආපු දාට.. පන කඩාගෙන වහිනව... රෙද්ද...,HOF
882766002684952580,""" නාකි විසේ ගෙටත් උසේ..! "" සැමියාට නිදිපෙති පො...",HOF
1132325746016305153,@USER @USER ගොනා බල්ලා බූරුව එක්කො එපා මේ සත...,NOT
1237345250773008384,@USER @USER අයියා දන්නවා හොදටම මං හිතන්නේ ඔය ...,NOT
984986753973940226,වේස පකයලා උදේ පාන්දර රතිඤ්ඤා දානවා හතර වටේ නිද...,HOF
...,...,...
1178394144374325248,ගිය සතියේ පාටියකට ගියා දේශපාලන උනුසුම එතනත් සැ...,NOT
1326206605742911494,#MI දිනයි කියලා දැනන් හිටියට ගේමක් නැතුව ගහලා ...,HOF
601936668099223553,අමු සොහොනක නිදිවරණ මම වෙමි හොල්මන... නැති...,NOT
1199622565330853888,උන් පණදාගෙන අඳිනවා. . .මුන් ඊට උඩින් බැනර් පෝස...,HOF


In [None]:
from tqdm import tqdm
tqdm.pandas()
df1['english_translation'] = df1['text'].progress_apply(translate_sinhala_to_english)

100%|██████████| 1875/1875 [21:40<00:00,  1.44it/s]


In [None]:
df1.to_csv("df1.csv")

In [None]:
# df['english_translation'] = df['text'].apply(translate_sinhala_to_english)
# from tqdm import tqdm
# tqdm.pandas()
# df['english_translation'] = df['text'].progress_apply(translate_sinhala_to_english)

In [None]:
# df.to_csv('Task1A.csv')

# Reading Translated Data

In [5]:
# unique_labels = df['label'].unique()
# label2id = {label: id for id, label in enumerate(unique_labels)}
# id2label = {id: label for id, label in enumerate(unique_labels)}
# num_labels = len(unique_labels)
unique_labels = ['HOF', 'NOT']
label2id = {'HOF': 0, 'NOT': 1}
id2label = {0: 'HOF', 1: 'NOT'}
num_labels = 2

In [6]:
id2label

{0: 'HOF', 1: 'NOT'}

# Defining Model and Tokenize Dataset

In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_checkpoint = "keshan/SinhalaBERTo"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/721k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/334M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at keshan/SinhalaBERTo and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [None]:
# from datasets import load_dataset
# from transformers import DataCollatorForLanguageModeling
# from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# # dataset = load_dataset("your_dataset_name")  # Replace with your dataset name
# # dataset = ("your_dataset_name")
# from datasets import Dataset
# dataset = Dataset.from_pandas(df)
# # processed_dataset
# raw_datasets = dataset.train_test_split(test_size=0.1, seed=GLOBAL_SEED, shuffle=True)
# raw_datasets['validation'] = raw_datasets.pop('test')


# def tokenize_function(examples):
#     # return tokenizer(examples["text"], padding="max_length", truncation=True)
#     return tokenizer(examples["english_translation"], padding="max_length", truncation=True)

# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)#, remove_columns=['text', 'hate_label'])

Map:   0%|          | 0/6750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [10]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, DatasetDict

# Load the dataset using the datasets library
# dataset = load_dataset("your_dataset_name")  # Replace with your dataset name

# Convert the dataset to a pandas DataFrame
# df = dataset.to_pandas()


# Split the dataset into train and test sets while maintaining class balance
# X = df.drop('label', axis=1)  # Replace 'target_column' with your actual target column name
# y = df['label']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y, random_state=GLOBAL_SEED)

train_df = pd.read_csv('sinhala_train_dataset.csv', index_col = 4)
val_df = pd.read_csv('sinhala_val_dataset.csv', index_col = 4)

# Convert the training and testing sets back to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Tokenization
# tokenizer = AutoTokenizer.from_pretrained("your_pretrained_model")  # Replace with your tokenizer name
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_datasets = train_dataset.map(tokenize_function, batched=True)
tokenized_val_datasets = val_dataset.map(tokenize_function, batched=True)


tokenized_datasets = DatasetDict({"train": tokenized_train_datasets, "validation": tokenized_val_datasets})

Map:   0%|          | 0/6750 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [None]:
# # Convert the train and validation datasets to pandas DataFrames
# train_df = tokenized_datasets["train"].to_pandas()
# val_df = tokenized_datasets["validation"].to_pandas()

# # Save train and validation DataFrames to CSV files
# train_df.to_csv("sinhala_train_dataset.csv", index=False)
# val_df.to_csv("sinhala_val_dataset.csv", index=False)

In [11]:
tokenized_datasets
# tokenized_train_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 6750
    })
    validation: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 750
    })
})

In [12]:
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False  # Set to True if your task is Masked Language Modeling
# )
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train Model

In [13]:
training_args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    learning_rate=2e-5,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    # train_dataset=processed_dataset["train"],
    # eval_dataset=processed_dataset["validation"],
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator
)

# trainer.train(resume_from_checkpoint="last-checkpoint")
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.5785,0.446706


Epoch,Training Loss,Validation Loss
1,0.5785,0.446706
2,0.3628,0.443479
3,0.2703,0.721982
4,0.1964,0.885206
5,0.1146,1.013025


TrainOutput(global_step=4220, training_loss=0.2939986970187363, metrics={'train_runtime': 677.7868, 'train_samples_per_second': 49.794, 'train_steps_per_second': 6.226, 'total_flos': 1431850978539000.0, 'train_loss': 0.2939986970187363, 'epoch': 5.0})

In [15]:
predictions = trainer.predict(tokenized_datasets["validation"])
# print(predictions.predictions.shape, predictions.label_ids.shape)
preds = np.argmax(predictions.predictions, axis=-1)

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Evaluate on validation data
results = trainer.evaluate()

# Calculate metrics
validation_loss = results["eval_loss"]
accuracy = accuracy_score(tokenized_datasets["validation"]["label"], preds)
precision, recall, f1, _ = precision_recall_fscore_support(tokenized_datasets["validation"]["label"], preds, average="weighted")

# Print the metrics
print(f"Validation Loss: {validation_loss}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Validation Loss: 0.443478524684906
Accuracy: 0.796
Precision: 0.8002355634407371
Recall: 0.796
F1 Score: 0.7969274882823268


In [17]:
test_df = pd.read_csv('sinhala_test_translated.csv', index_col = 0)


from datasets import load_dataset
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, DatasetDict

# Convert the training and testing sets back to datasets
test_dataset = Dataset.from_pandas(test_df)


tokenized_test_datasets = test_dataset.map(tokenize_function, batched=True)


tokenized_datasets["test"] = tokenized_test_datasets

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [18]:
# Save model predictions on the test dataset
test_predictions = trainer.predict(tokenized_datasets["test"])
test_preds = np.argmax(test_predictions.predictions, axis=-1)

# Save the predictions to a file (e.g., CSV)
# test_predictions_df = pd.DataFrame({"predictions": test_preds})
test_predictions_df = test_df.copy()
test_predictions_df['predictions'] = test_preds
test_predictions_df.to_csv("test_predictions_sinhalaRoberta.csv")

In [None]:
# tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 6750
    })
    validation: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 750
    })
    test: Dataset({
        features: ['text', 'english_translation', 'post_id', 'input_ids', 'attention_mask'],
        num_rows: 2500
    })
})

In [None]:
# raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id'],
        num_rows: 6750
    })
    validation: Dataset({
        features: ['text', 'hate_label', 'english_translation', 'label', 'post_id'],
        num_rows: 750
    })
})

In [None]:
# # Convert the train and validation datasets to pandas DataFrames
# train_df = raw_datasets["train"].to_pandas()
# val_df = raw_datasets["validation"].to_pandas()

# # Save train and validation DataFrames to CSV files
# train_df.to_csv("sinhala_train_dataset.csv", index=False)
# val_df.to_csv("sinhala_val_dataset.csv", index=False)

In [None]:
# # Convert the train and validation datasets to pandas DataFrames
# train_df = raw_datasets["train"].to_pandas()
# val_df = raw_datasets["validation"].to_pandas()

# # Save train and validation DataFrames to CSV files
# train_df.to_csv("sinhala_train_dataset.csv", index=False)
# val_df.to_csv("sinhala_val_dataset.csv", index=False)