In [None]:
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
import os
import pickle

# Hugging Face checkpoint name; it selects both the tokenizer created here and the
# masked-LM model loaded later in the notebook.
model_name = 'bert-base-multilingual-cased'
# Alternative checkpoint (currently disabled):
# model_name = 'xlm-roberta-base'

# Tokenizer for the selected checkpoint; tokenize_function reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load your own corpus
def load_custom_corpus(file_path):
    """Read a corpus CSV and return its unlabelled, non-empty sentences.

    Only rows whose 'data_type' cell is empty are kept, i.e. rows that do not
    belong to the train/dev/test splits of IFD-EN-5203.

    Args:
        file_path: Path to a CSV file containing 'text' and 'data_type' columns.

    Returns:
        A dict of the form {'text': [sentence, ...]}, suitable for
        Dataset.from_dict.

    Raises:
        KeyError: If the CSV lacks a 'data_type' or 'text' column.
    """
    import pandas as pd

    # Missing cells become '' so that "no data_type" can be matched with == ''.
    frame = pd.read_csv(file_path).fillna('')
    unlabelled = frame[frame['data_type'] == '']

    # Coerce to str so a non-string cell cannot reach the tokenizer, and drop
    # blank texts: they would otherwise become sentences with no real tokens.
    texts = unlabelled['text'].astype(str)
    texts = texts[texts.str.strip() != '']
    return {'text': texts.tolist()}

def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Every sequence is truncated to, and padded up to, 128 tokens.
    """
    batch_texts = examples['text']
    encoding_options = {
        'truncation': True,
        'max_length': 128,
        'padding': 'max_length',
    }
    return tokenizer(batch_texts, **encoding_options)

# The commented-out block below builds and pickles the English tokenized dataset.
# Presumably it was the one-off step that produced the pickle loaded further down — TODO confirm.
# file_path_en = '/home/pgajo/working/data/datasets/English/Incels.is/IFC-22-EN_datatype.csv'  # Replace this with the path to your corpus file
# corpus_en = load_custom_corpus(file_path_en)
# dataset_en = Dataset.from_dict(corpus_en)
# tokenized_dataset_en = dataset_en.map(tokenize_function, batched=True, remove_columns=['text'])

# filename_en = 'IFC-22-EN_empty_datatype.pickle'

# if not os.path.isfile(filename_en):
#     # Save the tokenized_dataset as a pickle file
#     with open(filename_en, 'wb') as file:
#         pickle.dump(tokenized_dataset_en, file)
# else:
#     print(f"{filename_en} already exists. Not overwriting.")

# Load the tokenized_dataset from the pickle file
# NOTE(review): pickle.load can execute arbitrary code; only load pickles you created yourself.
# NOTE(review): the cached tokens are not checked against `model_name`; regenerate the
# pickle whenever the tokenizer changes.
with open('/home/pgajo/working/data/datasets/English/Incels.is/IFC-22-EN_empty_datatype.pickle', 'rb') as file:
    tokenized_dataset_en = pickle.load(file)

# Build the Italian tokenized dataset from its CSV: keep unlabelled rows, wrap them in a
# Dataset, then tokenize in batches and drop the raw 'text' column.
file_path_it = '/home/pgajo/working/data/datasets/Italian/Il_forum_dei_brutti/IFC-22-IT.csv'  # Replace this with the path to your corpus file
corpus_it = load_custom_corpus(file_path_it)
dataset_it = Dataset.from_dict(corpus_it)
tokenized_dataset_it = dataset_it.map(tokenize_function, batched=True, remove_columns=['text'])

# Pickle path: the CSV path with its extension replaced by '_empty_datatype.pickle'.
# splitext removes only the final extension, whereas split('.csv')[0] would cut the
# path at the first '.csv' appearing anywhere in it (e.g. in a directory name).
filename_it = os.path.splitext(file_path_it)[0] + '_empty_datatype.pickle'

# The pickle is written once and never read back here: tokenization above always runs,
# and an existing file is left untouched.
if not os.path.isfile(filename_it):
    # Save the tokenized_dataset as a pickle file
    with open(filename_it, 'wb') as file:
        pickle.dump(tokenized_dataset_it, file)
else:
    print(f"{filename_it} already exists. Not overwriting.")

In [None]:
# Draw a reproducible random subsample from each language's tokenized corpus.
seed = 42         # shuffle seed, reused when the merged dataset is shuffled
sample_n = 5_000  # maximum number of examples kept per language

# Shuffle first so the kept rows are a random draw rather than the head of the corpus.
# The sample size is clamped to the dataset length because select() rejects indices
# beyond the end of the dataset, which would break the cell on a small corpus.
tokenized_dataset_en = tokenized_dataset_en.shuffle(seed=seed)
tokenized_dataset_en = tokenized_dataset_en.select(range(min(sample_n, len(tokenized_dataset_en))))
tokenized_dataset_it = tokenized_dataset_it.shuffle(seed=seed)
tokenized_dataset_it = tokenized_dataset_it.select(range(min(sample_n, len(tokenized_dataset_it))))
print(tokenized_dataset_en)
print(tokenized_dataset_it)

In [None]:
from datasets import concatenate_datasets

# Combine the English and Italian samples into a single training set.
merged_dataset = concatenate_datasets([tokenized_dataset_en, tokenized_dataset_it])
# Shuffle with the shared seed so the two languages are mixed rather than back to back.
shuffled_dataset = merged_dataset.shuffle(seed=seed)
shuffled_dataset

In [None]:
# # Load the tokenized_dataset from the pickle file
# with open('/home/pgajo/working/data/datasets/English/Incels.is/IFC-22-EN_empty_datatype.pickle', 'rb') as file:
#     tokenized_dataset_en = pickle.load(file)

In [None]:
# Pretrained masked-LM checkpoint selected by `model_name`.
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Collator for the masked-language-modelling objective, masking with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# One epoch, batch size 16 per device, logging every 10 steps, no external reporting.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    report_to='none',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    logging_steps=10,
    save_steps=10_000,
    save_total_limit=2,
    disable_tqdm=False,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=shuffled_dataset,
)

# `resume_from_checkpoint = True` is deliberately left disabled here.
trainer.train()

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import os

# Name the saved model after the base checkpoint and the training-set size in thousands
# (e.g. a '-10k' suffix for 10,000 rows).
new_model_name = 'incel-' + model_name + '-' + str(len(shuffled_dataset) // 1000) + 'k'

# Parent directory that holds the saved models.
output_dir = "/home/pgajo/working/pt_models"

# Target directory for this model.
model_path = os.path.join(output_dir, new_model_name)

# exist_ok=True creates the directory only if needed, without a separate existence
# check that could race with another process.
os.makedirs(model_path, exist_ok=True)
model_path

In [None]:
# Write the model and the tokenizer into the same directory (model_path).
model.save_pretrained(model_path)
# Kept as the cell's last expression, so the notebook displays its return value.
tokenizer.save_pretrained(model_path)