In [None]:
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
import os
import pickle

# Checkpoint to continue masked-language-model training from. Swap in the
# commented alternative to start from BERT instead.
# model_name = 'bert-base-uncased'
model_name = 'roberta-base'

# Tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ---------------------------------------------------------------------------
# Reference only (not executed): preprocessing that tokenizes the raw corpus
# and pickles the result.
# NOTE(review): presumably this is what produced the pickle loaded below (same
# base filename) -- confirm. If so, the stored token ids are only valid for
# the tokenizer that was active when the pickle was written.
# ---------------------------------------------------------------------------
# # Load your own corpus
# def load_custom_corpus(file_path):
#     import pandas as pd
#     df = pd.read_csv(file_path)
#     df = df.fillna('')
#     df = df[df['data_type'] == ''] # only take rows that do not belong to train/dev/test of IFD-EN-5203
#     sentences = [sent for sent in df['text']]
#     return {'text': sentences}

# file_path = '/home/pgajo/working/data/datasets/English/Incels.is/IFC-22-EN_datatype.csv'  # Replace this with the path to your corpus file

# corpus = load_custom_corpus(file_path)

# def tokenize_function(examples):
#     return tokenizer((examples['text']), truncation=True, max_length=128, padding='max_length')

# dataset = Dataset.from_dict(corpus)
# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# filename = 'IFC-22-EN_empty_datatype.pickle'

# if not os.path.isfile(filename):
#     # Save the tokenized_dataset as a pickle file
#     with open(filename, 'wb') as file:
#         pickle.dump(tokenized_dataset, file)
# else:
#     print(f"{filename} already exists. Not overwriting.")

# Location of the pre-tokenized corpus. Kept in a single named constant so the
# machine-specific absolute path is easy to find and change.
TOKENIZED_DATASET_PATH = '/home/pgajo/working/data/datasets/English/Incels.is/IFC-22-EN_empty_datatype.pickle'

# SECURITY: unpickling can execute arbitrary code embedded in the file, so
# only load pickles that you created yourself or otherwise fully trust.
with open(TOKENIZED_DATASET_PATH, 'rb') as file:
    tokenized_dataset = pickle.load(file)


In [None]:
# Seed for the shuffle below, so a fresh run draws the same subset
seed = 42

# Upper bound on the number of examples kept for training
max_train_examples = 100_000

tokenized_dataset = tokenized_dataset.shuffle(seed=seed)

# Clamp the requested size to what is actually available: selecting indices
# past the end of the dataset raises an IndexError, which would otherwise
# happen for any corpus smaller than the cap.
num_train_examples = min(max_train_examples, len(tokenized_dataset))
tokenized_dataset = tokenized_dataset.select(range(num_train_examples))

# Display the resulting dataset summary as the cell output
tokenized_dataset

In [None]:
# Collator for masked language modelling: mlm=True enables masking and
# mlm_probability sets the fraction of tokens picked for it.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Pre-trained checkpoint (with its masked-LM head) that gets further trained
model = AutoModelForMaskedLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    # Reuse the seed defined in the shuffling cell so that training randomness
    # follows the same setting instead of the library's implicit default.
    seed=seed,
    logging_dir='./logs',
    logging_steps=10,
    report_to='none',
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Uncomment the argument to continue from a checkpoint in output_dir instead
# of starting over.
trainer.train(
    # resume_from_checkpoint = True
)

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import os

# Name for the further-trained model: base checkpoint name plus the size of
# the training set in thousands of examples (floor division keeps it an int).
new_model_name = f'incel-{model_name}-{len(tokenized_dataset) // 1000}k'

# Parent directory that holds the saved models
output_dir = "/home/pgajo/working/pt_models"

# Directory for this particular model
model_path = os.path.join(output_dir, new_model_name)

# exist_ok=True creates the directory (and missing parents) when needed and
# does nothing when it is already there, without a separate existence check.
os.makedirs(model_path, exist_ok=True)

# Display the target path as the cell output
model_path

In [None]:
# Persist the model weights/config into the directory prepared above ...
model.save_pretrained(save_directory=model_path)
# ... and put the tokenizer files alongside them so the directory can be
# loaded as a self-contained checkpoint.
tokenizer.save_pretrained(save_directory=model_path)