In [None]:
import pandas as pd 
import re 
import warnings 
warnings.filterwarnings("ignore")
from transformers import AutoModelForMaskedLM,AutoTokenizer,TrainingArguments,\
LineByLineTextDataset , DataCollatorForLanguageModeling,Trainer

In [None]:
# Read the competition's train and test splits from the Kaggle input folder.
# The train file is read first, then the test file.
tr_data, ts_data = (
    pd.read_csv("../input/commonlitreadabilityprize/train.csv"),
    pd.read_csv("../input/commonlitreadabilityprize/test.csv"),
)

In [None]:
# Stack the train and test rows into a single frame so that every excerpt can be
# cleaned and used as pre-training text.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of both inputs are kept as-is, so labels repeat whenever both frames
# use the same labels, which makes label-based selection/alignment ambiguous.
data = pd.concat([tr_data, ts_data], ignore_index=True)

In [None]:
# Characters that get a space added on either side.  The backslash is escaped
# explicitly: a bare "\(" is an invalid escape sequence that recent Python
# versions warn about when the code is compiled.
_PUNCTUATION = '.,?!;\\(":-)‘'

# (contraction, replacement) pairs, applied in this order as case-sensitive
# plain-substring replacements.
_CONTRACTIONS = (
    ("i'm", "I'm"),
    ("don't", "do not"),
    ("didn't", "did not"),
    ("can't", "cannot"),
    ("i'll", "I will"),
    ("wouldn't", "would not"),
    ("i've", "I have"),
    ("won't", "will not"),
    ("couldn't", "could not"),
    ("wasn't", "was not"),
    ("you'll", "you will"),
    ("isn't", "is not"),
    ("you're", "you are"),
    ("hadn't", "had not"),
    ("you've", "you have"),
    ("doesn't", "does not"),
    ("haven't", "have not"),
    ("they're", "they are"),
    ("we're", "we are"),
)


def clean_text(excerpt):
    """Pad punctuation with spaces and expand common English contractions.

    The text goes through three steps, in this order:

    1. Each character of ``_PUNCTUATION`` is surrounded by one space on each
       side (``"a,b"`` becomes ``"a , b"``).  Existing whitespace is kept, so
       runs of several spaces can appear.
    2. Every occurrence of ``'s`` is replaced with `` is ``, regardless of the
       surrounding context.
    3. Each entry of ``_CONTRACTIONS`` is replaced by its expansion.  Matching
       is case-sensitive and substring-based, so ``"Don't"`` is left alone.

    Args:
        excerpt: The text to clean; must be a ``str``.

    Returns:
        A new string with the replacements applied.
    """
    # One translation pass is equivalent to replacing the characters one after
    # another: each inserted piece only contains spaces and the character itself.
    padding = str.maketrans({mark: f" {mark} " for mark in _PUNCTUATION})
    extrait = excerpt.translate(padding)

    extrait = re.sub(r"'s", " is ", extrait)

    for contraction, expansion in _CONTRACTIONS:
        extrait = extrait.replace(contraction, expansion)

    return extrait

In [None]:
# Run every excerpt through clean_text and store the result in a new column.
data["cleaned_excerpt"] = data["excerpt"].apply(clean_text)

In [None]:
# Name of the pretrained checkpoint used for both the model and the tokenizer.
model_name = "roberta-base"
# Load that checkpoint through the masked-language-modelling auto class; this is
# the model object later handed to the Trainer.
model = AutoModelForMaskedLM.from_pretrained(model_name)

In [None]:
# Load the tokenizer from the same checkpoint name as the model; it is used to
# build the datasets and the masking data collator below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Build one newline-separated corpus string from the cleaned excerpts.
# NOTE(review): an excerpt that itself contains "\n" will span several lines of
# the corpus - confirm that this is intended for the line-by-line dataset.
texts = "\n".join(data["cleaned_excerpt"])

In [None]:
# Write the corpus to disk so it can be loaded as a line-by-line text dataset.
# The encoding is stated explicitly: clean_text keeps non-ASCII characters such
# as "‘", and the platform default encoding is not UTF-8 everywhere, which could
# fail on write or produce a file that is read back incorrectly.
with open("texts.txt", "w", encoding="utf-8") as f:
    f.write(texts)

In [None]:
# Build the training and validation datasets (in that order) with identical
# settings.
# NOTE(review): both are created from the same file, so the evaluation loss is
# measured on the training text - confirm that no held-out split is wanted.
dataset, val_dataset = (
    LineByLineTextDataset(tokenizer=tokenizer, file_path="./texts.txt", block_size=256)
    for _ in range(2)
)

In [None]:
# Collator for masked-language-model training: MLM is switched on with a
# masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
# Training configuration: 3 epochs, batch size 16 per device, evaluation every
# 150 steps, and the checkpoint with the lowest eval_loss reloaded at the end.
training_args = TrainingArguments(
    output_dir="./clrp_roberta_base_chk",  # select model path for checkpoint
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=150,
    # load_best_model_at_end can only reload a checkpoint that was actually
    # saved, and the library requires save_steps to be a round multiple of
    # eval_steps. The default save_steps (500) is not a multiple of 150, so the
    # save schedule is aligned with the evaluation schedule explicitly.
    save_steps=150,
    # NOTE(review): confirm how the installed transformers version treats a
    # limit of 0; a checkpoint is now written at every evaluation step.
    save_total_limit=0,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # a lower eval_loss is better
    load_best_model_at_end=True,
    prediction_loss_only=True,
    report_to="none",
)

# Wire the model, arguments, masking collator and datasets together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run training with the configuration given to the Trainer.
trainer.train()
# Save the model held by the trainer into the same directory that
# training_args uses as output_dir.
trainer.save_model("./clrp_roberta_base_chk")