<a href="https://colab.research.google.com/github/navidTerraNova/crossword-puzzle-clue-generation-using-LLMs/blob/main/T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PACKAGE INSTALLATION**

In [None]:
!pip install rouge_score nltk transformers transformers[sentencepiece] datasets

# **MOUNTING GOOGLE DRIVE**

In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive; the raw CSV read below lives under /content/drive/MyDrive.
drive.mount("/content/drive")

# **LOADING DATAFRAME**

In [None]:
# Read the raw NYT crossword CSV from the mounted Drive, decoding the file as latin1.
# NOTE(review): pandas' default NA parsing presumably turns answers such as "NA" or "NULL" into NaN,
# which the later dropna() then removes -- confirm that losing those answers is acceptable.
df = pd.read_csv("/content/drive/MyDrive/DataSet/nytcrosswords.csv", encoding='latin1')

## **NAN VALUE CHECK**

In [None]:
# Report every row whose answer is the literal string "NAN".
# A vectorized comparison replaces the per-row Python loop, which re-fetched the whole
# column on every iteration; missing values simply compare as False here.
nan_string_mask = df["Word"] == "NAN"
for i, word in df.loc[nan_string_mask, "Word"].items():
  print("found None Value")
  print(word)
  print(i)

# Total number of matching rows.
counter = int(nan_string_mask.sum())
print(counter)

In [None]:
# Report every row whose answer is missing (NaN).
# isna() expresses the original `x != x` trick directly and runs vectorized instead of
# looping over the frame row by row.
missing_word_mask = df["Word"].isna()
for i, word in df.loc[missing_word_mask, "Word"].items():
  print("found None Value")
  print(word)
  print(i)

# Total number of rows with a missing answer.
counter = int(missing_word_mask.sum())
print(counter)

# **DATA PREPROCESS**

In [None]:
# Clean the raw frame into (Word, Clue) training pairs.
# Missing values are dropped FIRST: `~Series.str.contains(...)` raises on a column that still
# holds NaN, so the fill-in-the-blank filter below must only ever see real strings.
df = (
    df.drop(columns=["Date"])
      .dropna()
      .assign(Word=lambda frame: frame["Word"].str.lower())
)

# Remove fill-in-the-blank clues; regex=False makes the literal match explicit.
df = df[~df["Clue"].str.contains("___", regex=False)]

# Drop only the answer that is exactly "nan": it would be re-read as a missing value when the
# cleaned CSV is loaded again. A substring match would also discard legitimate answers such as
# "banana" or "tenant".
df = df[df["Word"] != "nan"]

df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Shuffle all rows (frac=1 keeps every row); random_state fixes the permutation so it is repeatable.
df = df.sample(frac=1, random_state=2)

In [None]:
# Keep at most the first 500,000 rows of the shuffled frame (positional slice).
df = df.iloc[:500000]

In [None]:
# Write the cleaned frame to the current working directory, without the index column.
df.to_csv('NYT-Crossword-Cleaned.csv', index = False)

# **LOADING DATASET**

In [None]:
from datasets import load_dataset
# Load the cleaned CSV through the same relative path it was written to, so the notebook
# does not depend on the working directory being /content.
dataset = load_dataset("csv", data_files="NYT-Crossword-Cleaned.csv")

## **TRAIN_VALID_TEST**

In [None]:
# Hold out 2% of the rows as the test split. shuffle=False keeps the file order, which is
# already randomized because the frame was shuffled before being written to the CSV.
dataset = dataset["train"].train_test_split(test_size=0.02, shuffle=False)

In [None]:
# Carve 5,000 validation rows out of the training split.
# The fixed seed makes the split identical across kernel restarts.
datasets_train_validation = dataset["train"].train_test_split(test_size=5000, seed=2)

dataset["train"] = datasets_train_validation["train"]
dataset["validation"] = datasets_train_validation["test"]

In [None]:
# Sub-sample both splits for a faster run. The shuffles are seeded so the same rows are picked
# on every run, and the sizes are clamped so a split smaller than the target does not raise.
train_subset_size = min(50000, len(dataset["train"]))
validation_subset_size = min(2000, len(dataset["validation"]))

dataset["train"] = dataset["train"].shuffle(seed=2).select(range(train_subset_size))
dataset["validation"] = dataset["validation"].shuffle(seed=2).select(range(validation_subset_size))

# **TOKENIZE**

In [None]:
from transformers import AutoTokenizer
# Checkpoint name used for the tokenizer here and for the model loaded in model_init.
model_checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def preprocess_function(examples, max_input_length=16, max_target_length=32):
  """Tokenize a batch of crossword answers (inputs) and clues (targets).

  Args:
    examples: batch mapping with a "Word" list (model inputs) and a "Clue" list (targets).
    max_input_length: inputs longer than this many tokens are truncated.
    max_target_length: targets longer than this many tokens are truncated.

  Returns:
    The tokenized inputs with an added "labels" entry holding the target token ids.

  No padding is applied here on purpose. Padding at this stage would write pad-token ids
  into "labels", and those positions would then count towards the training loss. Leaving
  the sequences unpadded lets the seq2seq data collator pad each batch, so label padding
  can be ignored by the loss.
  """
  model_inputs = tokenizer(examples["Word"], max_length=max_input_length, truncation=True)

  # Targets are tokenized in the tokenizer's target mode.
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples["Clue"], max_length=max_target_length, truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
# Tokenize every split; batched=True hands preprocess_function lists of examples at a time.
tokenized_datasets = dataset.map(preprocess_function, batched = True)

In [None]:
# Sanity check: show one clue next to its label token ids.
sample_row = tokenized_datasets["train"][2]
print(sample_row["Clue"], sample_row["labels"])

In [None]:
# Show a summary of the tokenized splits.
tokenized_datasets

# **SETTING TRAINING ARGUMENTS**

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Training configuration for the fine-tuning run.
batch_size = 32
model_name = "FTT5-(500000)"
# Relative path: it resolves under the Drive mount only when the working directory is /content.
model_dir = f"drive/MyDrive/Models/{model_name}"

args = Seq2SeqTrainingArguments(
    # Output directory for checkpoints.
    model_dir,
    # Evaluate every 200 steps and log every 100 steps.
    evaluation_strategy="steps",
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=100,
    # Checkpoints are written every 2000 steps, i.e. on every 10th evaluation.
    save_strategy="steps",
    save_steps=2000,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    # Evaluation uses generated sequences, which compute_metrics decodes for ROUGE.
    predict_with_generate=True,
    # NOTE(review): mixed precision presumably requires a GPU runtime -- confirm before running on CPU.
    fp16=True,
    # The best checkpoint is chosen by the "rouge1" value reported by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

# **METRIC SETUP**

In [None]:
from datasets import load_metric
# ROUGE metric object used by compute_metrics.
# NOTE(review): `load_metric` is presumably deprecated in newer `datasets` releases -- confirm
# the installed version still provides it.
metric = load_metric("rouge")

In [None]:
import nltk
# Download the 'punkt' resource; nltk.sent_tokenize is used in compute_metrics and at inference.
nltk.download('punkt')

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F1 scores (as percentages) and the mean generated length.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. ``predictions`` may
            also be a tuple whose first element holds the generated ids.

    Returns:
        Dict mapping each ROUGE key to its F1 score times 100, plus ``"gen_len"`` (the
        mean number of non-pad tokens per prediction), all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some setups hand back (generated ids, extra outputs); only the ids are scored.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded, so map it back to the pad token.
    # Predictions get the same treatment as labels because they can carry -100 padding too.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores; accept both aggregate objects (``.mid.fmeasure``) and plain floats.
    result = {key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
              for key, value in result.items()}

    # Add mean generated length to metrics (computed on the cleaned ids so -100 is not counted)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# **TRAINER API SETUP**

In [None]:
# Batch collator for seq2seq training, built from the tokenizer and handed to the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:

# Factory handed to the trainer so that it can build the model itself.
def model_init():
    """Return a fresh model loaded from the pretrained `model_checkpoint` weights."""
    fresh_model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    return fresh_model

# Build the trainer. Periodic evaluation (and therefore best-checkpoint selection) runs on the
# validation split; the test split is left untouched so it can give an unbiased final score.
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# **PLOT**

In [None]:
# Start TensorBoard before training to monitor it in progress.
# The log directory is the `runs` sub-folder of model_dir ({model_dir} is interpolated by IPython).
%load_ext tensorboard
%tensorboard --logdir '{model_dir}'/runs

In [None]:
# Reload the TensorBoard extension (presumably only needed when it was already loaded in this kernel).
%reload_ext tensorboard

In [None]:
import os
from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in the output directory when one exists; otherwise start
# from scratch. Passing resume_from_checkpoint=True unconditionally fails on a first run,
# because there is no checkpoint to resume from yet.
last_checkpoint = get_last_checkpoint(args.output_dir) if os.path.isdir(args.output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)



  0%|          | 0/8000 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
8200,0.852,0.894035,4.3128,0.4357,4.2247,4.2418,7.7073
8400,0.8416,0.892978,4.3701,0.4579,4.2927,4.3063,7.3153
8600,0.8462,0.891503,4.374,0.433,4.2942,4.3148,7.378
8800,0.8618,0.890067,4.5149,0.4307,4.4232,4.444,7.8852
9000,0.8369,0.88952,4.6899,0.5044,4.6004,4.6226,8.0124
9200,0.8239,0.888058,4.5898,0.4777,4.4996,4.5145,7.6775
9400,0.8443,0.887016,4.6691,0.5016,4.5797,4.5923,7.8354
9600,0.8472,0.886131,4.7336,0.4887,4.6474,4.6682,8.0279
9800,0.8389,0.885407,4.669,0.4879,4.5776,4.5943,7.7797
10000,0.8263,0.884348,4.7006,0.4976,4.6074,4.6258,7.7606


KeyboardInterrupt: ignored

# **INFERENCE**

In [None]:
# Load a saved checkpoint (tokenizer + model) for inference.
# NOTE(review): this points at "FTT5-(300000)", not the "FTT5-(500000)" run configured for
# training -- confirm this is the intended checkpoint. The cell also rebinds the notebook-level
# model_name, model_dir and tokenizer variables.
model_name = "FTT5-(300000)/checkpoint-35000"
model_dir = f"drive/MyDrive/Models/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Same input truncation length that was used when tokenizing the training words.
max_input_length = 16

In [None]:
# Sample several candidate clues for one answer word.
word = "history"
num_samples = 40

# Tokenize once: the encoded input is identical for every sample, so it is hoisted out of the loop.
inputs = tokenizer(word, max_length=max_input_length, truncation=True, return_tensors="pt")

for _ in range(num_samples):
  # do_sample=True makes every call draw a different candidate.
  output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=1, max_length=15)
  decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()

  # Keep only the first sentence. An empty generation yields no sentences at all, so fall back
  # to the (empty) decoded text instead of raising IndexError.
  sentences = nltk.sent_tokenize(decoded_output)
  predicted_clue = sentences[0] if sentences else decoded_output

  print(predicted_clue)