<h2>Install & import required packages</h2>

In [None]:
!pip install transformers
!pip install transformers torch
!pip install datasets
!pip install evaluate
!pip install rouge_score

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import evaluate
import nltk

from PIL import Image
from transformers import AutoProcessor, AutoModelForSeq2SeqLM, AutoModelForVision2Seq, AutoTokenizer, AutoModelForCausalLM, pipeline, DataCollatorForLanguageModeling, Trainer, TrainingArguments, GenerationConfig, T5Tokenizer, T5ForConditionalGeneration
from transformers.trainer_utils import get_last_checkpoint
from tqdm import tqdm
from datasets import load_dataset
from nltk.tokenize import SyllableTokenizer
from nltk import word_tokenize
from evaluate import load

<h2>Load models</h2>

In [None]:
# img2word model (Kosmos-2) from https://huggingface.co/microsoft/kosmos-2-patch14-224
kosmos_name = "microsoft/kosmos-2-patch14-224"                    
# Vision-to-sequence model used in the Image2Word section to caption the input image.
kosmos_model = AutoModelForVision2Seq.from_pretrained(kosmos_name)
# Matching processor: builds the model inputs from (text, image) and post-processes the generated text.
kosmos_processor = AutoProcessor.from_pretrained(kosmos_name)

In [None]:
### load word to haiku model
model_name = "fabianmmueller/deep-haiku-gpt-2"

# NLTK syllable splitter; only called by the syllable helper functions further down.
syllable_tokenizer = SyllableTokenizer()
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the same checkpoint is loaded again as `pt_model` in the
# "Load Pretrained Model" section and `model` is not referenced by any other
# visible cell, so this looks like a redundant copy in memory; confirm before removing.
model = AutoModelForCausalLM.from_pretrained(model_name)
# mlm=False selects causal-LM collation instead of masked-LM collation.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False, return_tensors = "pt")

<h2>Image2Word</h2>

In [None]:
# Text prefix that the captioning model is asked to complete.
prompt = "<grounding>An image of"

# User inputted image
image = Image.open("data/snowman.jpg")

inputs = kosmos_processor(text=prompt, images=image, return_tensors="pt")

generated_ids = kosmos_model.generate(
    pixel_values=inputs["pixel_values"],
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    image_embeds=None,
    image_embeds_position_mask=inputs["image_embeds_position_mask"],
    use_cache=True,
    max_new_tokens=128,
)
generated_text = kosmos_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Raw model generation, kept under its own name for debugging.
# `cleanup_and_extract=False` leaves the grounding markup in place, e.g.
# `<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.`
raw_text = kosmos_processor.post_process_generation(generated_text, cleanup_and_extract=False)

# By default, the generated text is cleaned up and the entities are extracted.
processed_text, entities = kosmos_processor.post_process_generation(generated_text)

# The first field of the first entity is used as the keyword for the haiku model.
# When no entity was extracted, fall back to the cleaned caption instead of
# failing with an IndexError on `entities[0]`.
if entities:
    kosmos_output = entities[0][0]
else:
    kosmos_output = processed_text
print(kosmos_output)

<h2>Word2Haiku</h2>

<h3>Load Pretrained Model</h3>

In [None]:
# Perplexity metric from the `evaluate` library; later cells call
# `perplexity.compute(predictions=..., model_id=...)` to score generated text.
perplexity = load("perplexity", module_type="metric")

In [None]:
### load pretrained model

pt_model_name = "fabianmmueller/deep-haiku-gpt-2"

# NOTE(review): `syllable_tokenizer` and `tokenizer` were already created in the
# "Load models" section from the same checkpoint name; these lines rebind them.
syllable_tokenizer = SyllableTokenizer()
tokenizer = AutoTokenizer.from_pretrained(pt_model_name)
# This is the model object that the Trainer cell fine-tunes.
pt_model = AutoModelForCausalLM.from_pretrained(pt_model_name)

In [None]:
# Keyword-extraction model and tokenizer.
# NOTE(review): in the visible cells these are only referenced from the
# commented-out "RUN ONCE ONLY" preprocessing of dataset 2, so loading them on
# every run may be unnecessary; confirm.
keyword_model_name = "Voicelab/vlt5-base-keywords"
keyword_model = T5ForConditionalGeneration.from_pretrained(keyword_model_name)
keyword_tokenizer = T5Tokenizer.from_pretrained(keyword_model_name)

<h3>Define Variables</h3>

In [None]:
# Token length every example is padded/truncated to in preprocess_haikus.
max_length = 40
# Examples per batch, used for dataset mapping and for training/evaluation.
batch_size = 64

# Pick the best available accelerator instead of hard-coding Apple's MPS backend,
# so the notebook also runs on CUDA machines and on CPU-only machines.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

<h3>Helper Functions</h3>

In [None]:
### for tokenizing data

def preprocess_haikus(data):
    """Tokenize a batch of haikus into prompts and full target sequences.

    For each row the prompt is ``"( <keywords> = "`` and the target is that
    prompt followed by the haiku text and a closing ``")"``.

    Args:
        data: batch mapping with parallel ``'keywords'`` and ``'text'``
            sequences of strings.

    Returns:
        The tokenizer output for the prompts, with the targets supplied as
        ``text_target``; everything is padded/truncated to ``max_length``.
    """
    prompts = []
    completions = []
    for row, keywords in enumerate(data['keywords']):
        prefix = "( " + keywords + " = "
        prompts.append(prefix)
        completions.append(prefix + data["text"][row] + ")")

    # NOTE(review): the Trainer cell pairs this output with
    # DataCollatorForLanguageModeling(mlm=False), which presumably rebuilds
    # `labels` from `input_ids`. If so, only the prompt (not the haiku text) is
    # what the model is trained on -- verify.
    encoded = tokenizer(
        prompts,
        text_target=completions,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return encoded

In [None]:
### Clean up results from the pretrained model

def clean_result(result):
    """Extract the haiku text from a text-generation pipeline result.

    The generated text is expected to look like ``"( <keywords> = <haiku>)"``.
    The haiku is whatever lies between the first ``"="`` and the first ``")"``
    that follows it.

    Args:
        result: pipeline output; ``result[0]['generated_text']`` is read.

    Returns:
        The extracted text with surrounding whitespace stripped. If there is no
        ``"="`` the text is taken from the beginning; if there is no ``")"``
        after the start it is taken to the end.
    """
    result_text = result[0]['generated_text']
    equals_at = result_text.find("=")
    # Skip the "=" itself only when one was found; otherwise keep the first character.
    start = equals_at + 1 if equals_at != -1 else 0
    # Search for the closing parenthesis after the start, so a ")" inside the
    # keyword part cannot produce an empty slice.
    end = result_text.find(")", start)
    if end == -1:
        end = len(result_text)
    return result_text[start:end].strip()

In [None]:
### Helper function to write to json file

import json 
    
def write_json(data, filename):
    """Serialize ``data`` as JSON and write it to ``filename``, replacing any existing file."""
    serialized = json.dumps(data)
    with open(filename, "w") as outfile:
        outfile.write(serialized)

In [None]:
### Helper function to update json file as we preprocess dataset 2

def update_json(new_data, filename='dataset2.json', key="keywords"):
    """Append ``new_data`` to the list stored under ``key`` in a JSON file, in place.

    Args:
        new_data: iterable of items to append to the existing list.
        filename: path of an existing JSON file whose top level is an object.
        key: name of the list-valued entry to extend.
    """
    with open(filename, 'r+') as file:
        # Load the existing contents into a dict.
        file_data = json.load(file)
        # Append the new items to the list stored under `key`.
        file_data[key].extend(new_data)
        # Rewind so the updated document overwrites the old one from the start.
        file.seek(0)
        json.dump(file_data, file)
        # Drop leftover bytes of the previous contents; without this the file is
        # corrupted whenever the new serialization is shorter than the old one
        # (e.g. the file was previously pretty-printed).
        file.truncate()

In [None]:
# not used
def intoSyllables(data):
    """Add a ``'syllables'`` entry to a batch and prefix its keywords.

    For each text, every ``' /'`` is removed, the text is split on single
    spaces, each word is lower-cased and split into syllables, and the
    syllables are flattened into one list that starts with ``'syllables: '``.
    Each ``data['keywords'][i]`` is rewritten in place with a ``"keywords: "``
    prefix.

    Args:
        data: batch mapping with ``'text'`` and ``'keywords'`` sequences.

    Returns:
        The same ``data`` object, with ``'syllables'`` added.
    """
    stripped_texts = [text.replace(' /', '') for text in data['text']]
    per_text_syllables = [
        [syllable_tokenizer.tokenize(word.lower()) for word in text.split(' ')]
        for text in stripped_texts
    ]
    for idx, nested in enumerate(per_text_syllables):
        flattened = ['syllables: ']
        for word_syllables in nested:
            flattened.extend(word_syllables)
        per_text_syllables[idx] = flattened
        data['keywords'][idx] = "keywords: " + data['keywords'][idx]
    data['syllables'] = per_text_syllables
    return data

def sentence2syllables(sentence):
    """Return ``sentence`` re-joined as space-separated syllables.

    The sentence is split on single spaces, each word is passed to
    ``syllable_tokenizer.tokenize``, and all resulting pieces are joined with
    a single space.
    """
    pieces = []
    for word in sentence.split(' '):
        pieces.extend(syllable_tokenizer.tokenize(word))
    return ' '.join(pieces)

<h3>Loading datasets</h3>

<h3> Dataset 1 </h3>

In [None]:
### load dataset 1 (same as what the pretrained model was trained on)

haikus = load_dataset("statworx/haiku")
# Show the dataset structure and one raw example; the split cell below uses the
# 'train' split and drops several of its columns.
print(haikus)
print(haikus['train'][0])

In [None]:
### Splitting dataset 1 into train and test

# Raw columns that are not needed once the examples are tokenized.
unused_haiku_columns = ['source', 'text_phonemes', 'keyword_phonemes', 'gruen_score', 'text_punc']
# Fixed seed so the same examples land in train/validation/test on every run.
# Without it the held-out set changes between runs, which makes perplexity
# numbers incomparable and can leak test examples into training when a run is
# resumed from a checkpoint.
haiku_split_seed = 42

# First carve out a tiny (0.1%) test set, then split the remainder 90/10 into
# train and validation.
haikus = haikus["train"].train_test_split(test_size=0.001, seed=haiku_split_seed)
tokenized_haikus = {}
tokenized_haikus['test'] = haikus['test'].map(
    preprocess_haikus, batched=True, batch_size=batch_size, remove_columns=unused_haiku_columns
)
haikus = haikus['train'].train_test_split(test_size=0.1, seed=haiku_split_seed)
tokenized_haikus['train'] = haikus['train'].map(
    preprocess_haikus, batched=True, batch_size=batch_size, remove_columns=unused_haiku_columns
)
tokenized_haikus['validation'] = haikus['test'].map(
    preprocess_haikus, batched=True, batch_size=batch_size, remove_columns=unused_haiku_columns
)


In [None]:
# Sanity check: show the splits, then the first training example as raw text,
# as token ids, and decoded back from the ids (labels first, then inputs).
print(tokenized_haikus)

first_train_example = tokenized_haikus['train'][0]

print(first_train_example['text'])
print(first_train_example['labels'])
print(tokenizer.decode(first_train_example['labels']))

print(first_train_example['keywords'])
print(first_train_example['input_ids'])
print(tokenizer.decode(first_train_example['input_ids']))

<h3>Dataset 2 ( + preprocessing)</h3>

In [None]:
### Read dataset from txt file into a dictionary
### RUN ONCE ONLY

# data2 = {'text': [],
#          'keywords':[]}
# f = open("dataset2.txt", "r")
# content = [ln for ln in f]
# for line in range(0, len(content), 5):
#     data2['text'].append(content[line].strip() + ". / " +  content[line+1].strip() + ". / " + content[line+2].strip() + ". ") 
# print(data2['text'][0])
# f.close()


In [None]:
### Preprocess new training data from web
### RUN ONCE ONLY

# task_prefix = "Keywords: "
# inputs = data2['text']
# count = 0

# write_json(data2, "dataset2.json")
# total = 0

# for sample in inputs:
#     input_sequences = [task_prefix + sample]
#     input_ids = keyword_tokenizer(
#         input_sequences, return_tensors="pt", truncation=True
#     ).input_ids
#     output = keyword_model.generate(input_ids, no_repeat_ngram_size=1, num_beams=5)
#     predicted = keyword_tokenizer.decode(output[0], skip_special_tokens=True)
#     data2['keywords'].append(predicted.strip().split(',')[0])
#     count+=1
#     if (count==100):
#         print(sample)
#         print(data2['keywords'][-1])
#         update_json(data2['keywords'], "dataset2.json")
#         data2['keywords'] = []
#         count = 0
#         total += 1
    

# print(data2['keywords'][0])

In [None]:
from datasets import Dataset

# The code below reads row 0 of the 'train' split and treats its 'text' and
# 'keywords' fields as whole lists of haikus / keywords.
web_data_raw = load_dataset('json', data_files='web_haikus.json')
web_texts = web_data_raw['train']['text'][0]
web_keywords = web_data_raw['train']['keywords'][0]

# Keep only as many texts as there are keywords so both columns have equal length.
# (Named `web_records` rather than `dict`, which shadowed the builtin type.)
web_records = {"text": web_texts[:len(web_keywords)], "keywords": web_keywords}
web_data = Dataset.from_dict(web_records)

In [None]:
### Splitting dataset 2 into train and test

# Fixed seed so the train/test partition is identical on every run; an unseeded
# split changes the held-out set each time, which invalidates evaluation when
# training is resumed from an earlier checkpoint.
web_data = web_data.train_test_split(test_size=0.1, seed=42)

# Tokenize both splits with the same preprocessing used for dataset 1.
tokenized_web_data = {
    split_name: web_data[split_name].map(preprocess_haikus, batched=True, batch_size=batch_size)
    for split_name in ('train', 'test')
}

In [None]:
# Sanity check for dataset 2: show the splits, then the first training example
# as raw text, as token ids, and decoded back from the ids.
print(tokenized_web_data)

first_web_example = tokenized_web_data['train'][0]

print(first_web_example['text'])
print(first_web_example['labels'])
# Decode the web example itself (the original decoded `tokenized_haikus` here,
# so the decoded text did not correspond to the ids printed above).
print(tokenizer.decode(first_web_example['labels']))

print(first_web_example['keywords'])
print(first_web_example['input_ids'])
print(tokenizer.decode(first_web_example['input_ids']))

<h3>Generating on pretrained model</h3>

In [None]:
### Generating sample outputs with the pretrained model

# Prompt in the same "( <keywords> = " format that preprocess_haikus builds.
prompt = "( iced coffee = "

# NOTE(review): `pipe` is not referenced by any other visible cell. It wraps the
# `pt_model` object that is fine-tuned later, and `device_map="auto"` presumably
# requires the `accelerate` package -- confirm whether this pipeline is needed.
pipe = pipeline(
    "text-generation",
    model=pt_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Built from the checkpoint name rather than from `pt_model`; presumably this
# loads its own copy of the weights, so it should keep producing pretrained
# (baseline) output after `pt_model` is fine-tuned -- verify.
generator = pipeline('text-generation', model = pt_model_name)

result = generator(prompt)

In [None]:
### Cleaning results from pretrained model & measure perplexity in comparison to gpt2

cleaned_result = clean_result(result)

# The perplexity metric takes a list of input texts; `cleaned_result` is a
# single string, so wrap it in a list to score it as one text.
ppl_pt = perplexity.compute(predictions=[cleaned_result], model_id=pt_model_name)
print(ppl_pt['mean_perplexity'])
# Same text scored under plain GPT-2 for comparison.
ppl_gpt_pt = perplexity.compute(predictions=[cleaned_result], model_id="gpt2")
print(ppl_gpt_pt['mean_perplexity'])

<h1>Training on pretrained model</h1>

In [None]:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Move the model to be fine-tuned onto `device` (set in the "Define Variables" cell).
pt_model.to(device)

In [None]:
# Directory that receives checkpoints and the final saved model/tokenizer.
output_dir = "./result9"
# train_batch = 25
# eval_batch= 16
epochs = 2
save_steps = 500
# Defined but deliberately not passed below (the argument is commented out),
# so the library's default learning rate is used.
learning_rate=0.01
weight_decay=0.01
# Maximum number of checkpoints kept in `output_dir`.
save_total_limit=3
logging_steps=200


training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    weight_decay=weight_decay,
    logging_steps=logging_steps,
    # learning_rate = learning_rate,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=save_steps,
    # Previously defined above but never passed, so checkpoints accumulated
    # without limit.
    save_total_limit=save_total_limit,
    # fp16=True,
)

In [None]:
def compute_metrics(eval_pred):
    """Score greedy-decoded predictions with the perplexity metric.

    Takes the arg-max token over the last axis of ``eval_pred.predictions``,
    decodes each row to text, prints the decoded texts, and returns the result
    of ``perplexity.compute`` for them under ``pt_model_name``.

    Args:
        eval_pred: object whose ``predictions`` attribute holds the model outputs.

    Returns:
        The dictionary returned by ``perplexity.compute``.
    """
    logits = torch.FloatTensor(eval_pred.predictions)  # what transformer returns
    token_ids = logits.argmax(dim=-1)
    decoded_predictions = tokenizer.batch_decode(token_ids)
    print(decoded_predictions)
    scores = perplexity.compute(predictions=decoded_predictions, model_id=pt_model_name)
    return scores
# perplexity = load("perplexity", module_type="metric")
# results = perplexity.compute(predictions=predictions, model_id='gpt2')

In [None]:
# Small subsets of dataset 1 (first 2000 train / first 15 test rows).
# NOTE(review): neither subset is passed to the Trainer below -- confirm they are still needed.
small_train_dataset = tokenized_haikus["train"].select(np.arange(len(tokenized_haikus["train"]))[:2000])
small_test_dataset = tokenized_haikus["test"].select(np.arange(len(tokenized_haikus["test"]))[:15])
# Causal-LM collator (mlm=False).
# NOTE(review): this collator presumably rebuilds `labels` from `input_ids`,
# which would discard the `labels` produced by preprocess_haikus -- verify that
# the model is actually trained on the haiku text.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False, return_tensors = "pt")

pt_model.to(device)

# Fine-tunes `pt_model` in place on dataset 2 (the web haikus) and evaluates on its test split.
trainer = Trainer(
    model=pt_model,
    args=training_args,
    train_dataset=tokenized_web_data["train"],
    eval_dataset=tokenized_web_data['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)



In [None]:
# Resume only when a checkpoint actually exists in `output_dir`.
# `resume_from_checkpoint=True` raises when the directory holds no checkpoint
# (e.g. on the very first run); passing None starts training from the current weights.
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)
# trainer.evaluate()
# Persist the fine-tuned weights and the tokenizer so the generation cells can reload them.
pt_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

<h3> Generate + Evaluate </h3>

In [None]:
generate_input = tokenizer('( iced coffee =', return_tensors='pt').to(device)
# Reload the fine-tuned weights saved by the training cell.
trained_model = AutoModelForCausalLM.from_pretrained("./result9")

trained_model.to(device)

# The pad id must be passed as `pad_token_id`; the original `pad_token=` keyword
# is not a generation setting, so the pad id was never set. Fall back to the EOS
# id when the model config defines no pad token.
generation_pad_token_id = trained_model.config.pad_token_id
if generation_pad_token_id is None:
    generation_pad_token_id = trained_model.config.eos_token_id

# Beam-sample decoding shared by all generation cells below.
# NOTE(review): no_repeat_ngram_size=1 forbids any token from repeating, which
# presumably also limits a " / " line separator to one occurrence -- confirm intended.
generation_config = GenerationConfig(
    num_beams=5,
    early_stopping=True,
    eos_token_id=trained_model.config.eos_token_id,
    pad_token_id=generation_pad_token_id,
    no_repeat_ngram_size=1,
    do_sample=True,
)

generation_output = trained_model.generate(**generate_input, generation_config=generation_config)

decoded_output = tokenizer.batch_decode(generation_output, skip_special_tokens=True)

print(decoded_output)


In [None]:
# Tokenized prompt kept for a loss-based perplexity check (exp of the LM loss
# for `trained_model` and `pt_model` on this input) that is currently disabled.
eval_input = tokenizer('( iced coffee = ', return_tensors='pt').to(device)

for language_model in (trained_model, pt_model):
    language_model.to(device)

# Score the fine-tuned model's sample output under the pretrained haiku model
# and under plain GPT-2.
for label, scoring_model_id in (("ppl: ", pt_model_name), ("ppl gpt: ", "gpt2")):
    print(label, perplexity.compute(predictions=decoded_output, model_id=scoring_model_id))

In [None]:
def _generate_haiku(prompt_text):
    """Tokenize ``prompt_text``, generate with the fine-tuned model, and return the decoded outputs."""
    encoded = tokenizer(prompt_text, return_tensors='pt').to(device)
    output_ids = trained_model.generate(**encoded, generation_config=generation_config)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)


# One generated haiku per held-out example of dataset 1, prompted with its keywords.
our_results = [
    _generate_haiku("( " + row['keywords'] + " = ")[0]
    for row in tokenized_haikus['test']
]

iced_coffee = _generate_haiku('( iced coffee = ')
snowman = _generate_haiku('( snowman = ')
### this was part of the dataset
haiku = _generate_haiku('( haiku = ')

# Mean perplexity of the generated haikus under the pretrained haiku model and under GPT-2.
our_ppl = perplexity.compute(predictions=our_results, model_id=pt_model_name)['mean_perplexity']
our_ppl_gpt2 = perplexity.compute(predictions=our_results, model_id="gpt2")['mean_perplexity']

for label, value in (
    ("ppl: ", our_ppl),
    ("ppl gpt: ", our_ppl_gpt2),
    ("iced_coffee: ", iced_coffee),
    ("snowman: ", snowman),
    ("haiku: ", haiku),
):
    print(label, value)


In [None]:
def _generate_ids_from_text(text):
    """Tokenize ``text`` as-is (no "( ... = " wrapper) and return the fine-tuned model's generated ids."""
    encoded = tokenizer(text, return_tensors='pt').to(device)
    return trained_model.generate(**encoded, generation_config=generation_config)


def _print_decoded(output_ids):
    """Print the generated ids decoded to text, without special tokens."""
    print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))


# Fine-tuned model prompted with bare keywords.
iced_coffee = _generate_ids_from_text('iced coffee')
_print_decoded(iced_coffee)

snowman = _generate_ids_from_text('snowman')
_print_decoded(snowman)

### this was part of the dataset
our_chanting = _generate_ids_from_text('our chanting')
_print_decoded(our_chanting)

# Outputs of the `generator` pipeline for comparison; the last one is printed uncleaned.
for baseline_prompt in ("( iced coffee = ", "( snowman = "):
    print(clean_result(generator(baseline_prompt)))
print((generator("( haiku = ")))
