In [1]:
# model_name = "facebook/opt-350m"
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "openai-community/gpt2"
model_name = "EleutherAI/gpt-neo-125m"

In [2]:
max_length = 1024

def hashtag_format(text, summary):
    prompt = f"### Question: Summarize the following which is surrounded by quotes \"{text}\"\n ### Answer:"
    if (len(summary) > 0):
        prompt += f" {summary}"
    return prompt

def format_text(text, summary):
    limited_text = text[:max_length]
    return hashtag_format(limited_text, summary)

response_template = " ### Answer:"
    

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

dataset = load_dataset("csv", data_files="./datasets/podcast_with_summary.csv", split="train")

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

max_length = 1024

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['text'])):
        txt = example['text'][i]
        text = format_text(txt, example['summary'][i])
        output_texts.append(text)
    
    return output_texts

collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./tmp",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    learning_rate=5e-5,
    gradient_accumulation_steps=8
    )

trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    eval_dataset=dataset,
    args=training_args,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)

trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 30/30 [00:32<00:00,  1.07s/it]

{'train_runtime': 32.2489, 'train_samples_per_second': 29.675, 'train_steps_per_second': 0.93, 'train_loss': 2.8078570048014324, 'epoch': 3.0}





TrainOutput(global_step=30, training_loss=2.8078570048014324, metrics={'train_runtime': 32.2489, 'train_samples_per_second': 29.675, 'train_steps_per_second': 0.93, 'total_flos': 335163907630080.0, 'train_loss': 2.8078570048014324, 'epoch': 3.0})

In [4]:
trainer.save_model("./tuned_model")
tokenizer.save_pretrained("./tuned_tokenizer")

('./tuned_tokenizer\\tokenizer_config.json',
 './tuned_tokenizer\\special_tokens_map.json',
 './tuned_tokenizer\\vocab.json',
 './tuned_tokenizer\\merges.txt',
 './tuned_tokenizer\\added_tokens.json',
 './tuned_tokenizer\\tokenizer.json')

In [5]:
import pandas as pd

df = pd.read_csv("./datasets/podcast_with_summary.csv")
first_text = df['text'][0]
expected_summary = df['summary'][0]

prompt = format_text(first_text, "")

print(len(prompt))
print(prompt)
print(expected_summary)

1107
### Question: Summarize the following which is surrounded by quotes "As part of MIT course 6S099, Artificial General Intelligence, I've gotten the chance to sit down with Max Tegmark. He is a professor here at MIT. He's a physicist, spent a large part of his career studying the mysteries of our cosmological universe. But he's also studied and delved into the beneficial possibilities and the existential risks of artificial intelligence. Amongst many other things, he is the cofounder of the Future of Life Institute, author of two books, both of which I highly recommend. First, Our Mathematical Universe. Second is Life 3.0. He's truly an out of the box thinker and a fun personality, so I really enjoy talking to him. If you'd like to see more of these videos in the future, please subscribe and also click the little bell icon to make sure you don't miss any videos. Also, Twitter, LinkedIn, agi.mit.edu if you wanna watch other lectures or conversations like this one. Better yet, go read

In [6]:
def parse_summary_from_response(response):
    # find the start token
    start_token = response.find(response_template)
    if start_token == -1:
        return None
    # trim from end of the start token to the end of the response
    result = response[start_token + len(response_template):]
    
    # attempt to trim any of the cut off sentences
    # reverse find the last punctuation
    last_punctuation = -1
    for p in ['.', '!', '?']:
        last_punctuation = result.rfind(p)
        if last_punctuation != -1:
            break
    if last_punctuation != -1:
        result = result[:last_punctuation + 1]
    return result

In [12]:
max_new_tokens = 32

In [13]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("./tuned_model")
tokenizer = AutoTokenizer.from_pretrained("./tuned_tokenizer")

# Create a text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device="cuda", truncation=True)

# Generate text
generated_text = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)

print(len(generated_text))
print(generated_text[0]["generated_text"])

summary = parse_summary_from_response(generated_text[0]["generated_text"])
print("\nParsed Summary: " + summary)

1
### Question: Summarize the following which is surrounded by quotes "As part of MIT course 6S099, Artificial General Intelligence, I've gotten the chance to sit down with Max Tegmark. He is a professor here at MIT. He's a physicist, spent a large part of his career studying the mysteries of our cosmological universe. But he's also studied and delved into the beneficial possibilities and the existential risks of artificial intelligence. Amongst many other things, he is the cofounder of the Future of Life Institute, author of two books, both of which I highly recommend. First, Our Mathematical Universe. Second is Life 3.0. He's truly an out of the box thinker and a fun personality, so I really enjoy talking to him. If you'd like to see more of these videos in the future, please subscribe and also click the little bell icon to make sure you don't miss any videos. Also, Twitter, LinkedIn, agi.mit.edu if you wanna watch other lectures or conversations like this one. Better yet, go read Ma

In [14]:
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device="cuda", truncation=True)

# Generate text
generated_text = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)

print(generated_text[0]["generated_text"])

summary = parse_summary_from_response(generated_text[0]["generated_text"])
print("\nParsed Summary: " + summary)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


### Question: Summarize the following which is surrounded by quotes "As part of MIT course 6S099, Artificial General Intelligence, I've gotten the chance to sit down with Max Tegmark. He is a professor here at MIT. He's a physicist, spent a large part of his career studying the mysteries of our cosmological universe. But he's also studied and delved into the beneficial possibilities and the existential risks of artificial intelligence. Amongst many other things, he is the cofounder of the Future of Life Institute, author of two books, both of which I highly recommend. First, Our Mathematical Universe. Second is Life 3.0. He's truly an out of the box thinker and a fun personality, so I really enjoy talking to him. If you'd like to see more of these videos in the future, please subscribe and also click the little bell icon to make sure you don't miss any videos. Also, Twitter, LinkedIn, agi.mit.edu if you wanna watch other lectures or conversations like this one. Better yet, go read Max'