In [1]:
!pip install transformers
!pip install datasets
!pip install mlflow
!pip install torch

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:0

In [2]:

!pip install pyngrok -q

In [3]:

import subprocess
from pyngrok import ngrok, conf
import getpass

In [6]:
import os
import mlflow
import mlflow.pytorch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
from transformers import EarlyStoppingCallback
import torch

In [7]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
subprocess.Popen(["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI])

<Popen: returncode: None args: ['mlflow', 'ui', '--backend-store-uri', 'sqli...>

In [20]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("duration-prediction-experiment")

<Experiment: artifact_location='/content/mlruns/1', creation_time=1728775149077, experiment_id='1', last_update_time=1728775149077, lifecycle_stage='active', name='duration-prediction-experiment', tags={}>

In [21]:
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth")
conf.get_default().auth_token = getpass.getpass()
port=5000
public_url = ngrok.connect(port).public_url
print(f' * ngrok tunnel \"{public_url}\" -> \"http://127.0.0.1:{port}\"')

Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth
··········
 * ngrok tunnel "https://4c38-34-75-140-112.ngrok-free.app" -> "http://127.0.0.1:5000"


In [22]:
import os
import mlflow
import mlflow.pytorch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
from transformers import EarlyStoppingCallback
import torch

os.makedirs("mlruns", exist_ok=True)
mlflow.end_run()

mlflow.start_run()

data_files = 'cleaned_creative_writing_dataset.csv'
dataset = load_dataset('csv', data_files=data_files)
dataset = dataset['train'].remove_columns(['text'])
dataset = dataset.rename_column('cleaned_text', 'text')

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    input_ids = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=32,
    )
    input_ids['labels'] = input_ids['input_ids'].copy()
    return input_ids

tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
tokenized_datasets = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    accuracy = (predictions == labels).mean()
    return {'accuracy': accuracy}


learning_rate = 2e-5
per_device_train_batch_size = 1
num_train_epochs = 1
max_length = 32

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    no_cuda=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

mlflow.log_param("data_files", data_files)
mlflow.log_param("learning_rate", learning_rate)
mlflow.log_param("per_device_train_batch_size", per_device_train_batch_size)
mlflow.log_param("num_train_epochs", num_train_epochs)
mlflow.log_param("max_length", max_length)
mlflow.log_param("model_name", "gpt2")

trainer.train()

model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

mlflow.pytorch.log_model(model, "fine_tuned_gpt2")

eval_metrics = trainer.evaluate()

if 'loss' in trainer.state.log_history[-1]:
    train_loss = trainer.state.log_history[-1]['loss']
else:
    train_loss = None

mlflow.log_metric("Training Loss", train_loss if train_loss is not None else 0.0)
mlflow.log_metric("Validation Loss", eval_metrics['eval_loss'])
mlflow.log_metric("Accuracy", eval_metrics.get('eval_accuracy', 0.0))

mlflow.end_run()

print("Model training and saving completed.")

from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "./fine_tuned_gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

model.eval()

def generate_story(prompt, max_length=1000, temperature=1.5, top_k=100):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_story = tokenizer.decode(output[0], skip_special_tokens=True)

    mlflow.start_run()
    mlflow.log_param("generation_max_length", max_length)
    mlflow.log_param("temperature", temperature)
    mlflow.log_param("top_k", top_k)
    mlflow.log_param("prompt", prompt)
    mlflow.end_run()

    return generated_story

prompt = "Write a story about a girl's adventures in a magical forest where she finds strange creatures"
generated_text = generate_story(prompt, max_length=1000)
print(generated_text)


2024/10/13 02:50:15 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id b7c26c61f0c549afbd3b631714897a58: Failed to log run data: Exception: Changing param values is not allowed. Params were already logged='[{'key': 'max_length', 'old_value': '32', 'new_value': '20'}]' for run ID='b7c26c61f0c549afbd3b631714897a58'.


Epoch,Training Loss,Validation Loss,Accuracy
1,5.6026,5.522341,0.164226


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Model training and saving completed.
Write a story about a girl's adventures in a magical forest where she finds strange creatures always found dreams may find courage use her book "It goes well story" saying first time looking pachyderm ever wrote wrote story felt i had never done novel several people asked storyboard storyboard artist kelvia kartan first introduced rozja work first book made dylan shulke book gave kartan good first time author josh duet received first issue new set introduced first issue early one morning roaz wilman introduced character kartan idea dream like girl rozi is born alittle staig minning best writers ryan james raul branly kartan called book became first ever econ one reading co write volume chronicles series characters love laura pfry best novel storyboard cartoon rozi used cartoon writing kartan wanted author to do companion series


In [24]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.0.2-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.0 (from gradio)
  Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (fr

In [26]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned GPT-2 model and tokenizer
model_name = "./fine_tuned_gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

model.eval()

# Define the generation function
def generate_story(prompt, max_length=1000, temperature=1.5, top_k=100):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )

    generated_story = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_story

# Define Gradio interface
def gradio_generate(prompt):
    generated_text = generate_story(prompt)
    return generated_text

# Create Gradio interface
gradio_interface = gr.Interface(
    fn=gradio_generate,
    inputs="text",
    outputs="text",
    title="Story Generator",
    description="Enter a prompt to generate a story using the fine-tuned GPT-2 model.",
)

# Launch the interface
gradio_interface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fc0d79ba43be8fbfba.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


