In [1]:
import pandas as pd
import numpy as np

from transformers import pipeline, set_seed
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer


In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "baidu/ERNIE-4.5-0.3B-PT"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# prepare the model input
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=1024
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# decode the generated ids
generate_text = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
print("generate_text:", generate_text)


tokenizer.model:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

configuration_ernie4_5.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT:
- configuration_ernie4_5.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_ernie4_5.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT:
- modeling_ernie4_5.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/722M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

generate_text: **Short Introduction:** Large language models (LLMs) are artificial intelligence systems that simulate human-like intelligence through natural language processing and computational reasoning. They enable users to engage in complex conversations, generate text, solve puzzles, and perform tasks requiring high-level reasoning, such as writing essays, creating articles, or even understanding human language. Unlike traditional chatbots, LLMs operate independently, processing data from large datasets to provide context-aware responses. Their development has revolutionized communication, productivity, and creativity across industries like healthcare, finance, and education.


In [2]:
generator = pipeline('text-generation', model='gpt2')


Device set to use cuda:0


In [3]:
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Hello, I\'m a language model, and this is why I love language models. I can think of no language more exciting than a language model. It\'s the only way of thinking about the world we\'re living in right now.\n\nNow, I have a lot of work to do.\n\nI\'ve had a lot of good conversations with people like you about the importance of languages, and they\'ve been very supportive. But I want to share with you a great deal of what I think of as a language model.\n\nSo let\'s start with the simplest, most basic, and most basic language model. I use the term "language model" because the way I think of language models is that they\'re the way human beings think about the world. They\'re not built on any particular set of rules but they\'re built on a certain set of rules. We can talk about any language model, because they\'re built on that set of rules. They\'re built on a certain kind of rules. We can talk about any language model, because they\'re built on that set of rules

In [4]:
dreambank = pd.read_csv('dreambank.csv')
dreambank.head(5)

Unnamed: 0,id,name,number,time,date,gender,age,report,character,emotion
0,alta,Alta: a detailed dreamer,1,1985-1997,1957,F,A,"The one at the Meads's house, where it's bigge...","1MSA, 1FSA, 1FKA, 2ISA",
1,alta,Alta: a detailed dreamer,2,1985-1997,8/11/67,F,A,I'm at a family reunion in a large fine house ...,"2MSA, 2JSA",
2,alta,Alta: a detailed dreamer,3,1985-1997,8/1/85,F,A,I watch a plane fly past and shortly realize i...,"2ISA, 1FSA, 2ISA, 1MKA, 1MKA",
3,alta,Alta: a detailed dreamer,4,1985-1997,1985?,F,A,Me pulling the green leaves and berries off so...,"1MSC, 1FKA, 2JSA, 1ANI, 2MSA, 1ANI",
4,alta,Alta: a detailed dreamer,5,1985-1997,1985?,F,A,I'm in a room that reminds me of (but definite...,"1MKA, 2IOA, 1MKA, 2JSA, 1MSA","CO D, AN 1MKA"


In [5]:
name_dream = dreambank['name'].unique()
name_dream

array(['Alta: a detailed dreamer', 'Angie: age 18 & 20',
       'Arlie: a middle-aged woman', 'Barb Sanders', 'Barb Sanders #2',
       'Bay Area girls: Grades 4-6', 'Bay Area girls: Grades 7-9',
       'Bea 1: a high school student', 'Bea 2: a college student',
       'Blind dreamers (F)', 'Blind dreamers (M)',
       'Robert Bosnak: A dream analyst', 'Chris: a transvestite',
       'Chuck: a physical scientist', 'College students, 1997-1998 (F)',
       'College students, 1997-1998 (M)',
       'Dahlia: concerns with appearance', 'David: teenage dreams',
       'Dorothea: 53 years of dreams', 'Ed: dreams of his late wife',
       'Edna: a blind woman', 'Elizabeth: a woman in her 40s',
       'Emma: 48 years of dreams', "Emma's Husband",
       'Esther: an adolescent girl', 'College women, late 1940s',
       'Izzy (ALL, including non-consecutive)',
       'Jasmine (ALL, including non-consecutive)',
       'Jeff: a lucid dreamer', 'Joan: a lesbian', 'Kenneth',
       'Lawrence, a youn

In [6]:
len(dreambank)

27952

In [7]:
dream_token = dreambank[['name', 'report']]
dream_token.tail(10)

Unnamed: 0,name,report
27942,Vietnam Vet: 2016-17 dreams,"With a community of men and several women, I'm..."
27943,Vietnam Vet: 2016-17 dreams,A man bearing a likeness to Donald Trump calls...
27944,Vietnam Vet: 2016-17 dreams,The sparse jungle offers little camouflage. Hi...
27945,Vietnam Vet: 2016-17 dreams,"On a desolate farm, in a dirt lot, a very dark..."
27946,Vietnam Vet: 2016-17 dreams,"At night, in a foreign land, I go from one res..."
27947,Vietnam Vet: 2016-17 dreams,In a town like the town I lived in from 2001 t...
27948,Vietnam Vet: 2016-17 dreams,"At night, as people leave, I'm standing in a f..."
27949,Vietnam Vet: 2016-17 dreams,"I'm in an airplane flying high over the earth,..."
27950,Vietnam Vet: 2016-17 dreams,"As an adult, I'm living at home with brother a..."
27951,Vietnam Vet: 2016-17 dreams,I'm part of a bombing mission over India. As w...


In [8]:
#dream_token.to_csv('dream_token.csv', index=False, header=True)

In [9]:
print(dreambank['report'][0])

The one at the Meads's house, where it's bigger inside than out; there's a European village just inside, with a cobblestone street and a Pied-Piper sort of man with curly hair, he can do things like juggle - I go up the back stairs [there aren't any in the real house] and then down the other side [since there's a second set, immediately] then down a short empty hallway that turns a corner, where I find a tiny room...a young woman with shoulder-length blonde hair in a pageboy is there, cooking at a stove that almost fills the room...she's nice to me. Now outside, I'm waiting for my aunt to pick me up - she arrives in a little round convertible and we go for a drive, not very far - we cross a little bridge over a creek, then double back and she drops me off at the house again. Inside (?) I sit with a couple of people, playing with a string of blue balloons.


In [10]:
emails = pd.read_csv('emails.csv')


In [11]:
print(len(emails))

517401


In [12]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd

In [13]:
# Functions to normalize the data

def load_csv_dataset(csv_path):
    df = pd.read_csv(csv_path)
    if 'name' not in df.columns or 'report' not in df.columns:
        raise ValueError("CSV must contain 'prompt' and 'result' columns.")

    conversations = df.apply(lambda row: f"User: {row['name']}\nBot: {row['report']}", axis=1)
    return Dataset.from_dict({"text": conversations.tolist()})


def tokenize_function(example, tokenizer):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

In [7]:
# Train the model (check this:)

#https://huggingface.co/docs/trl/sft_trainer
#https://huggingface.co/docs/datasets/loading
#https://medium.com/@prashanth.ramanathan/fine-tuning-a-pre-trained-gpt-2-model-and-performing-inference-a-hands-on-guide-57c097a3b810

def get_latest_checkpoint(output_dir):
    # Look for checkpoint folders like "checkpoint-500", "checkpoint-1000", etc.
    checkpoints = glob.glob(os.path.join(output_dir, "checkpoint-*"))
    if not checkpoints:
        return None
    # Sort by number and return latest
    checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]))
    return checkpoints[-1]
    
def train_model(csv_path, output_dir, model_name="openai-community/gpt2", epochs=3):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    latest_checkpoint = get_latest_checkpoint(output_dir)
    model = GPT2LMHeadModel.from_pretrained(latest_checkpoint if latest_checkpoint else model_name)

    dataset = load_csv_dataset(csv_path)
    tokenized_dataset = dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        num_train_epochs=epochs,
        save_total_limit=1,
        logging_steps=10,
        save_steps=500,
        fp16=True,
        report_to="none"
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    trainer.train(resume_from_checkpoint=latest_checkpoint if latest_checkpoint else None)
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)



In [None]:
import os
import glob

#csv_path = 'dream_token.csv'
#output_dir = 'trained_model_v1'
#train_model(csv_path, output_dir)

# De aqui en adelante emails


In [8]:

import os
import glob

def load_csv_dataset(csv_path):
    df = pd.read_csv(csv_path)
    if 'file' not in df.columns or 'message' not in df.columns:
        raise ValueError("CSV must contain 'prompt' and 'result' columns.")

    conversations = df.apply(lambda row: f"User: {row['file']}\nBot: {row['message']}", axis=1)
    return Dataset.from_dict({"text": conversations.tolist()})


def tokenize_function(example, tokenizer):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)


csv_path = 'emails_token.csv'
output_dir = 'trained_model_v2'
train_model(csv_path, output_dir)

Map:   0%|          | 0/517401 [00:00<?, ? examples/s]

  trainer = Trainer(
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Step,Training Loss
