# train your own chatbot :P

this is a script that will walk you through training a chatbot on your imessage data. feel free to tune params / use a different model to customize to your needs

- start by running this command in your terminal. it'll dump all of your imessage data stored locally to a csv file called messages.csv we can use for training!!

In [None]:
sqlite3 -header -csv ~/Library/Messages/chat.db \
"SELECT
  m.ROWID AS message_id,
  c.chat_identifier AS contact,
  datetime(m.date/1000000000 + strftime('%s','2001-01-01'), 'unixepoch') AS timestamp,
  m.is_from_me,
  m.text
FROM message m
JOIN chat_message_join cmj ON m.ROWID = cmj.message_id
JOIN chat c ON cmj.chat_id = c.ROWID
WHERE m.text IS NOT NULL
ORDER BY m.date ASC;" > ~/Desktop/imessages.csv


importing all of our packages 😎

In [4]:
import pandas as pd
import unicodedata
import json
import re
import json
import torch
from torch.optim import AdamW
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi, HfFolder, Repository, upload_folder
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

mount your google drive!!! colab let's you use a T4 GPU which will make this go a lot faster

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## cleaning time!!!!
imessage data is messy so we're gonna remove anything that could prevent our model from learning patterns - puncuation, emojis, etc.
we're also going to make sure duplicate message pairs aren't added to our training set

In [7]:
## cleaning the text to free from weird puncuation, link prefixs, etc.
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)
    text = re.sub(r"[‘’´`ʼ]", "", text)
    text = text.replace("'", "")
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

## formatting - only including messages longer than 5 characters, and making sure there aren't any duplicates
def format_text(input_filename, output_filename):
    df = pd.read_csv(f"{input_filename}.csv")
    df['text'] = df['text'].apply(clean_text)
    df = df[df['text'].str.len() > 5]
    pairs = []
    seen_responses = set()

    for i in range(len(df) - 1):
        curr = df.iloc[i]
        nxt = df.iloc[i + 1]
        if (curr["is_from_me"] == 0 and nxt["is_from_me"] == 1) and (curr["contact"] == nxt["contact"]):
            response = nxt["text"].strip()
            if response not in seen_responses:
                pairs.append({
                    "instruction": curr["text"].strip(),
                    "response": response
                })
                seen_responses.add(response)

    print(f"Found {len(pairs)} unique instruction-response pairs")
    with open(output_filename, "w") as f:
        for p in pairs:
          # writing everything to a brand new JSON file
            f.write(json.dumps(p) + "\n")

In [20]:
df = pd.read_csv('/content/imessages.csv')
df.shape

(98073, 5)

In [21]:
## optional - only sampling the last 50k messages (most recent & accurate)
df = df.tail(50000)
df.to_csv('/content/sample_imessages.csv', index=False)

apply the functions!!

In [22]:
input_csv = "sample_imessages"
output_jsonl = "train.json"
format_text(input_csv, output_jsonl)
print(f"saved cleaned message pairs to: {output_jsonl}")

Found 4639 unique instruction-response pairs
saved cleaned message pairs to: train.json


check it outtt

In [23]:
df = pd.read_json('/content/train.json', lines=True)
df.shape

Unnamed: 0,instruction,response
0,Are u going to that pike thing this weekend,the wedding??
1,a darty?,no remember last year when everyone went to jo...
2,i remember it but like not who was there,i remember it looked like so much fun and i wa...
3,Is it fun or weird,ok i had no idea about it this year but last y...
4,Oh rrly,they called it like chapman coachella


In [25]:
# now we load and prepare the data with some more formatting (yawn)
def load_and_prepare_data(json_path):
    data = []
    with open(json_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    cleaned_data = []
    for item in data:
        # skip empty / very short messages
        if len(item['instruction'].strip()) < 3 or len(item['response'].strip()) < 3:
            continue
        instruction = item['instruction'].strip()
        response = item['response'].strip()
        # skip if too long
        if len(instruction) + len(response) > 400:  # 400 limit - feel free to djust
            continue
        # instruction response pairs help the model learn what it's response should be
        cleaned_data.append({
            'instruction': instruction,
            'response': response
        })

    return cleaned_data

In [26]:
# tokenizer function optimized for DialoGPT conversation format
def dialogpt_tokenizer_function(tokenizer, max_length=512):
    def tokenize_function(examples):
        conversations = []
        # using our instruction response samples form before
        for instruction, response in zip(examples['instruction'], examples['response']):
            # dialoGPT format: <|endoftext|>user_input<|endoftext|>bot_response<|endoftext|>
            conversation = f"{tokenizer.eos_token}{instruction}{tokenizer.eos_token}{response}{tokenizer.eos_token}"
            conversations.append(conversation)

        # tokenize!!!
        tokenized = tokenizer(
            conversations,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        # labels are the same as input_ids for causal language models
        tokenized["labels"] = tokenized["input_ids"].clone()

        return tokenized

    return tokenize_function

In [27]:
# using LORA configuration for DialoGPT
def setup_lora_model(model_name, device="cuda"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=[
            "c_attn",  # Self-attention
            "c_proj",  # Projection layer
            "c_fc",    # Feed-forward
        ],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    return model, tokenizer

In [28]:
class DialogueDataCollator:
    # data collator to handle conversation formatting
    def __init__(self, tokenizer, mlm=False):
        self.tokenizer = tokenizer
        self.mlm = mlm

    def __call__(self, examples):
        batch = {}
        max_length = max(len(ex['input_ids']) for ex in examples)

        input_ids = []
        attention_mask = []
        labels = []
        for example in examples:
            input_id = example['input_ids']
            mask = example['attention_mask']
            label = example['labels']

            padding_length = max_length - len(input_id)
            if padding_length > 0:
                input_id = input_id + [self.tokenizer.pad_token_id] * padding_length
                mask = mask + [0] * padding_length
                label = label + [-100] * padding_length

            input_ids.append(input_id)
            attention_mask.append(mask)
            labels.append(label)

        batch['input_ids'] = torch.tensor(input_ids, dtype=torch.long)
        batch['attention_mask'] = torch.tensor(attention_mask, dtype=torch.long)
        batch['labels'] = torch.tensor(labels, dtype=torch.long)

        return batch


In [29]:
# training time!!! this is a pipeline that will go through each step
def train_dialogpt_model(data_path, model_name="microsoft/DialoGPT-medium", output_dir="./dialogpt-finetuned"):

    # load
    print("Loading and preparing data...")
    data = load_and_prepare_data(data_path)
    print(f"Loaded {len(data)} examples")

    # split
    train_data, eval_data = train_test_split(data, test_size=0.1, random_state=42)

    # make dataset
    train_dataset = Dataset.from_list(train_data)
    eval_dataset = Dataset.from_list(eval_data)

    # model + tokenizer
    model, tokenizer = setup_lora_model(model_name)

    # tokenization function
    tokenize_fn = dialogpt_tokenizer_function(tokenizer, max_length=512)

    # tokenize datasets
    train_dataset = train_dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=train_dataset.column_names,
        desc="Tokenizing train dataset"
    )

    eval_dataset = eval_dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=eval_dataset.column_names,
        desc="Tokenizing eval dataset"
    )

    # Data collator
    data_collator = DialogueDataCollator(tokenizer)

    # training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        num_train_epochs=10,
        learning_rate=1e-4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=50,
        save_steps=500,
        eval_steps=500,
        eval_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=3,
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=42,
        do_eval=True
    )

    # dataset is large training can take 4ever - implemnent early stopping
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,  # stops training if there's no improvement for 3 steps
        early_stopping_threshold=0.008
    )

    # trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[early_stopping_callback]
    )

    # train!!!!
    print("Starting training...")
    trainer.train()

    # save to output directory
    trainer.save_model(f"{output_dir}/final")
    tokenizer.save_pretrained(f"{output_dir}/final")

    print(f"Training completed! Model saved to {output_dir}/final")

    return model, tokenizer


In [30]:
# run itttttttt
if __name__ == "__main__":
    model, tokenizer = train_dialogpt_model(
        data_path= "/content/train.json",
        model_name= "microsoft/DialoGPT-medium",
        output_dir="/content/drive/MyDrive/dialogpt-imessage-finetuned" # save to google drive!!!!
    )

Loading and preparing data...
Loaded 4621 examples


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]



trainable params: 6,291,456 || all params: 361,114,624 || trainable%: 1.7422


Tokenizing train dataset:   0%|          | 0/4158 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/463 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
500,0.1953,0.177862
1000,0.1604,0.155765
1500,0.1463,0.147665
2000,0.1376,0.1446
2500,0.1456,0.142943
3000,0.1427,0.142207


Training completed! Model saved to /content/drive/MyDrive/dialogpt-imessage-finetuned/final


In [31]:
# load it back up for testing
model_path = "/content/drive/MyDrive/dialogpt-imessage-finetuned/final"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

In [32]:
# test function to generate a response!!
def generate_response(model, tokenizer, user_input, max_length=200, temperature=0.8):
    """Generate response using the fine-tuned model"""
    # formatted for DialoGPT input
    formatted_input = f"{tokenizer.eos_token}{user_input}{tokenizer.eos_token}"
    inputs = tokenizer.encode(formatted_input, return_tensors="pt")
    input_length = inputs.shape[1]

    # generate!!!!
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1
        )

    # exclude input & only decode the response
    new_tokens = outputs[0][input_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return response.strip()

In [50]:
# test with an input :)
test_input = "what are you wearing"
response = generate_response(model, tokenizer, test_input)
print(f"Input: {test_input}")
print(f"Response: {response}")

Input: what are you wearing
Response: i dont know i just woke up


In [None]:
# push your model to huggingface!!!!
upload_folder(
    repo_id="paigecaskey/chatbot-me",
    folder_path="./diablogpt-imessage",
    repo_type="model"
)

woooooo all done!!!