In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

model_name = "microsoft/phi-3-mini-4k-instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)

  from .autonotebook import tqdm as notebook_tqdm
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.94s/it]


In [2]:
model = PeftModel.from_pretrained(base_model, "./humaniser_lora")
model.eval()

print("Loaded LoRA model (not merged)")

Loaded LoRA model (not merged)


In [27]:
# %%
import re

def humanise(text: str, max_sentences=3) -> str:
    prompt = f"""### Neutral:
{text}
### Humanised:
"""
    
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,  # Increased from 30
            do_sample=True,
            temperature=0.4,
            top_p=0.85,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            use_cache=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
    
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    if "### Humanised:" in decoded:
        result = decoded.split("### Humanised:", 1)[1].strip()
        result = result.split("###")[0].strip()
        
        # Take up to max_sentences instead of just first sentence
        sentences = result.split(".")
        result = ".".join(sentences[:max_sentences]).strip()
        
        # Remove emojis
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE)
        result = emoji_pattern.sub('', result).strip()
        
        # Replace em dashes with semicolons
        result = result.replace('—',' - ')
        
        if result and not result.endswith('.'):
            result += "."
        return result
    
    return decoded.strip()

In [4]:
print(humanise("You still have the trained model in memory from training. You need to restart your kernel first to clear GPU memory."))

You are not running the flash-attention implementation, expect numerical differences.


We’re not resetting our system—we just keep that well-trained brain we built up and stored into RAM, waiting for us when it comes time to use him/her). First you should run “Kernel > Restart &


In [5]:
# Test sentences
test_sentences = [
    
    "The weather forecast indicates rain throughout the weekend.",
    "You may experience minor delays during peak hours."
]

# Test them
for sentence in test_sentences[:5]:  # Test first 5

    print(f"{humanise(sentence)}")

We’re going to have a rainy weekend according to my phone app.
There might be slight waiting at rush hour, though.


In [13]:
random_tests = [
    "The package has arrived.",
    "I received your message.",
    "That is interesting.",
    "The meeting was cancelled.",
    "I will be there soon.",
    "This is correct.",
    "The price increased.",
    "That makes sense.",
    "I need more time.",
    "The update is ready."
]

for test in random_tests:
    print(f"Input: {test}")
    print(f"Output: {humanise(test)}")
    print("---")

Input: The package has arrived.
Output: It’s here—the box came today!.
---
Input: I received your message.
Output: Sent you a DM lol.
---
Input: That is interesting.
Output: Really? That’s cool!.
---
Input: The meeting was cancelled.
Output: We’re not having the call then.
---
Input: I will be there soon.
Output: Nah, I’ll see you in a bit then.
---
Input: This is correct.
Output: Yep, you’re right.
---
Input: The price increased.
Output: It went up in price though… I guess we’ll see how long it lasts now that the prices have gone back down :/.
---
Input: That makes sense.
Output: Makes perfect sense, thanks!.
---
Input: I need more time.
Output: Give me a minute, please.
---
Input: The update is ready.
Output: Update coming up now!.
---


In [6]:
print(humanise("He paused for a moment, rereading the message, and then smiled slightly as he realized how carefully the words had been chosen."))

For one heartbeat I thought it was him laughing at me—re-reading that beautifully written line in his voice made my knees shake with laughter. He smiles like this every time; you should see us together som
