In [42]:
import os
import re
import time
import numpy as np

from ollama import chat
from sentence_transformers import SentenceTransformer

In [43]:
def chat_with_model(prompt, sysprompt=None, model='llama3.2', get_tok_sec=False):
    """Send a single-turn prompt to an Ollama model and return the reply with timing stats.

    Args:
        prompt: Text sent as the user message.
        sysprompt: Optional system prompt; when truthy it is sent ahead of the
            user message.
        model: Model tag forwarded to ``chat``.
        get_tok_sec: When True, also report output tokens per second.

    Returns:
        dict with the keys:
            'message': the reply text (``response.message.content``).
            'input_length': ``response.prompt_eval_count`` as reported by the server.
            'gen_time_sec': seconds elapsed around the ``chat`` call, rounded to
                2 places. This spans the whole call, not only token generation.
            'tok_sec': ``eval_count / elapsed`` rounded to 2 places when
                ``get_tok_sec`` is True (0 if the token count is missing or no
                time elapsed); otherwise None.
    """
    messages = []
    if sysprompt:
        messages.append({'role': 'system', 'content': sysprompt})
    messages.append({'role': 'user', 'content': prompt})

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments that happen while the model is generating.
    start_time = time.perf_counter()
    response = chat(
        model=model,
        messages=messages
    )
    generation_time = time.perf_counter() - start_time

    if get_tok_sec:
        # eval_count may be None on the response object; count that as zero
        # tokens instead of raising TypeError in the division below.
        token_count = response.eval_count or 0
        tokens_per_second = token_count / generation_time if generation_time > 0 else 0
        tokens_per_second = round(tokens_per_second, 2)
    else:
        tokens_per_second = None

    return {
        'message': response.message.content,
        'input_length': response.prompt_eval_count,
        'gen_time_sec': round(generation_time, 2),
        'tok_sec': tokens_per_second
    }

def load_and_split_text(filename, chunk_size=5000):
    """Read a text file from ``texts/keeps`` and cut it into fixed-size pieces.

    Before splitting, every underscore is removed and each lone newline (one
    with no newline directly before or after it) is replaced by a space, so
    only blank-line breaks survive.

    Args:
        filename: Name of the file inside ``texts/keeps``.
        chunk_size: Maximum number of characters per piece.

    Returns:
        List of consecutive substrings of the normalized text; the last one may
        be shorter than ``chunk_size``. Splits fall on character offsets, so a
        piece can start or end mid-word.
    """
    source_path = os.path.join('texts', 'keeps', filename)
    with open(source_path, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    # Drop underscores first, then unwrap single line breaks into spaces.
    unwrapped = re.sub(r'(?<!\n)\n(?!\n)', ' ', raw.replace('_', ''))

    pieces = []
    for start in range(0, len(unwrapped), chunk_size):
        pieces.append(unwrapped[start:start + chunk_size])
    return pieces

In [44]:
# prompts = [
#     "How are you today?",
# ]
# system_prompt = "You are a professor of philosophy who is exceptional at summarizing longer works into the underlying philosophical themes. When you summarize, you describe in a clear and concise manner that fully conveys the argument of the work to other philosophers. Your responses are not for the average person, but are incredibly clear to the philosophical expert."
# # Can choose between 'llama3.2' (~50 tok/s) and 'llama3.2:1b' (~90 tok/s)
# model = 'llama3.2:1b'

# speeds = []
# for prompt in prompts:
#     response = chat_with_model(prompt, sysprompt=system_prompt, model=model, get_tok_sec=True)
#     print(f"{prompt}\n{'='*100}\n{response['message']}\n\n\n")
#     speeds.append(response['tok_sec'])

## Cleaning Up Online Texts

In [45]:
# Sentence-embedding model; the cleaning loop uses it to compare each original
# chunk against the LLM's rewritten version.
emb_model = SentenceTransformer('all-MiniLM-L6-v2')
# Encodes a placeholder sentence. `embeddings` is not read by any cell visible
# here — presumably a smoke test / warm-up call; confirm before removing.
embeddings = emb_model.encode(["Your longer text here"])

In [48]:
files = [
    'montaigne_ofexperience.txt'
]

system_prompt = "You are an expert at cleaning text files, and I will be giving you partial text from Montaigne's essay 'Of Experience'. For your task, you must modernize the spelling. You must also remove any non-english, as he likes to write in Latin at times. Other than making these two changes, do not change anything else. Do not add anything that was not previously there. Do not say anything else. Do not tell me you cleaned it. Do not ask if I need anything else."
# Can choose between 'llama3.2' (~50 tok/s) and 'llama3.2:1b' (~90 tok/s)
model = 'llama3.2:1b'

# Tunables for this cleaning run.
CHUNK_SIZE = 1000             # characters per chunk sent to the model
MAX_ATTEMPTS = 5              # tries per chunk before keeping the original text
SIMILARITY_THRESHOLD = 0.85   # minimum embedding dot product to accept a response
REPORT_EVERY = 5              # chunks summarized per progress line
MAX_REPORTS = 6               # trial-run cap: stop after this many progress lines

total_refusals = 0
# Defined up front so the summary line below still works when `files` is empty.
text_chunks, cleaned_chunks = [], []

for file in files:
    text_chunks = load_and_split_text(file, chunk_size=CHUNK_SIZE)
    cleaned_chunks = []
    lengths, speeds = [], []   # per-window stats, reset after each progress line
    c, refusals = 0, 0         # c: progress lines printed; refusals: rejected attempts this window
    for i, chunk in enumerate(text_chunks):
        og_emb = emb_model.encode(chunk)
        for _ in range(MAX_ATTEMPTS):
            response = chat_with_model(chunk, sysprompt=system_prompt, model=model, get_tok_sec=True)
            response_emb = emb_model.encode(response['message'])
            # Accept the rewrite only if it stays close to the source chunk.
            # NOTE(review): a raw dot product equals cosine similarity only if
            # the model returns unit-length vectors — confirm for this model.
            if og_emb @ response_emb.T > SIMILARITY_THRESHOLD:
                break
            refusals += 1
            # Count every rejected attempt immediately so the grand total does
            # not lose attempts from a trailing, unreported window.
            total_refusals += 1
        else:
            # Every attempt was rejected: keep the original chunk untouched.
            # Its speed is recorded as 0, which lowers the window average.
            response = {'message': chunk, 'tok_sec': 0}

        cleaned_chunks.append(response['message'])
        lengths.append(len(response['message']))
        speeds.append(response['tok_sec'])
        if (i+1) % REPORT_EVERY == 0:
            print(f"[{i+1:>4}/{len(text_chunks):>4}]: Avg. Length: {sum(lengths)/len(lengths):.2f} | Avg. Speed: {sum(speeds)/len(speeds):.2f} tok/s | Refusals: {refusals}")
            lengths, speeds, refusals = [], [], 0
            c += 1
            if c >= MAX_REPORTS:
                break
    # Only the first file is processed in this trial run; `cleaned_chunks` holds
    # a single file's output, so lift this together with collecting per-file results.
    break

print(f"Cleaned {len(cleaned_chunks)} inputs and saw {total_refusals} refusals")

[   5/ 148]: Avg. Length: 990.60 | Avg. Speed: 89.82 tok/s | Refusals: 0
[  10/ 148]: Avg. Length: 971.60 | Avg. Speed: 91.14 tok/s | Refusals: 3
[  15/ 148]: Avg. Length: 1014.80 | Avg. Speed: 90.77 tok/s | Refusals: 1
[  20/ 148]: Avg. Length: 979.60 | Avg. Speed: 69.86 tok/s | Refusals: 6
[  25/ 148]: Avg. Length: 989.00 | Avg. Speed: 70.20 tok/s | Refusals: 5
[  30/ 148]: Avg. Length: 874.60 | Avg. Speed: 90.08 tok/s | Refusals: 2
Cleaned 30 inputs and saw 17 refusals


In [51]:
# Spot-check one chunk: the original text, a divider, then the cleaned version.
i = 3
print(text_chunks[i], '-'*100, cleaned_chunks[i], sep='\n')

 with one, to which hee may so exactly joyne and match it, but some circumstance and diversitie will remaine, that may require a diverse consideration of judgement. There is but little relation betweene our actions, that are in perpetuall mutation, and the fixed and unmooveable lawes. The most to be desired, are the rarest, the simplest and most generall. And yet I believe, it were better to have none at all, then so infinite a number as we have. Nature gives them ever more happy, then those we give our selves. Witnesse the image of the golden age that Poets faine; and the state wherein we see diverse nations to live, which have no other. Some there are, who to decide any controversie, that may rise amongest them, will chuse for judge the first man that by chance shall travell alongst their mountaines: Others, that upon a market day will name some one amongst themselves, who in the place without more wrangling shall determine all their questions. What danger would ensue, if the wisest 