# Preprocessing PDFS to Extract Text

In [2]:
import csv
import os
import re
from itertools import zip_longest
from PIL import Image
Image.MAX_IMAGE_PIXELS = None
from pdf2image import convert_from_path
import pytesseract
import math
import pandas as pd

In [3]:
#Input Needed:
pdf_path = 'FriendMessage.pdf'
output_path = 'FriendMessage.csv'
total_pages = 29
pages_per_chunk = 5

#scan pdf images, and separate Left side for prompts, and right side for completions
def convert_pdf_to_images(pdf_path, start_page, end_page):
    return convert_from_path(pdf_path, dpi=200, first_page=start_page, last_page=end_page)

def ocr_core(image):
    text = pytesseract.image_to_string(image)
    return text

def split_into_blocks(text):
    blocks = re.split('\n\s*\n', text)
    blocks = [block.strip() for block in blocks]
    blocks = [block for block in blocks if block]

    merged_blocks = []
    current_block = []
    for block in blocks:
        if block.endswith('\n'):
            current_block.append(block)
        else:
            current_block.append(block)
            merged_blocks.append(' '.join(current_block))
            current_block = []
    if current_block:
        merged_blocks.append(' '.join(current_block))

    return merged_blocks

def filter_text_blocks(left_blocks, right_blocks, patterns):
    new_left_blocks = []
    new_right_blocks = []
    seen_left_blocks = set()  # a set to keep track of the seen left blocks
    for i, right_block in enumerate(right_blocks):
        if right_block and not any(re.search(pattern, right_block) for pattern in patterns):
            if i < len(left_blocks):  # to ensure we don't go out of index
                left_block = left_blocks[i]
                if left_block and not any(re.search(pattern, left_block) for pattern in patterns):  # only if there's text on the left
                    if left_block not in seen_left_blocks:  # only if left block has not been seen before
                        new_left_blocks.append(left_block)
                        new_right_blocks.append(right_block)
                        seen_left_blocks.add(left_block)  # add the seen left block to the set
    return new_left_blocks, new_right_blocks


num_chunks = math.ceil(total_pages / pages_per_chunk)
texts = []

skip_patterns = [
    r'JPEG|(?:__)[A-Z0-9]+-[A-Z0-9]+-[A-Z0-9]+.jpeg|[0-9]{7,}|IMG_[0-9]{4}.png|PNG Image - [0-9].[0-9] MB|PTV eX-%\) ica\) JPEG Image - [0-9]{2} KB',
    'Sent as Text Message',
    r'\b[a-z0-9-]+(\.[a-z0-9-]+)+\b',
    r'@\)']

for chunk in range(num_chunks):
    start_page = chunk * pages_per_chunk + 1
    end_page = min((chunk + 1) * pages_per_chunk, total_pages)
    images = convert_pdf_to_images(pdf_path, start_page, end_page)

    for image in images:
        width, height = image.size
        left_image = image.crop((0, 0, 0.45*width, height))
        right_image = image.crop((0.55*width, 0, width, height))

        left_text = ocr_core(left_image)
        right_text = ocr_core(right_image)

        left_blocks = split_into_blocks(left_text)
        right_blocks = split_into_blocks(right_text)

        filtered_left_blocks, filtered_right_blocks = filter_text_blocks(left_blocks, right_blocks, skip_patterns)
        texts.extend(zip_longest(filtered_left_blocks, filtered_right_blocks, fillvalue=''))

with open(output_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['prompt', 'completion'])
    writer.writerows(texts)

# Fine-Tuning

In [4]:
import openai
import pandas as pd
import string

In [5]:
#paste path to original excel data in Google Drive
file_path = 'FriendMessage.csv'
df = pd.read_csv(file_path)

In [6]:
df

Unnamed: 0,prompt,completion
0,I don't want to\r\nI want to sleep but I don’t...,Not if u don’t want to
1,Does that make sense,I getit
2,I have 1 hour till movie over,But it’s good to plan your sleep
3,O wait\r\nI meant to say u,Me too
4,Not I,Oh m gee
...,...,...
373,_ I like a squeezable Mansi,I used to be a bad bitch
374,_ This one,no
375,It's hot 2 me,Hot Mansi better Mansi
376,_ Go sleepy weepy,No its not


In [7]:
common_suffix_separator = "\n\n###\n\n"
df["prompt"] = df["prompt"].apply(lambda x: x.strip() + common_suffix_separator)

In [8]:
common_suffix_ending = "###END###"
df["completion"] = df["completion"].apply(lambda x: x.strip() + common_suffix_ending)


In [9]:
df["completion"] = df["completion"].apply(lambda x: " " + x.strip())

In [10]:
#name your cleaned up excel file (recommend pasting the same filepath, and adding a "p" for pandas)
df.to_csv('Text-Message-Data-p.csv', index=False)

In [11]:
import csv
import json

# Define input and output file paths
input_csv_file = 'Text-Message-Data-p.csv'
output_jsonl_file = 'Text-Message-Data-p_prepared.jsonl'

# Read CSV file and convert to JSONL
with open(input_csv_file, 'r', newline='') as csvfile, open(output_jsonl_file, 'w') as jsonlfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Convert row to JSON
        json_data = json.dumps(row)
        
        # Write JSON data to JSONL file
        jsonlfile.write(json_data + '\n')

print("CSV file converted to JSONL format.")


CSV file converted to JSONL format.


## OpenAI Python client library to create a fine-tune file for training a model

In [168]:
from openai import OpenAI

client = OpenAI(api_key = "put your api key here")

In [94]:
#Create fine-tune file and put in Google Drive
def create_training_file(file_path):
  file = client.files.create(
      file=open(file_path, "rb"),
      purpose="fine-tune"
  )
  return file

#copy training file path above, and paste here, ending in .jsonl
training_file = create_training_file("Text-Message-Data-p_prepared.jsonl")
print(training_file)

FileObject(id='file-HuNy1CGpsY6xavmSkMYxjwuK', bytes=38592, created_at=1710283430, filename='Text-Message-Data-p_prepared.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [97]:
training_file.id

'file-HuNy1CGpsY6xavmSkMYxjwuK'

In [111]:
fine_tuned_model = client.fine_tuning.jobs.create(training_file=training_file.id,
                    model ='davinci-002',
                    hyperparameters={
                    "n_epochs": 5
                    }
                )

In [113]:
print(fine_tuned_model)

FineTuningJob(id='ftjob-eZpWj6pQi12gd1QWKSOqMus7', created_at=1710284365, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=5, batch_size='auto', learning_rate_multiplier='auto'), model='davinci-002', object='fine_tuning.job', organization_id='org-kCgMIHV36ylNWQhRzwaOXvkd', result_files=[], status='validating_files', trained_tokens=None, training_file='file-HuNy1CGpsY6xavmSkMYxjwuK', validation_file=None, user_provided_suffix=None)


In [115]:
job_id = fine_tuned_model.id
status = fine_tuned_model.status

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {fine_tuned_model}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ftjob-eZpWj6pQi12gd1QWKSOqMus7.
Training Response: FineTuningJob(id='ftjob-eZpWj6pQi12gd1QWKSOqMus7', created_at=1710284365, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=5, batch_size='auto', learning_rate_multiplier='auto'), model='davinci-002', object='fine_tuning.job', organization_id='org-kCgMIHV36ylNWQhRzwaOXvkd', result_files=[], status='validating_files', trained_tokens=None, training_file='file-HuNy1CGpsY6xavmSkMYxjwuK', validation_file=None, user_provided_suffix=None)
Training Status: validating_files


In [139]:
import signal
import datetime


def signal_handler(sig, frame):
    status = client.fine_tuning.jobs.retrieve(job_id).status
    print(f"Stream interrupted. Job is still {status}.")
    return


print(f"Streaming events for the fine-tuning job: {job_id}")

signal.signal(signal.SIGINT, signal_handler)

events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id)
try:
    for event in events:
        print(
            f'{datetime.datetime.fromtimestamp(event.created_at)} {event.message}'
        )
except Exception:
    print("Stream interrupted (client disconnected).")

Streaming events for the fine-tuning job: ftjob-eZpWj6pQi12gd1QWKSOqMus7
2024-03-12 16:09:34 The job has successfully completed
2024-03-12 16:09:31 New fine-tuned model created: ft:davinci-002:personal::925XP6lM
2024-03-12 16:09:02 Step 1801/1890: training loss=1.79
2024-03-12 16:08:35 Step 1701/1890: training loss=0.50
2024-03-12 16:08:03 Step 1601/1890: training loss=1.30
2024-03-12 16:07:35 Step 1501/1890: training loss=2.29
2024-03-12 16:07:05 Step 1401/1890: training loss=1.29
2024-03-12 16:06:38 Step 1301/1890: training loss=3.20
2024-03-12 16:06:10 Step 1201/1890: training loss=3.05
2024-03-12 16:05:43 Step 1101/1890: training loss=3.67
2024-03-12 16:05:15 Step 1001/1890: training loss=2.82
2024-03-12 16:04:48 Step 901/1890: training loss=0.90
2024-03-12 16:04:18 Step 801/1890: training loss=3.09
2024-03-12 16:03:51 Step 701/1890: training loss=5.05
2024-03-12 16:03:23 Step 601/1890: training loss=3.39
2024-03-12 16:02:56 Step 501/1890: training loss=1.16
2024-03-12 16:02:28 Ste

## Interacting with Fine-tuned Model

#### Example 1

In [38]:
from openai import OpenAI

In [196]:
new_prompt = "What do you want to eat?"
answer = client.completions.create(
  model="ft:davinci-002:personal::925XP6lM",
  prompt=new_prompt
)
print("Harry:", new_prompt)
print("Mansi:",answer.choices[0].text)

Harry: What do you want to eat?
Mansi:  Either we can get pizza or like, what about your tacos?" Or something like


In [197]:
print("Harry: What do you want to eat?")
print("Mansi:  Either we can get pizza or like, what about your tacos?")

Harry: What do you want to eat?
Mansi:  Either we can get pizza or like, what about your tacos?


#### Example 2

In [260]:
new_prompt = "What do you think about politics? And what's your views or is it just not for you?"
answer = client.completions.create(
  model="ft:davinci-002:personal::925XP6lM",
  prompt=new_prompt
)
print("Harry:", new_prompt)
print("Mansi:",answer.choices[0].text)

Harry: What do you think about politics? And what's your views or is it just not for you?
Mansi:  Not interested?

It's something I stay away from.

Why? It's very


In [262]:
print("Harry: What do you think about politics? And what's your views or is it just not for you?")
print("Mansi: Not interested? It's something I stay away from.")

Harry: What do you think about politics? And what's your views or is it just not for you?
Mansi: Not interested? It's something I stay away from.


#### Example 3

In [281]:
new_prompt = "Whats your favourite kind of music?"
answer = client.completions.create(
  model="ft:davinci-002:personal::925XP6lM",
  prompt=new_prompt
)
print("Harry:", new_prompt)
print("Mansi:",answer.choices[0].text)

Harry: Whats your favourite kind of music?
Mansi:  Mine is alternative - https://www.youtube.com/watch?v=MOEdFPa


In [283]:
print("Harry: Whats your favourite kind of music?")
print("Mansi: Mine is alternative.")

Harry: Whats your favourite kind of music?
Mansi: Mine is alternative.
