### Read chat.db into a CSV

In [12]:
import sqlite3
import pandas as pd

# Export the messages stored in chat.db to ../data/chat_grouped.csv.
#
# The query joins message -> chat -> chat_handle_join -> handle, so it returns
# one row per (message, participant handle of the chat). A message in a chat
# with several handles therefore appears once per handle; those duplicates are
# collapsed further down for rows that carry a room_name.
conn = sqlite3.connect('../data/chat.db')
try:
    cursor = conn.cursor()
    cursor.execute('''
        SELECT chat.ROWID, message.date, handle.id, chat.room_name, message.text, message.is_from_me
        FROM
        (message INNER JOIN chat_message_join ON message.ROWID = chat_message_join.message_id
                    INNER JOIN chat ON chat.ROWID = chat_message_join.chat_id
                        INNER JOIN chat_handle_join ON chat.ROWID = chat_handle_join.chat_id
                            INNER JOIN handle ON handle.ROWID = chat_handle_join.handle_id)
        ORDER BY chat.ROWID, message.date
    ''')

    # Materialise the result while the connection is still open
    rows = cursor.fetchall()
finally:
    # Always release the database handle, even if the query raised
    conn.close()

# Create a DataFrame from the fetched rows
df = pd.DataFrame(rows, columns=['ROWID', 'date', 'handle_id', 'room_name', 'text', 'is_from_me'])

# Rows with a room_name (presumably group chats -- TODO confirm) are repeated
# once per participant; rows without one are passed through unchanged.
df_with_room_name = df[df['room_name'].notna()]
df_without_room_name = df[df['room_name'].isna()]

# Collapse the per-participant duplicates into a single row whose handle_id is
# the comma-separated list of participants.
# NOTE: groupby drops rows whose key columns are NaN by default, so room_name
# messages without text are not carried into df_grouped.
df_grouped = df_with_room_name.groupby(
    ['text', 'room_name', 'ROWID', 'date', 'is_from_me'], as_index=False
).agg({'handle_id': ', '.join})

df_final = pd.concat([df_grouped, df_without_room_name])

# Sort the final DataFrame by chat, then by message date
df_sorted = df_final.sort_values(['ROWID', 'date'])

# Save the DataFrame as a CSV file.
# Column order follows the first frame passed to concat:
# text, room_name, ROWID, date, is_from_me, handle_id
df_sorted.to_csv('../data/chat_grouped.csv', index=False)

### CSV -> JSONL for OpenAI fine-tuning

In [19]:
# Build (incoming message, my reply) pairs and save them to chat_ready.csv.
#
# Work on the de-duplicated, chat-ordered frame with a fresh 0..n-1 index so
# that "row - 1" really is the previous message in sort order. The raw `df`
# holds one row per chat participant, so its previous row is often a copy of
# the same message.
messages = df_sorted.reset_index(drop=True)

# A row qualifies as a reply when I sent it ...
is_mine = messages['is_from_me'] == 1
# ... the row right before it belongs to the same chat ...
same_chat = messages['ROWID'].shift(1) == messages['ROWID']
# ... and that previous row was sent by someone else.
# (Row 0 has no predecessor; shift() yields NaN there, so it never qualifies.)
previous_is_incoming = messages['is_from_me'].shift(1) == 0

reply_positions = messages.index[is_mine & same_chat & previous_is_incoming]

# Interleave the rows as prompt, reply, prompt, reply, ... so that each
# consecutive pair of rows in the output is one exchange.
pair_positions = [pos for reply in reply_positions for pos in (reply - 1, reply)]
df_final = messages.iloc[pair_positions]

# Save the final DataFrame as a CSV file
df_final.to_csv('../data/chat_ready.csv', index=False)

In [26]:
from dotenv import load_dotenv
import os

# Load variables from the .env file into the process environment
load_dotenv()

# Read the system prompt that is prepended to every training conversation.
sys_prompt = os.getenv('SYS_PROMPT')

# os.getenv returns None for a missing variable; fail here with a clear message
# instead of letting an empty system prompt flow into the training data.
if not sys_prompt:
    raise RuntimeError("SYS_PROMPT is not set; define it in .env or the environment")

You are N, a 20 year old college student at Duke University. Respond to these texts in the diction and phrasing of a college student (casual). Be nice and concise using texting language. I usually text like this: if someone says 'hey', i'll say 'what's up'


In [28]:
import json

# Turn df_final into chat-format training examples and write them as JSONL.
conversations = []

# Rows are consumed in consecutive pairs: row i becomes the "user" message and
# row i + 1 the "assistant" reply. Stopping at len - 1 means a trailing
# unpaired row is ignored instead of raising IndexError.
for i in range(0, len(df_final) - 1, 2):
    message_1 = df_final.iloc[i]
    message_2 = df_final.iloc[i + 1]

    # Skip pairs with missing text: json.dumps would emit the bare token NaN,
    # which is not valid JSON.
    if pd.isna(message_1['text']) or pd.isna(message_2['text']):
        continue

    conversation = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": message_1['text']},
        {"role": "assistant", "content": message_2['text']},
    ]
    conversations.append(conversation)

# One JSON object per line. The chat fine-tuning format expects each example
# to be wrapped as {"messages": [...]}, not written as a bare list.
with open('../data/conversations.jsonl', 'w', encoding='utf-8') as f:
    for conversation in conversations:
        f.write(json.dumps({"messages": conversation}) + '\n')

### Send to OpenAI for finetuning!

In [None]:
from dotenv import load_dotenv
import os

# Load variables from the .env file into the process environment
load_dotenv()

# Read the OpenAI API key from the OPENAI_KEY environment variable
openai_api_key = os.getenv('OPENAI_KEY')

# os.getenv returns None for a missing variable; stop here with a clear
# message rather than failing later inside the API client.
if not openai_api_key:
    raise RuntimeError("OPENAI_KEY is not set; define it in .env or the environment")

In [None]:
from openai import OpenAI

# Upload the training data and start a fine-tuning job.
api = OpenAI(api_key=openai_api_key)

# A fine-tuning job references its training data by uploaded-file id, so the
# JSONL file has to be uploaded first with purpose="fine-tune". Each line of
# the file must be a chat example of the form {"messages": [...]}.
with open('../data/conversations.jsonl', 'rb') as training_data:
    training_file = api.files.create(file=training_data, purpose='fine-tune')

# The training data is in chat format, so the base model must be a chat model;
# text-davinci-003 is a completion model and cannot be used here.
# NOTE(review): confirm this model is on the currently fine-tunable list.
# Only epochs and batch size carry over from the earlier settings; stop,
# max_tokens and n_examples are not fine-tuning job parameters.
response = api.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-3.5-turbo",
    hyperparameters={"n_epochs": 3, "batch_size": 1},
)