# Setup
Set directory locations and load python settings. Common imports, defaults etc.

In [1]:
import sys
import pandas as pd
import re

# Directory that holds settings.py and the packages/ folder, relative to the
# notebook's working directory.
BASE_DIR = ".."

# IPython magic: execute the shared settings script inside this notebook's namespace,
# so whatever it imports or defines becomes available to the cells below.
%run "$BASE_DIR/settings.py"

# %reload_ext autoreload
# %autoreload 2
# %config InlineBackend.figure_format = 'png'

# to print output of all statements and not just the last
# (this is why later cells echo the value of every bare expression statement)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



# path to import blueprints packages
# (the `nounsai` imports in later cells presumably resolve from here — TODO confirm)
sys.path.append(BASE_DIR + '/packages')

Load every message from the legacy chat logs (all channels), ordered by creation time

In [2]:
from nounsai.db import execute_query

# Fetch every legacy chat message, oldest first. This is a plain string on purpose:
# nothing is interpolated into the SQL, so an f-string prefix would only risk
# accidental substitution if braces ever appear in the query.
get_logs_query = """SELECT author, content, created_timestamp, discord_message_id, discord_channel_id
                             FROM legacy_chatlogs
                             ORDER BY created_timestamp"""
all_logs = execute_query(get_logs_query)

# NOTE(review): later cells address columns by name ('content', 'author', ...), which
# assumes execute_query returns records keyed by column name (e.g. dicts) — confirm.
channel_logs_df = pd.DataFrame(all_logs)

Clean up content into a new column named `clean_content`

In [3]:
from nounsai.preparation import replace_empty_strings, clean_content, strip_emojis, prepare

# Cleaning steps, handed to `prepare` together with each raw message.
pipeline = [strip_emojis, replace_empty_strings, clean_content]

# Build the cleaned column and drop rows without usable text in a single chain instead
# of mutating the frame with inplace=True, so the cell's effect is one explicit
# assignment.
# NOTE(review): `progress_apply` only exists after tqdm has been registered with pandas
# (tqdm.pandas()); presumably settings.py does that — confirm.
channel_logs_df = (
    channel_logs_df
    .assign(clean_content=lambda df: df['content'].progress_apply(prepare, pipeline=pipeline))
    .dropna(subset=['clean_content'])
)

  0%|          | 0/148304 [00:00<?, ?it/s]

In [4]:
def format_log(log):
    """Format one chat-log row as ``MM-DD-YYYY@HH:MM USER: CONTENT``.

    The timestamp layout matters: ``find_first_and_last_timestamps`` locates these
    stamps with a regex and the summary loop parses them with
    ``'%m-%d-%Y@%H:%M'``, so any change here must be mirrored there.

    Parameters
    ----------
    log : mapping (dict or pandas row)
        Must provide ``created_timestamp`` (an object exposing integer ``month``,
        ``day``, ``year``, ``hour`` and ``minute`` attributes), ``author`` (str) and
        ``clean_content``.

    Returns
    -------
    str
        The formatted line. Month, day, hour and minute are zero-padded to two digits,
        the year is written as-is, and the author is cut at the first ``#``.
    """
    stamp = log['created_timestamp']

    # Keep only the part of the author before the first '#'.
    user = log['author'].split('#')[0]

    return (
        f"{stamp.month:02d}-{stamp.day:02d}-{stamp.year}"
        f"@{stamp.hour:02d}:{stamp.minute:02d} {user}: {log['clean_content']}"
    )

In [5]:
from nounsai.utils import num_tokens_from_string

channel_id = '877215225093451776'

# Select the rows for the channel of interest.
# NOTE(review): this compares against a string; if discord_channel_id is stored as an
# integer column the mask is all False and no rows are selected — confirm the dtype.
channel_logs = channel_logs_df[channel_logs_df['discord_channel_id'] == channel_id]

# Only one channel is left after the filter, so a plain chronological sort replaces the
# former groupby(...).apply(sort_values): same row order, no MultiIndex and no
# groupby-apply deprecation warning. kind='stable' keeps the incoming order for
# messages that share a timestamp. The variable name is kept for any later cell that
# still refers to it.
grouped_logs = channel_logs.sort_values('created_timestamp', kind='stable')

# One formatted "MM-DD-YYYY@HH:MM USER: CONTENT" line per message, in time order.
clean_logs = [format_log(row) for _, row in grouped_logs.iterrows()]

Test with LangChain's TokenTextSplitter



In [6]:
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
# Token-based chunking of the whole transcript: chunks of 3000 tokens, with 100 tokens
# shared between neighbouring chunks.
# NOTE(review): the summary loop requests up to 1000 completion tokens per chunk; check
# that chunk + prompt template + completion fit the model's context window.
text_splitter = TokenTextSplitter(chunk_size=3000, chunk_overlap=100)
#text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=3700, chunk_overlap=50)

# The splitter takes one string, so the formatted lines are joined newline-separated.
full_transcript = '\n'.join(clean_logs)
texts = text_splitter.split_text(full_transcript)

In [7]:
def find_first_and_last_timestamps(chat_log):
    """Return the first and last ``MM-DD-YYYY@HH:MM`` stamps that occur in ``chat_log``.

    Parameters
    ----------
    chat_log : str
        Text made of lines produced by ``format_log`` (possibly a chunk that starts or
        ends mid-line).

    Returns
    -------
    tuple[str, str]
        ``(first_timestamp, last_timestamp)`` exactly as they appear in the text. Both
        are the same string when only one stamp is present.

    Raises
    ------
    IndexError
        If no timestamp is found, e.g. for an empty chunk or a chunk that lies entirely
        inside one very long message.
    """
    pattern = r"\d{2}-\d{2}-\d{4}@\d{2}:\d{2}"
    matches = re.findall(pattern, chat_log)
    if not matches:
        # Same exception type the bare matches[0] lookup used to raise, but with a
        # message that says what is actually wrong.
        raise IndexError("no MM-DD-YYYY@HH:MM timestamp found in chat log chunk")
    return (matches[0], matches[-1])

In [8]:
def open_file(filepath):
    """Read ``filepath`` as UTF-8 text and return its entire contents as one string."""
    with open(filepath, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

In [9]:

import openai
import os
from time import time,sleep
import re
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Hand the key to the openai client; this is None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

def gpt3_completion(prompt, engine='text-davinci-003', temp=0.0, top_p=1.0, tokens=250, freq_pen=0.0, pres_pen=0.0, stop=['<<END>>']):
    """Request a completion from ``openai.Completion.create`` and return the cleaned text.

    Parameters
    ----------
    prompt : str
        Prompt sent to the model.
    engine, temp, top_p, tokens, freq_pen, pres_pen, stop
        Forwarded to ``openai.Completion.create`` as ``engine``, ``temperature``,
        ``top_p``, ``max_tokens``, ``frequency_penalty``, ``presence_penalty`` and
        ``stop``. The ``stop`` default list is only passed through, never mutated.

    Returns
    -------
    str
        Text of the first choice, stripped, with every whitespace run collapsed to a
        single space.

    Raises
    ------
    RuntimeError
        When the API call has failed five times in a row; the last underlying error is
        chained as ``__cause__``.

    Notes
    -----
    Each prompt/response pair is also written to ``gpt3_logs/<time>_gpt3.txt`` under
    the current working directory. Logging happens after the retry loop and is
    best-effort: a failed write prints a warning but neither repeats the API call nor
    discards the completion.
    """
    max_retry = 5
    retry = 0
    while True:
        try:
            response = openai.Completion.create(
                engine=engine,
                prompt=prompt,
                temperature=temp,
                max_tokens=tokens,
                top_p=top_p,
                frequency_penalty=freq_pen,
                presence_penalty=pres_pen,
                stop=stop)
            text = response['choices'][0]['text'].strip()
            break
        except Exception as oops:
            retry += 1
            if retry >= max_retry:
                raise RuntimeError("GPT3 error: %s" % oops) from oops
            print('Error communicating with OpenAI:', oops)
            sleep(2)

    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)

    # Persist the exchange outside the retry loop so that a logging problem can never
    # trigger another API request.
    try:
        log_dir = os.path.join(os.getcwd(), 'gpt3_logs')
        os.makedirs(log_dir, exist_ok=True)
        log_path = os.path.join(log_dir, '%s_gpt3.txt' % time())
        with open(log_path, 'w', encoding='utf-8') as outfile:
            outfile.write('PROMPT:\n\n' + prompt + '\n\n==========\n\nRESPONSE:\n\n' + text)
    except OSError as log_error:
        print('Could not write GPT3 log file:', log_error)
    return text

True

In [15]:
from datetime import datetime
# Timestamp layout written by format_log, and the layout stored with each summary.
log_stamp_format = '%m-%d-%Y@%H:%M'
stored_stamp_format = '%Y-%m-%d %H:%M:%S'

# The template does not change between chunks, so read it once.
prompt_template = open_file('chatlog_summary.txt')

# Rows are collected in a plain list and turned into a frame once after the loop:
# DataFrame.append no longer exists in pandas 2.x and growing a frame row by row is
# quadratic. If the loop is interrupted, the rows gathered so far remain in
# `summary_rows`.
summary_rows = []

for text in texts:
    start_date, end_date = find_first_and_last_timestamps(text)
    start_date = datetime.strptime(start_date, log_stamp_format).strftime(stored_stamp_format)
    end_date = datetime.strptime(end_date, log_stamp_format).strftime(stored_stamp_format)
    prompt = prompt_template.replace('<<CHAT_LOGS>>', text)
    # Drop every non-ASCII character from the prompt before sending it.
    prompt = prompt.encode(encoding='ASCII',errors='ignore').decode()
    print(f"\nchannel: {channel_id}\nstart_date: {start_date}\nend_date: {end_date}")
    response = gpt3_completion(prompt, temp=0.0, tokens=1000, freq_pen=0.0)
    summary_rows.append({'start_date': start_date, 'end_date': end_date, 'summary': response, 'channel': channel_id})

# One summary per chunk, indexed by the chunk's first and last timestamp.
df_summaries = pd.DataFrame(
    summary_rows, columns=['start_date', 'end_date', 'summary', 'channel']
).set_index(['start_date', 'end_date'])



channel: 855538581434007635

start_date: 2021-06-18 16:05:00

end_date: 2021-06-21 14:32:00


KeyboardInterrupt: 

In [14]:
import sqlite3

# Append the summaries to the `summaries` table of the local SQLite file.
# try/finally guarantees the connection is closed even when to_sql raises.
# NOTE(review): if_exists='append' means re-running this cell inserts the same rows again.
conn = sqlite3.connect('summaries.db')
try:
    rows_written = df_summaries.to_sql('summaries', conn, if_exists='append')
finally:
    conn.close()

# Echo what to_sql returned (a row count in recent pandas), as the bare call used to do.
rows_written


1

In [11]:
# %%capture
# with open('output.txt', 'w') as f:
#     for log in channel_logs:
#         # Extract the month, day, hour, and minute from the created_timestamp
#         month = str(log['created_timestamp'].month).zfill(2)
#         day = str(log['created_timestamp'].day).zfill(2)
#         hour = str(log['created_timestamp'].hour).zfill(2)
#         minute = str(log['created_timestamp'].minute).zfill(2)

#         user = log['author'].split('#')[0]
#         # Remove newlines and carriage returns from the content
#         content = "\n".join([line.strip() for line in log['content'].split("\n") if line.strip()])

#         # Format the log line with the month, day, hour, minute, author, and content
#         log_line = f"{month}{day}@{hour}:{minute} {user}: {content}"

#         # Write the log line to the file
#         f.write(log_line + '\n')