In [57]:
# Install packages
# !conda install -c conda-forge plyvel


In [58]:
# import readme.md file (project overview) and display the markdown file here

from IPython.display import display, Markdown

with open('README.md', 'r') as f:
    content = f.read()

display(Markdown(content))


# ChatGPT analysis

In this notebook I deliver the foundation to analyse the personal ChatGPT conversations. After a few months of using ChatGPT, I have collected a large amount of data and want to analyse this data to get a better understanding of my own behaviour, reflect on old conversations and spark ideas on how to use it in the future.

Using the Superpower ChatGPT extension for Chrome you can automatically sync your conversations for offline usage. Since all your conversations are now stored locally, you can analyse the local database to get insights into your conversations.

## Introduction


- Superpower ChatGPT extension: https://github.com/saeedezzati/superpower-chatgpt


## Current state

- [x] Get data from local database to a pandas dataframe
    - [x] df_conversations
    - [x] df_messages

- [x] Streamlint Dashboard
    - [x] Conversation overview
    - [x] Message overview
    - [x] Message analysis

## Pull requests


## Import data


### Import libraries

In [59]:
import plyvel
import os
import json
import pandas as pd


### Import data from browser database

In [60]:

# Path to the leveldb directory (change this to your path - in my case I am using Brave Browser)
leveldb_path = '~/Library/Application Support/BraveSoftware/Brave-Browser/Default/Local Extension Settings/amhmeenmapldpjdedekalnfifgnpfnkc'
leveldb_path = os.path.expanduser(leveldb_path)  # Expand the '~' symbol to the user's home directory

# Enclose the path in quotes to handle spaces
leveldb_path = f'"{leveldb_path}"'

# Get the list of files in the leveldb directory

!ls -l $leveldb_path

# Copy all files to the current directory /db

!cp -r $leveldb_path ./db

# set leveldb path to the copied directory

leveldb_path = './db/amhmeenmapldpjdedekalnfifgnpfnkc'


total 106048
-rw-------@ 1 phil  staff   7720330 May 30 17:41 003033.ldb
-rw-------@ 1 phil  staff    131157 Jun  9 16:46 006828.ldb
-rw-------@ 1 phil  staff   7920197 Jun  9 22:35 007012.ldb
-rw-------@ 1 phil  staff    129191 Jun  9 22:35 007013.ldb
-rw-------@ 1 phil  staff   7882321 Jun  9 22:35 007015.ldb
-rw-------@ 1 phil  staff  22339962 Jun  9 22:36 007016.log
-rw-------@ 1 phil  staff   7755106 Jun  9 22:36 007017.ldb
-rw-------@ 1 phil  staff        16 May 28 21:05 CURRENT
-rw-------@ 1 phil  staff         0 May 28 21:05 LOCK
-rw-------@ 1 phil  staff     65298 Jun  9 22:36 LOG
-rw-------@ 1 phil  staff      1774 Jun  8 18:10 LOG.old
-rw-------@ 1 phil  staff    318794 Jun  9 22:36 MANIFEST-000001


### Initiate Database

In [61]:
db = plyvel.DB(leveldb_path)

# get database info

print(db.get_property(b'leveldb.stats'))


# filter all key entries that contain chat.openai.com 

for key, value in db:
    if b'chat.openai.com' in key:
        print(f'Key: {key}, Value: {value}')
        



b'                               Compactions\nLevel  Files Size(MB) Time(sec) Read(MB) Write(MB)\n--------------------------------------------------\n  0        3       22         0        0         7\n  1        2        8         0        0         0\n  2        2        7         0        0         0\n'


### Create dataframe

In [62]:

df = pd.DataFrame(columns=['key', 'value'])

for key, value in db:
    df = pd.concat([df, pd.DataFrame({'key': [key], 'value': [value]})], ignore_index=True)
    # df = df.append({'key': key, 'value': value}, ignore_index=True)



### Extract conversations

In [63]:
# read the value of key "conversations"  as json in a separate dataframe

df_conversations = pd.DataFrame(json.loads(df[df['key'] == b'conversations']['value'].values[0]))
# with column / lines transposed
df_conversations = df_conversations.T
# also export as text file
df_conversations.to_csv('df_conversations.txt', sep='\t', index=False)

# # add a column for the key value length
df['key_length'] = df['value'].str.len()

df.head()

# close the database
db.close()

### Extract messages

In [None]:

# drop all rows of df_conversations with empty mapping
df_conversations = df_conversations[df_conversations['mapping'].str.len() > 0]

# Create a new DataFrame for messages
df_messages = pd.DataFrame(columns=['id','conversation_id', 'title', 'message', 'role', 'create_time'])

# Iterate over the conversations and messages
for idx, conversation in df_conversations.iterrows():
    mapping = conversation['mapping']
    title = conversation['title']
    conversation_id = conversation['id']
    
    # Iterate over the messages in the mapping
    for message_id, message_data in mapping.items():
        if 'message' in message_data and message_data['message'] is not None:
            message = message_data['message']
            
            # Get the message text
            message_text = ''
            if 'content' in message and 'parts' in message['content']:
                message_parts = message['content']['parts']
                if len(message_parts) > 0:
                    message_text = message_parts[0]
            
            # Get the author role
            role = ''
            if 'author' in message and 'role' in message['author']:
                role = message['author']['role']
            
            # Get the create time
            create_time = ''
            if 'create_time' in message:
                create_time = message['create_time']
            
            # Append the message data to the DataFrame
            # df_messages = df_messages.append({
            #     'id': message_id,
            #     'title': title,
            #     'message': message_text,
            #     'role': role,
            #     'create_time': create_time
            # }, ignore_index=True)

            df_messages = pd.concat([df_messages, pd.DataFrame({
                'id': [message_id],
                'conversation_id': [conversation_id],
                'title': [title],
                'message': [message_text],
                'role': [role],
                'create_time': [create_time]
                })], ignore_index=True)
            


# Close the LevelDB database
db.close()

# Display the first few rows of the messages DataFrame
print(df_messages.head())




## Data Cleaning



In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# add more stop words in english and german
stop_words = stopwords.words('english')
# also exclude code words like < >, the, to, and =, a, in , of, for [ ] ( ) {} etc.
code_words = ['<', '>', 'the', 'to', 'and', '=', 'a', 'in', 'of', 'for', '[', ']', '(', ')', '{', '}', 'const', 'import', 'script', 'button', 'await', 'null', 'code', 'div', 'und', 'px', 'data', 'file', 'die', 'return', 'image', 'user', 'der', 'use', 'error', 'value', 'new', 'color', 'zu', 'create', 'using', 'component', 'add', 'false', 'object', 'template', 'name', 'da', 'also', 'app', 'example', 'span', 'für', 'width', 'mit', 'type', 'content', 'label', 'method', 'display', 'feedback', 'bike', 'rating', 'style', 'location', 'e', 'backgroundcolor', 'try', 'height', 'center', 'button', 'title', 'div', 'px', 'color', 'null', 'file', 'template', 'da', 'false', 'value', 'script', 'span', 'error', 'backgroundcolor', 'e', 'button', 'sie', 'ist', 'true', 'cycling', 'make', 'eine', 'class', 'default', 'auf', 'could', 'von', 'heres', 'like', 'name', 'function', 'used', 'i', 'need', 'async', 'based', 'label', 'data', 'advice', 'style', 'center', 'submission', 'infrastructure', 'set', 'model', 'property', 'width', 'type', 'id', 'ein', 'report', 'div', 'px', 'color', 'null', 'file', 'template', 'da', 'false', 'value', 'script', 'span', 'error', 'backgroundcolor', 'e', 'button', 'heres', 'name', 'i', 'label', 'data', 'style', 'center', 'width', 'type', 'index', 'text', 'vue', 'map', 'get', 'den', 'title', 'image', 'user', 'issue', 'array', 'f', 'column', 'comment', 'element', 'true', 'list', 'display', 'p', 'sure', 'im', 'result', 'height']

lemmatizer = WordNetLemmatizer()

df_messages['cleaned_text'] = df_messages['message'].str.lower()
# strip out words from stop_words and code_words from cleaned_text column
df_messages['cleaned_text'] = df_messages['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words + code_words)]))
df_messages['cleaned_text'] = df_messages['cleaned_text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
df_messages['cleaned_text'] = df_messages['cleaned_text'].str.replace('[^\w\s]','')
df_messages['cleaned_text'] = df_messages['cleaned_text'].str.replace('[\d+]','')

df_messages.head()

# based on df_messages['cleaned_text'] column create word frequency count 

from collections import Counter

# Create a list of all words in the messages
all_words = []

for idx, row in df_messages.iterrows():
    all_words.extend(row['cleaned_text'].split())

# Create a word frequency counter

word_freq = Counter(all_words)

# Display the 10 most common words

print(word_freq.most_common(10))

# Create a list of all words in the messages

all_words = []

# save the messages dataframe as txt

df_messages.to_csv('df_messages.txt', sep='\t', index=False)





[nltk_data] Downloading package stopwords to /Users/phil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/phil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  df_messages['cleaned_text'] = df_messages['cleaned_text'].str.replace('[^\w\s]','')
  df_messages['cleaned_text'] = df_messages['cleaned_text'].str.replace('[\d+]','')


[('div', 11351), ('px', 7869), ('color', 3109), ('null', 2996), ('file', 2646), ('template', 2567), ('da', 2412), ('false', 2331), ('value', 2319), ('script', 2258)]


In [None]:
# install pyvis and jinja2
# !pip install pyvis
# !pip install jinja2


# Dashboard

Initialize the streamlint dashboard

### Installations

In [None]:
# install streamlit
# !pip install streamlit

In [None]:
# install worldcloud
# !pip install wordcloud

In [None]:
# install networkx
# %pip install networkx

### Initialize Dashboard

In [None]:
# run streamlit app

code = """

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# read custom_stop_words from txt file

custom_stop_words = []
with open('custom_stop_words.txt', 'r') as f:
    custom_stop_words = f.read().splitlines()


# Read the chat logs data into a DataFrame
df_messages = pd.read_csv('df_messages.txt', sep='	')
df_messages['cleaned_text'] = df_messages['cleaned_text'].astype(str)  # Convert 'cleaned_text' column to string type
# filter out code words in cleaned_text column 
df_messages['cleaned_text'] = df_messages['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in custom_stop_words]))

# Process the chat messages to extract words
all_words = ' '.join(df_messages['cleaned_text']).lower().split()
all_words = [word for word in all_words if word not in custom_stop_words]
word_counts = Counter(all_words)
most_common_words = word_counts.most_common(200)  # Change the number as per your requirement

# Create a DataFrame for the most common words
df_common_words = pd.DataFrame(most_common_words, columns=['Word', 'Count'])

# Display the most common words in a bar chart
fig, ax = plt.subplots()
df_common_words.plot.bar(x='Word', y='Count', ax=ax)
plt.xlabel('Word')
plt.ylabel('Count')
plt.title('Most Common Words in Chat Logs')
plt.xticks(rotation=45)
st.pyplot(fig)

# Display the raw data of the most common words
st.write(df_common_words)

# Display the raw data of the chat logs
st.write(df_messages)

# display a word cloud
from wordcloud import WordCloud

# Create a word cloud

wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=200).generate(' '.join(df_messages['cleaned_text']))

# Display the generated image:
# the matplotlib way:

fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()
st.pyplot(fig)
"""

In [None]:
# Write the code to the dashboard.py file
with open('dashboard.py', 'w') as f:
    f.write(code)

In [None]:
# run streamlit app dashboard.py
!streamlit run dashboard.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://192.168.178.60:8502[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
^C
[34m  Stopping...[0m


In [None]:

# close the database
db.close()