# ChatGPT History Export Parser

A simple parser that converts your exported ChatGPT conversation history into a CSV file for personal analysis.

✔ Export your ChatGPT history and data following OpenAI's documentation [here](https://help.openai.com/en/articles/7260999-how-do-i-export-my-chatgpt-history-and-data).

-----

## Dependencies

In [19]:
# import argparse
import json
import os
import re
from collections import defaultdict
from typing import Any

from tqdm import tqdm

from datetime import datetime
# from datetime import date, datetime as dt, timedelta as td
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
# check python version
# import sys
# sys.version_info

-----

## Basic Configuration, Including File Paths for Conversations and Output

In [21]:
# Path to the conversations.json file from the ChatGPT data export; it is opened and parsed with json.load below.
json_filepath = 'data/conversations.json'

In [22]:
# Folder name for generated output.
# NOTE(review): no code visible in this section reads this value (the CSV below is written under data/);
# presumably it is meant to be passed as output_dir to the Markdown exporter - confirm.
out_folder = 'output'

-----

## Simple Data Check and Data Analysis

In [23]:
# Load the whole export into memory. The encoding is passed explicitly because
# open() otherwise falls back to the platform's locale encoding, which can
# mis-decode or reject non-ASCII message text on systems whose default is not UTF-8.
with open(json_filepath, "r", encoding="utf-8") as file:
    conversations = json.load(file)

In [24]:
# Uncomment to view single raw convo example
# conversations[14]

In [25]:
# Integer counter keyed by string; missing keys default to 0.
# NOTE(review): presumably intended to count repeated conversation titles, but nothing
# in the visible cells populates it - confirm whether it is still needed.
title_occurrences: defaultdict[str, int] = defaultdict(int)
# Number of top-level entries loaded from the export file.
total_conversations: int = len(conversations)

In [26]:
# Sanity check: report how many conversations were loaded.
print("Total Conversations =", total_conversations)

Total Conversations = 213


----

## Parser and Export to CSV

In [27]:
# Config: module-level settings read by the parser/exporter functions below.
user_name = "Me"  # label used for messages whose author role is "user"
assistant_name = "ChatGPT"  # label used for messages with any other author role
date_format = "%m-%d-%Y"  # strftime format for the conversation start date
file_name_format = "_{title}"  # template for Markdown file names; {title} is the sanitized conversation title
include_date = True  # when True, the Markdown exporter writes the conversation date at the top of each file
message_separator = "\n\n"  # text written after each entry in the Markdown output
skip_empty_messages = True  # when True, the Markdown exporter omits messages whose content is blank

In [28]:
# Debug aids: uncomment to inspect the container type, or to restrict the run to a single conversation.
# type(conversations)
# conversations = conversations[5:6]
len(conversations)  # displayed as the cell output: number of conversations about to be processed

213

In [29]:
def process_conversations_to_df(data):
    """Flatten exported conversations into a DataFrame with one row per message.

    Args:
        data: Iterable of conversation dicts. Each must provide ``"title"`` and
            ``"mapping"``. Every value in ``mapping`` is a node dict; nodes whose
            ``"message"`` entry is missing or None are ignored. A message must
            provide ``"author"["role"]`` and may provide ``"content"["parts"]``
            and ``"create_time"`` (a POSIX timestamp).

    Returns:
        pandas.DataFrame with the columns ``conversation_title``, ``author``,
        ``message`` and ``msg_date``. ``message`` holds the first entry of the
        message's ``parts`` list, or ``''`` when the message has no parts.
        ``msg_date`` is a ``'%Y-%m-%d %H:%M:%S'`` string in local time, or ``''``
        when the message has no timestamp. The columns are present even when
        ``data`` yields no messages.

    Raises:
        KeyError: If a conversation lacks ``"title"``/``"mapping"`` or a message
            lacks ``"author"``/``"role"``.
    """
    columns = ['conversation_title', 'author', 'message', 'msg_date']
    rows = []

    for conversation in tqdm(data, desc="Processing conversations"):
        title = conversation["title"]
        nodes = conversation["mapping"].values()

        # Keep only the nodes that actually carry a message.
        messages = [node["message"] for node in nodes if node.get("message") is not None]

        # Oldest first; messages without a timestamp sort to the front.
        messages.sort(
            key=lambda m: float('-inf') if m.get("create_time") is None else m["create_time"]
        )

        for message in messages:
            author_role = message["author"]["role"]
            author_name = user_name if author_role == "user" else assistant_name

            # Not every message is guaranteed to have a non-empty "parts" list;
            # fall back to an empty string instead of raising KeyError/IndexError.
            content = message.get("content") or {}
            parts = content.get("parts") or []
            msg_content = parts[0] if parts else ''

            created = message.get("create_time")
            msg_date = ''
            if created is not None:
                msg_date = datetime.fromtimestamp(created).strftime('%Y-%m-%d %H:%M:%S')

            rows.append({
                'conversation_title': title,
                'author': author_name,
                'message': msg_content,
                'msg_date': msg_date,
            })

    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

In [30]:
# process_conversations_to_df(conversations)

In [31]:
# Build the per-message DataFrame from all loaded conversations.
convo_msgs = process_conversations_to_df(conversations)

Processing conversations: 100%|████████████| 213/213 [00:00<00:00, 23102.24it/s]


In [32]:
# convo_msgs.head()

In [33]:
# Row count of the resulting DataFrame (one row per message).
len(convo_msgs)

2834

In [34]:
# Write all messages to CSV without the DataFrame's positional index column.
# `index` is documented as a bool, so pass False rather than relying on None being falsy.
convo_msgs.to_csv("data/chatgpt_messages.csv", index=False, encoding='utf-8')

----

## TODO: Exporter to Markdown Files per Conversation

In [35]:
# Pre-compiled pattern for disallowed characters in file names:
# < > : " / \ | ? * plus newline, carriage return, tab, form feed and vertical tab.
# NOTE(review): process_conversations_to_markdown below sanitizes titles with an
# isalnum()-based filter and does not use this pattern - confirm whether it is still needed.
DISALLOWED_CHARS_PATTERN = re.compile(r'[<>:"/\\|?*\n\r\t\f\v]')

In [36]:
def process_conversations_to_markdown(data, output_dir):
    """Write one Markdown file per conversation into ``output_dir``.

    The file name is ``file_name_format`` filled with the conversation title,
    reduced to alphanumerics, spaces and underscores (spaces then become
    underscores), plus the ``.md`` suffix. When ``include_date`` is set and the
    earliest message has a timestamp, the file starts with that date in
    ``date_format``. Each message is then written as
    ``**<author>** on <timestamp>: <content>`` followed by ``message_separator``;
    the `` on <timestamp>`` part is omitted for messages without a timestamp.
    Blank messages are skipped when ``skip_empty_messages`` is set.

    Args:
        data: Iterable of conversation dicts providing ``"title"`` and
            ``"mapping"``; mapping values are node dicts whose ``"message"``
            entry (when present and not None) provides ``"author"["role"]`` and
            may provide ``"content"["parts"]`` and ``"create_time"``.
        output_dir: Directory to write the files into; created if missing.

    Raises:
        KeyError: If a conversation lacks ``"title"``/``"mapping"`` or a message
            lacks ``"author"``/``"role"``.
        OSError: If the directory or a file cannot be created or written.
    """
    os.makedirs(output_dir, exist_ok=True)

    for conversation in tqdm(data, desc="Processing conversations"):
        # Guard against a null title so the sanitizer below does not fail on None.
        title = conversation["title"] or ""
        nodes = conversation["mapping"].values()

        # Keep only the nodes that actually carry a message.
        messages = [node["message"] for node in nodes if node.get("message") is not None]

        # Oldest first; messages without a timestamp sort to the front.
        messages.sort(
            key=lambda m: float('-inf') if m.get("create_time") is None else m["create_time"]
        )

        # Sanitize the title so it is a valid file name ('/' is already removed here).
        safe_title = ''.join(c for c in title if c.isalnum() or c in (' ', '_')).rstrip()
        file_name = f"{file_name_format.format(title=safe_title.replace(' ', '_'))}.md"
        # NOTE: conversations whose sanitized titles coincide write to the same
        # path, so the later one overwrites the earlier one.
        file_path = os.path.join(output_dir, file_name)

        with open(file_path, "w", encoding="utf-8") as f:
            started = messages[0].get("create_time") if messages else None
            if include_date and started is not None:
                date = datetime.fromtimestamp(started).strftime(date_format)
                f.write(f"<sub>{date}</sub>{message_separator}")

            for message in messages:
                author_role = message["author"]["role"]
                author_name = user_name if author_role == "user" else assistant_name

                # Not every message is guaranteed to have a non-empty "parts"
                # list, and a part is not guaranteed to be a string.
                parts = (message.get("content") or {}).get("parts") or []
                content = parts[0] if parts else ""
                if not isinstance(content, str):
                    content = str(content)

                if skip_empty_messages and not content.strip():
                    continue

                # Interpolate the real timestamp; the label is dropped entirely
                # when the message has none.
                created = message.get("create_time")
                stamp = ""
                if created is not None:
                    stamp = " on " + datetime.fromtimestamp(created).strftime('%Y-%m-%d %H:%M:%S')

                f.write(f"**{author_name}**{stamp}: {content}{message_separator}")