In [4]:
%%time
import ijson
import pandas as pd
from pathlib import Path

# Export year to convert; it selects both the input JSON file and the output folder.
year = 2025
file_path = f"new_amplitude_export_{year}.json"

# Columns written to the CSV files. Per the original note these are the
# non-empty columns of the export; every other column is dropped.
columns_keep = [
    "$insert_id",
    "amplitude_id",
    "app",
    "city",
    "client_event_time",
    "client_upload_time",
    "country",
    "data",
    "data_type",
    "device_family",
    "device_id",
    "device_type",
    "dma",
    "event_id",
    "event_properties",
    "event_time",
    "event_type",
    "language",
    "library",
    "os_name",
    "os_version",
    "platform",
    "processed_time",
    "region",
    "server_received_time",
    "server_upload_time",
    "session_id",
    "user_id",
    "user_properties",
    "uuid",
]

# Folder that receives this year's CSV batches. mkdir(exist_ok=True) is already
# a no-op when the directory exists, so no separate exists() check is needed.
# If the name is taken by a regular file it raises FileExistsError right here
# instead of failing later inside to_csv.
path = Path(f"{year}_csv")
path.mkdir(parents=True, exist_ok=True)
# Stream the export with ijson so the JSON array is parsed item by item instead
# of being loaded whole, and write it out as CSV files of `batch_size` rows each.
# NOTE(review): the per-part conversion cell further down repeats this logic;
# consider moving it into one shared function.
batch_size = 100000  # rows per output CSV; can be updated
output_stem = Path(file_path).stem  # input file name without directory or ".json"


def iter_batches(items, size):
    """Yield lists of up to `size` consecutive items from `items`.

    Every yielded list holds exactly `size` items except possibly the last one,
    which holds the remainder. Nothing is yielded for an empty iterable.
    """
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) >= size:
            yield batch
            batch = []
    if batch:  # remaining items that did not fill a whole batch
        yield batch


# Open in binary mode: ijson parses bytes and handles the JSON encoding itself.
# Text mode would decode with the locale's default encoding, which is not
# guaranteed to be UTF-8.
with open(file_path, "rb") as f:
    objects = ijson.items(f, "item")  # lazy generator over the top-level array's elements
    for count, chunk in enumerate(iter_batches(objects, batch_size)):
        # The file name carries the nominal row range of the batch; the last
        # file may contain fewer rows than its name suggests.
        first_row = count * batch_size
        output_csv = path / f"{output_stem}_chunk_{first_row}_{first_row + batch_size}.csv"
        # reindex instead of df[columns_keep]: a batch in which no record has
        # one of the expected keys gets an empty column rather than a KeyError,
        # so every CSV ends up with the same header.
        df = pd.DataFrame(chunk).reindex(columns=columns_keep)
        df.to_csv(output_csv, index=False)


CPU times: user 2min 1s, sys: 38.4 s, total: 2min 40s
Wall time: 3min 13s


In [5]:
%%time
import ijson
import pandas as pd
from pathlib import Path

# Export part (chunk number) to convert; it selects both the input JSON file
# and the output folder.
part = 1
file_path = f"new_export/amplitude_export_chunk_{part}_anonymized.json"

# Columns written to the CSV files. Per the original note these are the
# non-empty columns of the export; every other column is dropped.
columns_keep = [
    "$insert_id",
    "amplitude_id",
    "app",
    "city",
    "client_event_time",
    "client_upload_time",
    "country",
    "data",
    "data_type",
    "device_family",
    "device_id",
    "device_type",
    "dma",
    "event_id",
    "event_properties",
    "event_time",
    "event_type",
    "language",
    "library",
    "os_name",
    "os_version",
    "platform",
    "processed_time",
    "region",
    "server_received_time",
    "server_upload_time",
    "session_id",
    "user_id",
    "user_properties",
    "uuid",
]

# Folder that receives this part's CSV batches. mkdir(exist_ok=True) is already
# a no-op when the directory exists, so no separate exists() check is needed.
# If the name is taken by a regular file it raises FileExistsError right here
# instead of failing later inside to_csv.
path = Path(f"{part}_csv")
path.mkdir(parents=True, exist_ok=True)

# Stream the export with ijson so the JSON array is parsed item by item instead
# of being loaded whole, and write it out as CSV files of `batch_size` rows each.
# NOTE(review): the yearly conversion cell further up repeats this logic;
# consider moving it into one shared function.
batch_size = 100000  # rows per output CSV; can be updated
# File name without directory or ".json". Path.stem does not depend on how many
# directory levels precede the file, unlike splitting on "/" and taking index 1.
output_stem = Path(file_path).stem


def iter_batches(items, size):
    """Yield lists of up to `size` consecutive items from `items`.

    Every yielded list holds exactly `size` items except possibly the last one,
    which holds the remainder. Nothing is yielded for an empty iterable.
    """
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) >= size:
            yield batch
            batch = []
    if batch:  # remaining items that did not fill a whole batch
        yield batch


# Open in binary mode: ijson parses bytes and handles the JSON encoding itself.
# Text mode would decode with the locale's default encoding, which is not
# guaranteed to be UTF-8.
with open(file_path, "rb") as f:
    objects = ijson.items(f, "item")  # lazy generator over the top-level array's elements
    for count, chunk in enumerate(iter_batches(objects, batch_size)):
        # The file name carries the nominal row range of the batch; the last
        # file may contain fewer rows than its name suggests.
        first_row = count * batch_size
        output_csv = path / f"{output_stem}_subchunk_{first_row}_{first_row + batch_size}.csv"
        df = pd.DataFrame(chunk)
        # Only the final, partially filled batch reports its shape before and
        # after the column selection (same output as the previous version).
        is_partial_batch = len(chunk) < batch_size
        if is_partial_batch:
            print(df.shape)
        # reindex instead of df[columns_keep]: a batch in which no record has
        # one of the expected keys gets an empty column rather than a KeyError,
        # so every CSV ends up with the same header.
        df = df.reindex(columns=columns_keep)
        if is_partial_batch:
            print(df.shape)
        df.to_csv(output_csv, index=False)


(83027, 54)
(83027, 30)
CPU times: user 54 s, sys: 16.6 s, total: 1min 10s
Wall time: 1min 19s


In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# matplotlib is a third-party package: if the import above fails, install it
# into the kernel's environment (for example with `%pip install matplotlib`).

# NOTE(review): numpy used to be imported under the name `new_export`, which
# looks like an autocomplete slip. The alias is kept only in case a later cell
# still references it; delete it once that is confirmed unused.
new_export = np

# First sub-chunk (nominal rows 0-100000) that the part-1 conversion cell writes.
PATH = '1_csv/amplitude_export_chunk_1_anonymized_subchunk_0_100000.csv'
df = pd.read_csv(PATH)
df.head()

Unnamed: 0,$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,data,data_type,device_family,...,os_version,platform,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,uuid
0,b5aa91c6-ac17-4bfe-8def-df50869540b0,857540442426,591532,Covington,2024-06-14 23:06:34.898000,2024-06-14 23:06:35.998000,United States,"{'path': '/2/httpapi', 'group_first_event': {}...",event,Windows,...,125,Web,2024-06-14 23:06:36.809000,Georgia,2024-06-14 23:06:35.998000,2024-06-14 23:06:36.007000,1718399623706,01708ccf-437b-44ed-b5a0-2fed8f7761d3,"{'initial_utm_medium': 'EMPTY', 'initial_refer...",f10e3860-18f7-4e43-93d9-29bda5edb636
1,dc6c6dd1-8c20-4faf-8996-5fc532120b81,857540442426,591532,Covington,2024-06-14 23:06:34.926000,2024-06-14 23:06:35.998000,United States,"{'path': '/2/httpapi', 'group_first_event': {}...",event,Windows,...,125,Web,2024-06-14 23:06:36.809000,Georgia,2024-06-14 23:06:35.998000,2024-06-14 23:06:36.007000,1718399623706,01708ccf-437b-44ed-b5a0-2fed8f7761d3,"{'initial_utm_medium': 'EMPTY', 'initial_refer...",38e1d5ef-96c5-4a37-bd51-dcc9808b4c00
2,d4f63da1-2385-4683-b387-06b7e75d908b,857540442426,591532,Covington,2024-06-14 23:06:34.929000,2024-06-14 23:06:35.998000,United States,"{'path': '/2/httpapi', 'group_first_event': {}...",event,Windows,...,125,Web,2024-06-14 23:06:36.809000,Georgia,2024-06-14 23:06:35.998000,2024-06-14 23:06:36.007000,1718399623706,01708ccf-437b-44ed-b5a0-2fed8f7761d3,"{'initial_utm_medium': 'EMPTY', 'initial_refer...",39f12443-51ce-493e-a66a-19052bdbabd6
3,3bc7a189-4bc6-4fe7-a839-4c5c5da0e7e6,857540442426,591532,Covington,2024-06-14 23:06:34.929000,2024-06-14 23:06:35.998000,United States,"{'path': '/2/httpapi', 'group_first_event': {}...",event,Windows,...,125,Web,2024-06-14 23:06:36.809000,Georgia,2024-06-14 23:06:35.998000,2024-06-14 23:06:36.007000,1718399623706,01708ccf-437b-44ed-b5a0-2fed8f7761d3,"{'initial_utm_medium': 'EMPTY', 'initial_refer...",01776264-d664-4fce-8ce1-71b97ddba191
4,a1093d16-e49c-4006-aea7-1c04d5e668ca,857540442426,591532,Covington,2024-06-14 23:06:34.944000,2024-06-14 23:06:35.998000,United States,"{'path': '/2/httpapi', 'group_first_event': {}...",event,Windows,...,125,Web,2024-06-14 23:06:36.809000,Georgia,2024-06-14 23:06:35.998000,2024-06-14 23:06:36.007000,1718399623706,01708ccf-437b-44ed-b5a0-2fed8f7761d3,"{'initial_utm_medium': 'EMPTY', 'initial_refer...",963e97cb-3c8a-4efa-b52d-8713ca9b8459
