In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
# read servers index file
with open("package/servers/index.json", "r") as read_file:
    servers_index = json.load(read_file)


In [3]:
# build dim_servers table
dim_server = pd.DataFrame.from_dict(servers_index, orient="index")

# rename columns
dim_server.reset_index(inplace=True)
dim_server.columns = ["server_id", "server_name"]

In [4]:
# check data types
dim_server.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   server_id    68 non-null     object
 1   server_name  68 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB


In [5]:
# convert types
dim_server["server_id"] = dim_server["server_id"].astype("Int64")
dim_server.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   server_id    68 non-null     Int64 
 1   server_name  68 non-null     object
dtypes: Int64(1), object(1)
memory usage: 1.3+ KB


In [6]:
# write to csv
dim_server.to_csv(
    "dim_server.csv",
    index=False,
)

In [7]:
del dim_server
del read_file
del servers_index


In [8]:
# read messages index file
with open("package/messages/index.json", "r") as read_file:
    messages_index = json.load(read_file)


In [9]:
# build dim_channel table
dim_channel = pd.DataFrame.from_dict(messages_index, orient="index")

# rename columns
dim_channel.reset_index(inplace=True)
dim_channel.columns = ["channel_id", "channel_name"]

In [10]:
# add channel_type_key, server_id
channel_type_key_list = []
server_id_list = []

for (key, value) in messages_index.items():
    # create file path
    file_path_head = "package/messages/c" + key
    json_file_path = file_path_head + "/channel.json"

    # key is channel_id
    # value is channel_name

    # read json file
    with open(json_file_path, "r") as read_file:
        json_file = json.load(read_file)
        channel_type_key = json_file["type"]
        try:
            server_id = json_file["guild"]["id"]
        except:
            server_id = None

    channel_type_key_list.append(channel_type_key)
    server_id_list.append(server_id)

dim_channel["channel_type_key"] = channel_type_key_list
dim_channel["server_id"] = server_id_list

In [11]:
# check data types
dim_channel.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   channel_id        941 non-null    object
 1   channel_name      699 non-null    object
 2   channel_type_key  941 non-null    int64 
 3   server_id         317 non-null    object
dtypes: int64(1), object(3)
memory usage: 29.5+ KB


In [12]:
# convert types
dim_channel["channel_id"] = dim_channel["channel_id"].astype("Int64")
dim_channel["channel_type_key"] = dim_channel["channel_type_key"].astype(
    "Int64")
dim_channel["server_id"] = dim_channel["server_id"].astype("Int64")
dim_channel.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   channel_id        941 non-null    Int64 
 1   channel_name      699 non-null    object
 2   channel_type_key  941 non-null    Int64 
 3   server_id         317 non-null    Int64 
dtypes: Int64(3), object(1)
memory usage: 32.3+ KB


In [13]:
# write to csv
dim_channel.to_csv("dim_channel.csv", index=False)


In [14]:
del channel_type_key
del channel_type_key_list
del dim_channel
del file_path_head
del json_file
del json_file_path
del key
del read_file
del server_id
del server_id_list
del value


In [15]:
# build fact_messages table
col_names = ["message_id", "timestamp", "contents", "attachment_link"]
fact_messages = pd.DataFrame(columns=col_names)

for (key, value) in messages_index.items():
    # create file path
    file_path_head = "package/messages/c" + key
    csv_file_path = file_path_head + "/messages.csv"

    # read csv file
    with open(csv_file_path, "rb") as read_file:
        df_temp = pd.read_csv(read_file)

    df_temp.columns = col_names

    # add channel_id col with key
    df_temp["channel_id"] = pd.Series(dtype="int64")
    df_temp["channel_id"].fillna(key, inplace=True)

    # append to main fact_messages file
    fact_messages = pd.concat([fact_messages, df_temp])

In [16]:
fact_messages.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 126092 entries, 0 to 2
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   message_id       126092 non-null  object
 1   timestamp        126092 non-null  object
 2   contents         123823 non-null  object
 3   attachment_link  2655 non-null    object
 4   channel_id       126092 non-null  object
dtypes: object(5)
memory usage: 5.8+ MB


In [17]:
# convert types
fact_messages["message_id"] = fact_messages["message_id"].astype("Int64")
fact_messages["channel_id"] = fact_messages["channel_id"].astype("Int64")
fact_messages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126092 entries, 0 to 2
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   message_id       126092 non-null  Int64 
 1   timestamp        126092 non-null  object
 2   contents         123823 non-null  object
 3   attachment_link  2655 non-null    object
 4   channel_id       126092 non-null  Int64 
dtypes: Int64(2), object(3)
memory usage: 6.0+ MB


In [18]:
fact_messages.to_csv("fact_messages.csv", index=False)


In [19]:
del col_names
del csv_file_path
del df_temp
del fact_messages
del file_path_head
del key
del messages_index
del read_file
del value


In [20]:
# create dim_channel_type
channel_type_key_list = [0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15]
channel_type_list = [
    "Server Channel",
    "DM",
    "Server Voice Text Channel",
    "Group DM",
    "Server Category",
    "Server Annoucements",
    "Server Announcment Thread",
    "Server Channel Public Thread",
    "Server Channel Private Thread",
    "Stage Voice Channel",
    "Server Channel Directory",
    "Server Forum",
]
dim_channel_type = pd.DataFrame(
    data={"channel_type_key": channel_type_key_list,
          "channel_type": channel_type_list}
)


In [21]:
dim_channel_type

Unnamed: 0,channel_type_key,channel_type
0,0,Server Channel
1,1,DM
2,2,Server Voice Text Channel
3,3,Group DM
4,4,Server Category
5,5,Server Annoucements
6,10,Server Announcment Thread
7,11,Server Channel Public Thread
8,12,Server Channel Private Thread
9,13,Stage Voice Channel


In [22]:
dim_channel_type.to_csv("dim_channel_type.csv", index=False)

In [23]:
del channel_type_key_list
del channel_type_list
del dim_channel_type


In [24]:
# create fact_analytics

json_list = []

with open(
    "package/activity/analytics/events-2022-00000-of-00001.json", "r", encoding="utf-8"
) as read_file:
    for json_obj in read_file:
        dict_temp = json.loads(json_obj)
        json_list.append(dict_temp)

In [25]:
# break into multiple lists
breakdown_list = []
chunk_size = 250000
list_len = len(json_list)

for i in range(0, list_len, chunk_size):
    if i + chunk_size < list_len:
        temp_list = json_list[i:i + chunk_size]
    else:
        temp_list = json_list[i:list_len]
    breakdown_list.append(temp_list)

In [26]:
del json_list
del dict_temp
del json_obj
del read_file


In [27]:
# store each list as its own csv file
length = len(breakdown_list)
for i in range(1, length+1, 1):
    list = breakdown_list.pop(0)
    df_temp = pd.DataFrame.from_dict(list)
    file_name = 'fact_analytics' + str(i) + '.csv'
    df_temp.to_csv(file_name, index=False)


In [28]:
del breakdown_list
del chunk_size
del df_temp
del file_name
del i
del length
del list
del list_len
del temp_list


In [29]:
# create fact_modeling
json_list = []

with open(
    "package/activity/modeling/events-2022-00000-of-00001.json", "r", encoding="utf-8"
) as read_file:
    for json_obj in read_file:
        dict_temp = json.loads(json_obj)
        json_list.append(dict_temp)

In [30]:
fact_modeling = pd.DataFrame.from_dict(json_list, orient="columns")

In [31]:
fact_modeling.head()

Unnamed: 0,event_type,event_id,event_source,user_id,domain,freight_hostname,ip,day,chosen_locale,detected_locale,...,num_guilds_recommended,num_guilds_popular,recommended_guild_ids,category_id,static_route,quantity,payment_modal_version,feed_item_type,active_events_shown,upcoming_events_shown
0,send_message,AQEEeMaeN7LgGi+m6iTXOLDyUgAdP/4=,api,342346882800025600,Modeling,discord-api-7fc4cb59bd-6scxz,103.253.105.0,1855,en-GB,en-GB,...,,,,,,,,,,
1,send_message,AQEElawFMUYV3IILbLXWUN3aWgAzKRI=,api,342346882800025600,Modeling,api-prd-main-sj45,14.192.208.0,1393,en-GB,en-GB,...,,,,,,,,,,
2,send_message,AQEEn79YxgtyTW7p/U/KFZVVzgE+KL8=,api,342346882800025600,Modeling,api-prd-main-p15l,14.192.214.0,1271,en-GB,en-GB,...,,,,,,,,,,
3,send_message,AQEEnywJE8wbsdRf4V5DtWgU2wDS3hM=,api,342346882800025600,Modeling,api-prd-main-4krc,14.192.214.0,1267,en-GB,en-GB,...,,,,,,,,,,
4,send_message,AQEExt4rlm/9JkoZb0Nl2xTIKQAITZI=,api,342346882800025600,Modeling,api-prd-main-d693,14.192.217.0,1473,en-GB,en-GB,...,,,,,,,,,,


In [32]:
fact_modeling.to_csv("fact_modeling.csv", index=False)

In [33]:
del json_list
del read_file
del json_obj
del dict_temp
del fact_modeling


In [34]:
# create fact_reporting
json_list = []

with open(
    "package/activity/reporting/events-2022-00000-of-00001.json", "r", encoding="utf-8"
) as read_file:
    for json_obj in read_file:
        dict_temp = json.loads(json_obj)
        json_list.append(dict_temp)

In [35]:
fact_reporting = pd.DataFrame.from_dict(json_list, orient="columns")

In [36]:
fact_reporting.head()

Unnamed: 0,event_type,event_id,event_source,user_id,domain,freight_hostname,ip,day,chosen_locale,detected_locale,...,settlement_fees,settlement_tax,login_source,removal_type,num_channels_highlighted,full,instant_invite,notifications_in_app_enabled,is_premium,impression_group
0,send_message,AQECVcvXZPjotGaE6cJJXVly+QAmzjg=,api,342346882800025600,Reporting,discord-api-8668c6c64f-76szd,103.253.105.0,1786,en-GB,en-GB,...,,,,,,,,,,
1,send_message,AQEC1nfTIOe7jEZUm9Zvq0MU5wCYp+k=,api,342346882800025600,Reporting,api-prd-main-2rvc,14.192.211.0,1456,en-GB,en-GB,...,,,,,,,,,,
2,send_message,AQEC1Q07SE3AoHYjZp0YIl3s+wA6+Xs=,api,342346882800025600,Reporting,discord-api-6b5587cdd-kg684,103.253.105.0,1884,en-GB,en-GB,...,,,,,,,,,,
3,send_message,AQECWhxO4OlYdt9LtC5A2+f5iQGnIfg=,api,342346882800025600,Reporting,discord-api-768499f95b-xgk4k,103.253.105.0,1775,en-GB,en-GB,...,,,,,,,,,,
4,send_message,AQEC9gPhJ16/o3QM+VEXoEvNPgAUSnU=,api,342346882800025600,Reporting,api-prd-main-mvh0,14.192.211.0,1450,en-GB,en-GB,...,,,,,,,,,,


In [37]:
fact_reporting.to_csv("fact_reporting.csv", index=False)

In [38]:
del json_list
del read_file
del json_obj
del dict_temp
del fact_reporting


In [39]:
# create fact_tns
json_list = []

with open(
    "package/activity/tns/events-2022-00000-of-00001.json", "r", encoding="utf-8"
) as read_file:
    for json_obj in read_file:
        dict_temp = json.loads(json_obj)
        json_list.append(dict_temp)

In [40]:
fact_tns = pd.DataFrame.from_dict(json_list, orient="columns")

In [41]:
fact_tns.head()

Unnamed: 0,event_type,event_id,event_source,user_id,domain,client_uuid,freight_hostname,freight_id,ip,day,...,payment_source_type,is_default,card_brand,card_expiration_date,step_duration_ms,flow_duration_ms,desktop_ip,desktop_country_code,country_match,abort_with_captcha
0,start_listening,AQMFhMhBL8SZO0YUjhdbm/2E1wAAAxs=,client,342346882800025600,Tns,AABCE61CwAQVTLxrUXhpHoEBAAAbAwAA,analytics-ingest-prd-mn2k,153Y7cSI37s9FO4WAM11Qg==,103.253.105.84,1764,...,,,,,,,,,,
1,start_listening,AQMFug5sBO3k0MAbSRDNOQJ+7QAAA2k=,client,342346882800025600,Tns,AABCE61CwAQvEvyLgCKXk4EBAABpAwAA,analytics-ingest-prd-ndw8,ej4kMneeDJexkPoWpgLd3A==,103.253.105.84,1787,...,,,,,,,,,,
2,start_listening,AQMF+ElFOpiHmf84royFTrxiWgAAAz0=,client,342346882800025600,Tns,AABCE61CwARu5QGi2id+fIEBAAA9AwAA,analytics-ingest-prd-q0pz,HiRxikGPr_LjE-4W5_eiyQ==,103.253.105.84,1782,...,,,,,,,,,,
3,start_listening,AQMF+ElFOpiHmf84royFTrxiWgAAAc0=,client,342346882800025600,Tns,AABCE61CwARu5QGi2id+fIEBAADNAQAA,analytics-ingest-prd-d494,9yPZ_VQ0xDY9FO4WffKUiw==,103.253.105.84,1781,...,,,,,,,,,,
4,start_listening,AQMFcbsrH1bpq3DQf1dyuWGHzgAAA+g=,client,342346882800025600,Tns,AABCE61CwASZPnrzTA4TTYEBAADoAwAA,analytics-ingest-prd-q0pz,HiRxikGPr_LjE-4WQBUZTw==,103.253.105.84,1772,...,,,,,,,,,,


In [42]:
fact_tns.to_csv("fact_tns.csv", index=False)

In [43]:
del json_list
del read_file
del json_obj
del dict_temp
del fact_tns
