In [1]:
import json
import pandas as pd


In [2]:
# read servers index file
with open("package/servers/index.json", "r") as read_file:
    servers_index = json.load(read_file)


In [3]:
# build dim_servers table
dim_server = pd.DataFrame.from_dict(servers_index, orient="index")

# rename columns
dim_server.reset_index(inplace=True)
dim_server.columns = ["server_id", "server_name"]

In [4]:
# check data types
dim_server.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   server_id    68 non-null     object
 1   server_name  68 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB


In [5]:
# convert types
dim_server["server_id"] = dim_server["server_id"].astype("Int64")
dim_server.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   server_id    68 non-null     Int64 
 1   server_name  68 non-null     object
dtypes: Int64(1), object(1)
memory usage: 1.3+ KB


In [6]:
# write to csv
dim_server.to_csv(
    "dim_server.csv",
    index=False,
)

In [7]:
# read messages index file
with open("package/messages/index.json", "r") as read_file:
    messages_index = json.load(read_file)


In [8]:
# build dim_channel table
dim_channel = pd.DataFrame.from_dict(messages_index, orient="index")

# rename columns
dim_channel.reset_index(inplace=True)
dim_channel.columns = ["channel_id", "channel_name"]

In [9]:
# add channel_type_key, server_id
channel_type_key_list = []
server_id_list = []

for (key, value) in messages_index.items():
    # create file path
    file_path_head = "package/messages/c" + key
    json_file_path = file_path_head + "/channel.json"

    # key is channel_id
    # value is channel_name

    # read json file
    with open(json_file_path, "r") as read_file:
        json_file = json.load(read_file)
        channel_type_key = json_file["type"]
        try:
            server_id = json_file["guild"]["id"]
        except:
            server_id = None

    channel_type_key_list.append(channel_type_key)
    server_id_list.append(server_id)

dim_channel["channel_type_key"] = channel_type_key_list
dim_channel["server_id"] = server_id_list

In [10]:
# check data types
dim_channel.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   channel_id        941 non-null    object
 1   channel_name      699 non-null    object
 2   channel_type_key  941 non-null    int64 
 3   server_id         317 non-null    object
dtypes: int64(1), object(3)
memory usage: 29.5+ KB


In [11]:
# convert types
dim_channel["channel_id"] = dim_channel["channel_id"].astype("Int64")
dim_channel["channel_type_key"] = dim_channel["channel_type_key"].astype("Int64")
dim_channel["server_id"] = dim_channel["server_id"].astype("Int64")
dim_channel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   channel_id        941 non-null    Int64 
 1   channel_name      699 non-null    object
 2   channel_type_key  941 non-null    Int64 
 3   server_id         317 non-null    Int64 
dtypes: Int64(3), object(1)
memory usage: 32.3+ KB


In [12]:
# write to csv
dim_channel.to_csv("dim_channel.csv", index=False)


In [13]:
# build fact_messages table
col_names = ["message_id", "timestamp", "contents", "attachment_link"]
fact_messages = pd.DataFrame(columns=col_names)

for (key, value) in messages_index.items():
    # create file path
    file_path_head = "package/messages/c" + key
    csv_file_path = file_path_head + "/messages.csv"

    # read csv file
    with open(csv_file_path, "rb") as read_file:
        df_temp = pd.read_csv(read_file)

    df_temp.columns = col_names

    # add channel_id col with key
    df_temp["channel_id"] = pd.Series(dtype="int64")
    df_temp["channel_id"].fillna(key, inplace=True)

    # append to main fact_messages file
    fact_messages = pd.concat([fact_messages, df_temp])

In [14]:
fact_messages.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 126092 entries, 0 to 2
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   message_id       126092 non-null  object
 1   timestamp        126092 non-null  object
 2   contents         123823 non-null  object
 3   attachment_link  2655 non-null    object
 4   channel_id       126092 non-null  object
dtypes: object(5)
memory usage: 5.8+ MB


In [15]:
# convert types
fact_messages["message_id"] = fact_messages["message_id"].astype("Int64")
fact_messages["channel_id"] = fact_messages["channel_id"].astype("Int64")
fact_messages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126092 entries, 0 to 2
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   message_id       126092 non-null  Int64 
 1   timestamp        126092 non-null  object
 2   contents         123823 non-null  object
 3   attachment_link  2655 non-null    object
 4   channel_id       126092 non-null  Int64 
dtypes: Int64(2), object(3)
memory usage: 6.0+ MB


In [16]:
fact_messages.to_csv("fact_messages.csv", index=False)


In [17]:
# create dim_channel_type
channel_type_key_list = [0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15]
channel_type_list = [
    "Server Channel",
    "DM",
    "Server Voice Text Channel",
    "Group DM",
    "Server Category",
    "Server Annoucements",
    "Server Announcment Thread",
    "Server Channel Public Thread",
    "Server Channel Private Thread",
    "Stage Voice Channel",
    "Server Channel Directory",
    "Server Forum",
]
dim_channel_type = pd.DataFrame(
    data={"channel_type_key": channel_type_key_list, "channel_type": channel_type_list}
)

In [18]:
dim_channel_type

Unnamed: 0,channel_type_key,channel_type
0,0,Server Channel
1,1,DM
2,2,Server Voice Text Channel
3,3,Group DM
4,4,Server Category
5,5,Server Annoucements
6,10,Server Announcment Thread
7,11,Server Channel Public Thread
8,12,Server Channel Private Thread
9,13,Stage Voice Channel


In [19]:
dim_channel_type.to_csv("dim_channel_type.csv", index=False)

In [20]:
# create fact_activity

json_list = []

with open(
    "package/activity/reporting/events-2022-00000-of-00001.json", "r", encoding="utf-8"
) as read_file:
    for json_obj in read_file:
        dict_temp = json.loads(json_obj)
        json_list.append(dict_temp)

In [21]:
fact_activity = pd.DataFrame.from_dict(json_list, orient="columns")

In [22]:
fact_activity

Unnamed: 0,event_type,event_id,event_source,user_id,domain,freight_hostname,ip,day,chosen_locale,detected_locale,...,settlement_fees,settlement_tax,login_source,removal_type,num_channels_highlighted,full,instant_invite,notifications_in_app_enabled,is_premium,impression_group
0,send_message,AQECVcvXZPjotGaE6cJJXVly+QAmzjg=,api,342346882800025600,Reporting,discord-api-8668c6c64f-76szd,103.253.105.0,1786,en-GB,en-GB,...,,,,,,,,,,
1,send_message,AQEC1nfTIOe7jEZUm9Zvq0MU5wCYp+k=,api,342346882800025600,Reporting,api-prd-main-2rvc,14.192.211.0,1456,en-GB,en-GB,...,,,,,,,,,,
2,send_message,AQEC1Q07SE3AoHYjZp0YIl3s+wA6+Xs=,api,342346882800025600,Reporting,discord-api-6b5587cdd-kg684,103.253.105.0,1884,en-GB,en-GB,...,,,,,,,,,,
3,send_message,AQECWhxO4OlYdt9LtC5A2+f5iQGnIfg=,api,342346882800025600,Reporting,discord-api-768499f95b-xgk4k,103.253.105.0,1775,en-GB,en-GB,...,,,,,,,,,,
4,send_message,AQEC9gPhJ16/o3QM+VEXoEvNPgAUSnU=,api,342346882800025600,Reporting,api-prd-main-mvh0,14.192.211.0,1450,en-GB,en-GB,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615774,send_message,AQEClhCi2GNUhhGFg1ZJFbKPlgA8qFY=,api,342346882800025600,Reporting,discord-api-7f77759854-w9swx,103.253.105.0,1830,en-GB,en-GB,...,,,,,,,,,,
615775,send_message,AQECIyi89NjuVwo9WRCl/RjhPgBWuxg=,api,342346882800025600,Reporting,discord-api-69db956dd7-vhrcq,103.253.105.0,1890,en-GB,en-GB,...,,,,,,,,,,
615776,send_message,AQEC00nEQOWVgyz7OdCsZD7PqAAk5TQ=,api,342346882800025600,Reporting,discord-api-b69489cd9-zcqkl,103.253.105.0,1760,en-GB,en-GB,...,,,,,,,,,,
615777,send_message,AQECsSr/1/8IpqWtRMwUp+WZMwA/20Y=,api,342346882800025600,Reporting,discord-api-7b96fbdfb8-kj77p,113.211.120.0,1902,en-GB,en-GB,...,,,,,,,,,,


In [23]:
fact_activity.to_csv("fact_activity.csv", index=False)