In [41]:
import json
import pandas as pd

In [42]:
# read servers index file
with open("package/servers/index.json", "r") as read_file:
    servers_index = json.load(read_file)

In [43]:
# build dim_servers table
dim_servers = pd.DataFrame.from_dict(servers_index, orient='index')

# rename columns
dim_servers.reset_index(inplace=True)
dim_servers.columns = ['server_id', 'server_name']

In [44]:
# check data types
dim_servers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   server_id    37 non-null     object
 1   server_name  37 non-null     object
dtypes: object(2)
memory usage: 720.0+ bytes


In [45]:
# convert types
dim_servers['server_id'] = pd.to_numeric(dim_servers['server_id'])
dim_servers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   server_id    37 non-null     int64 
 1   server_name  37 non-null     object
dtypes: int64(1), object(1)
memory usage: 720.0+ bytes


In [46]:
# write to csv
dim_servers.to_csv("dim_servers.csv", index=False)

In [47]:
# read messages index file
with open("package/messages/index.json", "r") as read_file:
    messages_index = json.load(read_file)

In [48]:
# build dim_channels table
dim_channels = pd.DataFrame.from_dict(messages_index, orient='index')

# rename columns
dim_channels.reset_index(inplace=True)
dim_channels.columns = ['channel_id', 'channel_name']

In [49]:
# add channel_type_key, server_id
channel_type_key_list = []
server_id_list = []

for (key, value) in messages_index.items():
    # create file path
    file_path_head = "package/messages/c" + key
    json_file_path = file_path_head + "/channel.json"
    
    # key is channel_id
    # value is channel_name
    
    # read json file
    with open(json_file_path, "r") as read_file:
        json_file = json.load(read_file)
        channel_type_key = json_file['type']
        try:
            server_id = json_file['guild']['id']
        except:
            server_id = None
            
    channel_type_key_list.append(channel_type_key)
    server_id_list.append(server_id)

dim_channels['channel_type_key'] = channel_type_key_list
dim_channels['server_id'] = server_id_list

In [50]:
# check data types
dim_channels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 764 entries, 0 to 763
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   channel_id        764 non-null    object
 1   channel_name      663 non-null    object
 2   channel_type_key  764 non-null    int64 
 3   server_id         343 non-null    object
dtypes: int64(1), object(3)
memory usage: 24.0+ KB


In [51]:
# convert types
dim_channels['channel_id'] = pd.to_numeric(dim_channels['channel_id'])
dim_channels['channel_type_key'] = pd.to_numeric(dim_channels['channel_type_key'])
dim_channels['server_id'] = pd.to_numeric(dim_channels['server_id'])
dim_channels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 764 entries, 0 to 763
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   channel_id        764 non-null    int64  
 1   channel_name      663 non-null    object 
 2   channel_type_key  764 non-null    int64  
 3   server_id         343 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 24.0+ KB


In [52]:
# write to csv
dim_channels.to_csv("dim_channels.csv", index=False)

In [53]:
# build fact_messages table
col_names = ['message_id', 'timestamp', 'contents', 'attachment_link']
fact_messages = pd.DataFrame(columns=col_names)

for (key, value) in messages_index.items():
    # create file path
    file_path_head = "package/messages/c" + key
    csv_file_path = file_path_head + "/messages.csv"
    
    # read csv file
    with open(csv_file_path, "rb") as read_file:
        df_temp = pd.read_csv(read_file)
   
    df_temp.columns = col_names
    
    # add channel_id col with key
    df_temp['channel_id'] = pd.Series(dtype='int64')
    df_temp['channel_id'].fillna(key, inplace=True)
    
    # append to main fact_messages file
    fact_messages = pd.concat([fact_messages, df_temp])

In [54]:
fact_messages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54074 entries, 0 to 0
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   message_id       54074 non-null  object
 1   timestamp        54074 non-null  object
 2   contents         53023 non-null  object
 3   attachment_link  1236 non-null   object
 4   channel_id       54074 non-null  object
dtypes: object(5)
memory usage: 2.5+ MB


In [55]:
# convert types
fact_messages['message_id'] = pd.to_numeric(fact_messages['message_id'])
fact_messages['channel_id'] = pd.to_numeric(fact_messages['channel_id'])
fact_messages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54074 entries, 0 to 0
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   message_id       54074 non-null  int64 
 1   timestamp        54074 non-null  object
 2   contents         53023 non-null  object
 3   attachment_link  1236 non-null   object
 4   channel_id       54074 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.5+ MB


In [56]:
fact_messages.to_csv("fact_messages.csv", index=False)

In [57]:
# create dim_channel_type
channel_type_key_list = [0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15]
channel_type_list = ['Server Channel', 'DM', 'Server Voice Text Channel', 'Group DM', 'Server Category', 'Server Annoucements', 'Server Announcment Thread', 'Server Channel Public Thread',
                         'Server Channel Private Thread', 'Stage Voice Channel', 'Server Channel Directory', 'Server Forum']
dim_channel_type = pd.DataFrame(data={'channel_type_key': channel_type_key_list,
                                 'channel_type': channel_type_list})

In [58]:
dim_channel_type

Unnamed: 0,channel_type_key,channel_type
0,0,Server Channel
1,1,DM
2,2,Server Voice Text Channel
3,3,Group DM
4,4,Server Category
5,5,Server Annoucements
6,10,Server Announcment Thread
7,11,Server Channel Public Thread
8,12,Server Channel Private Thread
9,13,Stage Voice Channel


In [59]:
dim_channel_type.to_csv("dim_channel_type.csv", index=False)