## Imports


In [1]:
import os
import json
import pandas as pd
import psycopg2
from psycopg2 import OperationalError
from psycopg2 import ProgrammingError
import gc

## Data Connection


In [2]:
# Load the user account JSON file into `user`.
# NOTE(review): in the visible notebook `user` is only referenced again by the
# cleanup cell, which deletes both `user` and `read_file` by name — confirm
# whether this load is still needed.
with open("package/account/user.json", "r") as read_file:
    user = json.load(read_file)

In [3]:
# open a PostgreSQL connection through psycopg2
def create_connection(db_name, db_user, db_password, db_host, db_port):
    """Try to connect to a PostgreSQL database and print the outcome.

    Args:
        db_name: Name of the database to connect to.
        db_user: User name used for the connection.
        db_password: Password for ``db_user``.
        db_host: Host of the database server.
        db_port: Port of the database server.

    Returns:
        The psycopg2 connection on success, or None when psycopg2 raises
        OperationalError (the error is printed rather than re-raised).
    """
    connection = None
    connect_kwargs = {
        "database": db_name,
        "user": db_user,
        "password": db_password,
        "host": db_host,
        "port": db_port,
    }

    try:
        connection = psycopg2.connect(**connect_kwargs)
    except OperationalError as e:
        # report the failure and fall through with connection still None
        print(f"The error '{e}' occurred")
        print("Error Type:", type(e))
    else:
        print("Connection to PostgreSQL DB successful")

    return connection

In [4]:
def read_secrets(filename: str = "secrets.json") -> dict:
    """Load connection secrets from a JSON file.

    Args:
        filename: Path of the JSON file to read. Defaults to "secrets.json",
            resolved against the current working directory.

    Returns:
        The parsed JSON content, or an empty dict when the file does not exist.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(filename, mode="r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}

In [5]:
# load the database settings; read_secrets() returns an empty dict when the
# secrets file is missing, in which case the lookups in the next cell raise KeyError
secrets = read_secrets()


In [6]:
# open the database connection with the values read from the secrets file
con = create_connection(
    db_name=secrets["db_name"],
    db_user=secrets["db_user"],
    db_password=secrets["db_password"],
    db_host=secrets["db_host"],
    db_port=secrets["db_port"],
)


Connection to PostgreSQL DB successful


In [7]:
# notebook-level cursor used by the select() helper below
# NOTE: create_connection() returns None on OperationalError, in which case
# this line raises AttributeError
cur = con.cursor()


In [8]:
# run a query on the notebook-level cursor and return the rows as a dataframe
def select(query, params=None):
    """Execute a query and return its result set as a DataFrame.

    Uses the notebook-level cursor ``cur`` and connection ``con``.

    Args:
        query: SQL text to execute. Use placeholders together with ``params``
            instead of formatting values into the string.
        params: Optional sequence or mapping of values bound to the
            placeholders in ``query``. Defaults to None, which executes
            ``query`` exactly as before this parameter existed.

    Returns:
        A DataFrame whose columns are named after ``cur.description``, or None
        when ProgrammingError is raised (the error is printed and the
        transaction on ``con`` is rolled back).
    """
    data_df = None
    try:
        cur.execute(query, params)
        col_names = [desc[0] for desc in cur.description]
        data_df = pd.DataFrame(data=cur.fetchall(), columns=col_names)
    except ProgrammingError as e:
        # roll back so the connection is usable for the next query
        con.rollback()
        print(f"The error '{e}' occurred")
        print("Error Type:", type(e))

    return data_df

## Import Data


In [9]:
# load the whole dim_channel table into a dataframe
# (select() returns None if the query raises ProgrammingError)
select_query = """
SELECT *
FROM discord_data.dim_channel
"""

df_dim_channel = select(select_query)


In [10]:
# load the whole dim_channel_type table into a dataframe
# (select() returns None if the query raises ProgrammingError)
select_query = """
SELECT *
FROM discord_data.dim_channel_type
"""

df_dim_channel_type = select(select_query)


In [11]:
# load the whole dim_server table into a dataframe
# (select() returns None if the query raises ProgrammingError)
select_query = """
SELECT *
FROM discord_data.dim_server
"""

df_dim_server = select(select_query)


In [12]:
# load the whole fact_messages table into a dataframe
# (select() returns None if the query raises ProgrammingError)
# NOTE(review): SELECT * without a filter pulls every row and column into memory
select_query = """
SELECT *
FROM discord_data.fact_messages
"""

df_fact_messages = select(select_query)


In [13]:
# close the cursor and connection explicitly; `del` only removes the names and
# would leave closing the database session to garbage collection
cur.close()
con.close()

# drop names that are no longer needed (including the loaded credentials)
del con
del cur
del read_file
del secrets
del select_query
del user
gc.collect()

0

In [14]:
# shrink a dataframe's memory footprint by tightening its column dtypes
def downcast(input_df):
    """Downcast numeric columns and convert object columns to category, in place.

    float64 columns go through ``pd.to_numeric(..., downcast='float')``, int64
    columns through ``pd.to_numeric(..., downcast='integer')`` and object
    columns are cast to ``category``. Columns of any other dtype are left
    untouched. A memory report is printed before and after the conversion.

    Args:
        input_df: DataFrame to shrink. Its columns are reassigned in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    print('Memory Usage before downcasting:')
    # DataFrame.info() prints the report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    input_df.info(memory_usage='deep')

    for column in input_df:
        column_dtype = input_df[column].dtype
        if column_dtype == 'float64':
            input_df[column] = pd.to_numeric(
                input_df[column], downcast='float')
        elif column_dtype == 'int64':
            input_df[column] = pd.to_numeric(
                input_df[column], downcast='integer')
        elif column_dtype == 'object':
            input_df[column] = input_df[column].astype(
                'category')

    print('Memory Usage after downcasting:')
    input_df.info(memory_usage='deep')

    return input_df

In [15]:
# read from csv (due to it being semi structured data)
# may transition to a NoSQL database in the future
# store each list as its own csv file
# length = 6
# df_fact_analytics = pd.DataFrame()
# for i in range(1, length+1, 1):
#     file_name = 'fact_analytics' + str(i) + '.csv'
#     df_temp = pd.read_csv(file_name)
#     df_fact_analytics = pd.concat([df_fact_analytics, df_temp], copy=False)
#     print(df_fact_analytics.shape)

# del df_temp

# not running due to memory limitations

In [16]:
# df_fact_analytics.head()

In [17]:
# Read the modeling fact data from CSV instead of the database because it is
# semi-structured; it may move to a NoSQL database in the future.
# NOTE(review): the cell output echoes this line, which looks like a pandas
# warning raised while parsing (possibly mixed-type columns) — confirm and
# consider passing explicit dtypes.
df_fact_modeling = pd.read_csv("fact_modeling.csv")

  df_fact_modeling = pd.read_csv("fact_modeling.csv")


In [18]:
# shrink the frame before analysis (the report below shows 14.4 GB -> 2.4 GB)
df_fact_modeling = downcast(df_fact_modeling)

Memory Usage before downcasting:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884392 entries, 0 to 884391
Columns: 840 entries, event_type to is_premium_member
dtypes: float64(442), int64(2), object(396)
memory usage: 14.4 GB
None
Memory Usage after downcasting:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884392 entries, 0 to 884391
Columns: 840 entries, event_type to is_premium_member
dtypes: category(396), float32(385), float64(57), int16(1), int64(1)
memory usage: 2.4 GB
None


In [19]:
# preview the first rows of the modeling data
# NOTE(review): the rendered output includes user_id and ip values — clear or
# redact it before sharing the notebook
df_fact_modeling.head()

Unnamed: 0,event_type,event_id,event_source,user_id,domain,freight_hostname,ip,day,chosen_locale,detected_locale,...,invite_channel_type,is_suggested,row_num,num_total,is_filtered,num_affinity_connections,send_type,role_subscription_group_listing_id,role_subscription_listing_ids,is_premium_member
0,open_popout,AQMETS3lsxQMTcnKBXSSIJuTXwAAARA=,client,342346882800025600,Modeling,analytics-ingest-prd-dnq6,14.192.209.0,1433,en-GB,en-US,...,,,,,,,,,,
1,open_popout,AQMETS3lsxQMTcnKBXSSIJuTXwAAAOg=,client,342346882800025600,Modeling,analytics-ingest-prd-50xt,14.192.209.0,1433,en-GB,en-US,...,,,,,,,,,,
2,open_popout,AQMEeInHNDc6ZoHEXkpHLouIvgAAACw=,client,342346882800025600,Modeling,analytics-ingest-prd-zjk4,103.253.105.0,1581,en-GB,en-US,...,,,,,,,,,,
3,open_popout,AQEENTa24ETKie63CPZELpoetk9NzOA=,client,342346882800025600,Modeling,analytics-ingest-prd-l5gj,175.143.61.0,1150,,en-US,...,,,,,,,,,,
4,open_popout,AQEEDMhX5SkGx7oSxOeqENmYuGTYcZY=,client,342346882800025600,Modeling,analytics-ingest-prd-gqn5,14.192.212.0,1258,,en-US,...,,,,,,,,,,


In [20]:
# Read the reporting fact data from CSV instead of the database because it is
# semi-structured; it may move to a NoSQL database in the future.
# NOTE(review): the cell output echoes this line, which looks like a pandas
# warning raised while parsing (possibly mixed-type columns) — confirm and
# consider passing explicit dtypes.
df_fact_reporting = pd.read_csv("fact_reporting.csv")

  df_fact_reporting = pd.read_csv("fact_reporting.csv")


In [21]:
# shrink the frame before analysis (the report below shows 8.5 GB -> 1.3 GB)
df_fact_reporting = downcast(df_fact_reporting)

Memory Usage before downcasting:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631680 entries, 0 to 631679
Columns: 640 entries, event_type to num_applications_gog
dtypes: float64(303), int64(2), object(335)
memory usage: 8.5 GB
None
Memory Usage after downcasting:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631680 entries, 0 to 631679
Columns: 640 entries, event_type to num_applications_gog
dtypes: category(335), float32(249), float64(54), int16(1), int64(1)
memory usage: 1.3 GB
None


In [22]:
# preview the first rows of the reporting data
# NOTE(review): the rendered output includes user_id and ip values — clear or
# redact it before sharing the notebook
df_fact_reporting.head()

Unnamed: 0,event_type,event_id,event_source,user_id,domain,freight_hostname,ip,day,chosen_locale,detected_locale,...,deeplink_source,link_type,num_applications_total,num_applications_battlenet,num_applications_discord,num_applications_steam,num_applications_twitch,num_applications_uplay,num_applications_origin,num_applications_gog
0,send_message,AQECpEkS/Zt1ZnmYLv6+1SQz+AAsKbk=,api,342346882800025600,Reporting,discord-api-7fc4cb59bd-fq45s,103.253.105.0,1855,en-GB,en-GB,...,,,,,,,,,,
1,send_message,AQECsEQ/SzYXFt3mJNphb896bABDpOU=,api,342346882800025600,Reporting,discord-api-8668c6c64f-28hwt,103.253.105.0,1786,en-GB,en-GB,...,,,,,,,,,,
2,send_message,AQECnoITHkGwoJeGW6v6cMtBcAAcd5I=,api,342346882800025600,Reporting,discord-api-8d47878f4-bz68m,103.253.105.0,1882,en-GB,en-GB,...,,,,,,,,,,
3,send_message,AQECoYnQsaKP11phtENl4D4TtwA4lk0=,api,342346882800025600,Reporting,discord-api-5bbbc7b7fd-j958j,103.253.105.0,1807,en-GB,en-GB,...,,,,,,,,,,
4,send_message,AQECsPMdnBCWr4xCKLo5tl1NnQAwif8=,api,342346882800025600,Reporting,discord-api-7fc4cb59bd-8wwk5,103.253.105.0,1855,en-GB,en-GB,...,,,,,,,,,,


In [23]:
# Read the tns fact data from CSV instead of the database because it is
# semi-structured; it may move to a NoSQL database in the future.
# NOTE(review): the cell output echoes this line, which looks like a pandas
# warning raised while parsing (possibly mixed-type columns) — confirm and
# consider passing explicit dtypes.
df_fact_tns = pd.read_csv("fact_tns.csv")

  df_fact_tns = pd.read_csv("fact_tns.csv")


In [24]:
# shrink the frame before analysis (the report below shows 5.8 GB -> 1.0 GB)
df_fact_tns = downcast(df_fact_tns)

Memory Usage before downcasting:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299924 entries, 0 to 299923
Columns: 1030 entries, event_type to is_greeting
dtypes: float64(571), int64(2), object(457)
memory usage: 5.8 GB
None
Memory Usage after downcasting:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299924 entries, 0 to 299923
Columns: 1030 entries, event_type to is_greeting
dtypes: category(457), float32(511), float64(60), int16(1), int64(1)
memory usage: 1.0 GB
None


In [25]:
# preview the first rows of the tns data
# NOTE(review): the rendered output includes user_id and ip values — clear or
# redact it before sharing the notebook
df_fact_tns.head()

Unnamed: 0,event_type,event_id,event_source,user_id,domain,freight_hostname,freight_id,ip,day,chosen_locale,...,account_id,account_name,connected,visibility,friend_sync,partner,link_method,two_way_link,metadata_visibility,is_greeting
0,user_phone_updated,AQEFOgYw9XUViwEa4YXTjqFZwAEaNDQ=,api,342346882800025600,Tns,discord-api-9869d5fcc-lkwnc,UZh5EmxHMAJjj_kWNDQaAQ==,103.253.105.84,1781,en-GB,...,,,,,,,,,,
1,user_phone_updated,AQEFh45YBraYxFbvwxkzv1TgLgAHCww=,api,342346882800025600,Tns,discord-api-5695f55cb5-xqq4x,Gze9hEhXDjDyhwMXDAsHAA==,103.253.105.84,1812,en-GB,...,,,,,,,,,,
2,guild_viewed,AQMFeYhBPLDjh+cE4sxgHk5ipwAABAM=,client,342346882800025600,Tns,analytics-ingest-prd-q0pz,HiRxikGPr_LjE-4W8dk0ig==,103.253.105.84,1757,en-GB,...,,,,,,,,,,
3,guild_viewed,AQMFeYhBPLDjh+cE4sxgHk5ipwAABEs=,client,342346882800025600,Tns,analytics-ingest-prd-424c,V-dmfzra_D09FO4Wobpeiw==,103.253.105.84,1757,en-GB,...,,,,,,,,,,
4,guild_viewed,AQMFeYhBPLDjh+cE4sxgHk5ipwAAA1A=,client,342346882800025600,Tns,analytics-ingest-prd-h4p0,26SaCyp1G_3jE-4W3ggrWA==,103.253.105.84,1756,en-GB,...,,,,,,,,,,


In [26]:
# persist the loaded dataframes with IPython's %store magic so they can be
# restored later with `%store -r`
%store df_dim_channel
%store df_dim_channel_type
%store df_dim_server
%store df_fact_messages
# df_fact_analytics is skipped: the cell that loads it is commented out above
# %store df_fact_analytics
%store df_fact_modeling
%store df_fact_reporting
%store df_fact_tns


Stored 'df_dim_channel' (DataFrame)
Stored 'df_dim_channel_type' (DataFrame)
Stored 'df_dim_server' (DataFrame)
Stored 'df_fact_messages' (DataFrame)
Stored 'df_fact_modeling' (DataFrame)
Stored 'df_fact_reporting' (DataFrame)
Stored 'df_fact_tns' (DataFrame)


## Analysis


In [27]:
# restore every variable previously saved with %store into this kernel
%store -r

## Ideas

- Check readability of text (textatistic)
-