# Data Engineering Challenge

Import libraries

In [25]:
import requests
import json
import pandas as pd
import sqlite3

## Data Ingestion

Obtain the JSON data from the GitHub repository

In [50]:
# Raw-file URL of the mock event log. Written without surrounding whitespace so
# the literal is a valid URL exactly as it appears here.
url = 'https://raw.githubusercontent.com/mattcattaneo21/data-eng-challenge/refs/heads/main/mock_event_logs.json'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of trying to parse an error page as JSON.
response.raise_for_status()
data = response.json()

print(str(len(data)) + ' were uploaded')
print(data[0])

500 were uploaded
{'event_id': 'f3849d71-d832-4777-b097-e38dd89477a8', 'timestamp': '2025-05-31T14:36:20.971025', 'event_type': 'comment_added', 'user_id': 'user_1', 'document_id': 'doc_4', 'comment_text': 'Comment 46'}


Convert the JSON data into a pandas DataFrame

In [6]:
# One row per event; a key that an event does not carry shows up as a missing value.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,event_id,timestamp,event_type,user_id,document_id,comment_text,shared_with,edit_length
0,f3849d71-d832-4777-b097-e38dd89477a8,2025-05-31T14:36:20.971025,comment_added,user_1,doc_4,Comment 46,,
1,37b91816-ac0f-45f2-923a-235c99dd7ddd,2025-06-04T19:35:09.971025,document_shared,user_9,doc_1,,user_5,
2,1752d4b4-2031-4654-a91c-a286fec1209a,2025-06-07T05:19:00.971025,user_login,user_1,doc_1,,,
3,1ae41ba5-f302-4fe7-b7b5-a8819880da90,2025-05-31T10:22:06.971025,document_edit,user_4,doc_1,,,787.0
4,14c274fc-b403-4cff-baa1-daeacd839b2f,2025-05-19T21:31:38.971025,document_edit,user_10,doc_2,,,151.0


## Data Transformation

Convert the dataframe into 3 normalized tables:

In [11]:
#Users table
# A user can appear as the actor of an event (user_id) or as the recipient of a
# share (shared_with). Both columns are scanned so that a user who only ever
# receives shares still gets a row in the users table.
id_columns = [df['user_id']]
if 'shared_with' in df.columns:
    id_columns.append(df['shared_with'])

# Missing values (e.g. shared_with on non-share events) are not users, so drop them.
# unique() keeps first-appearance order, so actors come first, in their original order.
all_user_ids = pd.concat(id_columns, ignore_index=True).dropna()
users_df = pd.DataFrame(all_user_ids.unique(), columns=['user_id'])

print(f'{len(users_df)} unique users\n')
users_df.head()

10 unique users



Unnamed: 0,user_id
0,user_1
1,user_9
2,user_4
3,user_10
4,user_6


In [13]:
#Documents table
# One row per distinct document_id, kept in order of first appearance.
documents_df = (
    df[['document_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f'{len(documents_df)} unique documents\n')
documents_df.head()

5 unique documents



Unnamed: 0,document_id
0,doc_4
1,doc_1
2,doc_2
3,doc_5
4,doc_3


In [14]:
#Events table
# Parse the timestamp strings into datetimes, order the events chronologically
# and renumber the rows 0..n-1. assign() returns a new frame, so df is untouched.
events_df = (
    df.assign(timestamp=pd.to_datetime(df['timestamp']))
      .sort_values(by='timestamp')
      .reset_index(drop=True)
)

events_df.head()

Unnamed: 0,event_id,timestamp,event_type,user_id,document_id,comment_text,shared_with,edit_length
0,47f3543b-8da9-4ff9-ada4-3881d570f76c,2025-05-11 19:26:46.971025,document_shared,user_2,doc_4,,user_5,
1,572899d5-6e9d-4002-ab0d-b11886db96d3,2025-05-11 19:29:05.971025,document_edit,user_5,doc_5,,,665.0
2,c801b6d4-c948-42be-83d6-624fc44f4be6,2025-05-11 20:38:21.971025,document_shared,user_2,doc_1,,user_4,
3,209d307b-c877-46f2-9a93-45e243259f22,2025-05-11 21:31:28.971025,comment_added,user_2,doc_5,Comment 7,,
4,2a26fc12-9eb4-4878-bad1-b7a094214b45,2025-05-12 00:02:30.971025,user_login,user_7,doc_4,,,


### Data cleansing and enrichment

In [16]:
# Report the row count before and after removing repeated event_ids.
print(f'Total events: {len(events_df)}')

# When an event_id occurs more than once, only its first row is kept.
events_df = events_df.drop_duplicates(subset=['event_id'], keep='first')

print(f'Number of events without duplicates: {len(events_df)}')

Total events: 500
Number of events without duplicates: 500


In [51]:
#Events with empty values
# Rows missing any of these fields are treated as malformed.
required_fields = ['event_type', 'user_id', 'document_id']
has_missing_field = events_df[required_fields].isna().any(axis='columns')
malformed = events_df.loc[has_missing_field]
print(f'Events with issues: {len(malformed)}')

# Dropping malformed events
events_df = events_df.dropna(subset=required_fields)

Events with issues: 0


### Addition of derived columns

In [52]:
#Day of week
# Weekday name (e.g. "Sunday") derived from each event's timestamp.
weekday_names = events_df['timestamp'].dt.day_name()
events_df['day_of_week'] = weekday_names
events_df.loc[:, ['timestamp', 'day_of_week']].head()

Unnamed: 0,timestamp,day_of_week
0,2025-05-11 19:26:46.971025,Sunday
1,2025-05-11 19:29:05.971025,Sunday
2,2025-05-11 20:38:21.971025,Sunday
3,2025-05-11 21:31:28.971025,Sunday
4,2025-05-12 00:02:30.971025,Monday


In [53]:
#Session duration
# Only login events take part in this calculation, ordered by user and then by time.
is_login = events_df['event_type'] == 'user_login'
logins = events_df[is_login].copy().sort_values(['user_id', 'timestamp'])

# "Session duration" here is the gap between a login and the same user's next
# login. The last login of each user has no successor, so its value is NaT.
next_login = logins.groupby('user_id')['timestamp'].shift(-1)
logins['session_duration'] = next_login - logins['timestamp']
# NOTE(review): the column is created on `logins` only; this cell does not add it to events_df.
logins[['user_id', 'timestamp', 'session_duration']].head()

Unnamed: 0,user_id,timestamp,session_duration
69,user_1,2025-05-15 07:43:22.971025,0 days 00:47:15
70,user_1,2025-05-15 08:30:37.971025,5 days 11:26:35
165,user_1,2025-05-20 19:57:12.971025,1 days 07:14:12
186,user_1,2025-05-22 03:11:24.971025,4 days 02:34:54
247,user_1,2025-05-26 05:46:18.971025,0 days 15:25:46


In [54]:
#Document word count
def doc_word_count(edit_length):
    """Return ``edit_length`` truncated to an int, or 0 when it is missing.

    NOTE(review): the notebook uses the edit length as the document word
    count; confirm that this is the intended definition.
    """
    return 0 if pd.isna(edit_length) else int(edit_length)

# Run the helper over every edit_length value; rows without an edit length get 0.
word_counts = events_df['edit_length'].map(doc_word_count)
events_df['document_word_count'] = word_counts
print(events_df['document_word_count'])

0        0
1      665
2        0
3        0
4        0
      ... 
495      0
496    673
497      0
498      0
499      0
Name: document_word_count, Length: 500, dtype: int64


## Data Storage

In [57]:
conn = sqlite3.connect('events_data.db')
try:
    # if_exists='replace' drops and recreates each table, so re-running this
    # cell rebuilds the database from the current DataFrames.
    users_df.to_sql('users', conn, if_exists='replace', index=False)
    documents_df.to_sql('documents', conn, if_exists='replace', index=False)
    events_df.to_sql('events', conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Release the database handle even if a write fails; the verification cell
    # further down opens its own connection before querying.
    conn.close()

print("Data uploaded to SQLite")

Data uploaded to SQLite


In [27]:
#Uncomment the next two lines to download the database (Google Colab only)
#from google.colab import files
#files.download('events_data.db')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [58]:
#Checking that the table is showing values correctly
#Connect to the database (this connection is also the one the analytics queries below read from)
conn = sqlite3.connect('events_data.db')

#Query to show the unique users
query = "SELECT * FROM users"
df_users = pd.read_sql_query(sql=query, con=conn)
df_users.head(n=5)

Unnamed: 0,user_id
0,user_1
1,user_9
2,user_4
3,user_10
4,user_6


## Analytics

Daily Active Users over the last 30 days

In [61]:
# The 30-day window is anchored to the newest event in the table instead of the
# wall clock ('now'). The dataset is a fixed snapshot, so a clock-based window
# would shrink on every later run and eventually return no rows at all.
query = """
SELECT
    DATE(timestamp) as day,
    COUNT(DISTINCT user_id) as daily_active_users
FROM events
WHERE DATE(timestamp) >= DATE((SELECT MAX(timestamp) FROM events), '-30 day')
GROUP BY day
ORDER BY day DESC
"""
pd.read_sql_query(query, conn)

Unnamed: 0,day,daily_active_users
0,2025-06-10,7
1,2025-06-09,9
2,2025-06-08,9
3,2025-06-07,7
4,2025-06-06,7


Top 10 most edited documents

In [62]:
# Rank documents by their number of 'document_edit' events and keep the top 10.
# document_id is a secondary sort key so that documents with equal edit counts
# come back in a fixed order and the LIMIT cut-off is deterministic.
query = """
SELECT
    document_id,
    COUNT(*) as edit_count
FROM events
WHERE event_type = 'document_edit'
GROUP BY document_id
ORDER BY edit_count DESC, document_id ASC
LIMIT 10
"""
pd.read_sql_query(query, conn)

Unnamed: 0,document_id,edit_count
0,doc_4,27
1,doc_2,25
2,doc_5,24
3,doc_1,22
4,doc_3,19


Number of shared documents per user

In [60]:
# Count 'document_shared' events per sharing user, highest count first.
# NOTE(review): COUNT(*) counts share events, not distinct documents; confirm which is intended.
query = """
SELECT
    user_id,
    COUNT(*) as shared_documents
FROM events
WHERE event_type = 'document_shared'
GROUP BY user_id
ORDER BY shared_documents DESC
"""
pd.read_sql_query(sql=query, con=conn)


Unnamed: 0,user_id,shared_documents
0,user_7,21
1,user_9,18
2,user_6,18
3,user_10,15
4,user_8,12
5,user_5,10
6,user_3,10
7,user_4,9
8,user_2,8
9,user_1,8
