In [1]:
# Open the SQLite database file used by the rest of this notebook.
# sqlite3.connect creates the file if it does not exist yet.
import sqlite3
import pandas as pd

conn = sqlite3.connect(database="etl_pipeline.db")

In [2]:
# Load the cleaned master records and write them to the SQLite `master` table.
# if_exists="replace" drops any previous `master` table, so this cell can be re-run safely.
master_json_df = pd.read_json("cleaned/master.json")
master_json_df.to_sql(name="master", con=conn, if_exists="replace", index=False)

5

In [3]:
# Inspect the dtype pandas inferred for each column of the master frame
master_json_df.dtypes

user_id                 int64
name                   object
signup_date            object
txn_id                  int64
amount                float64
timestamp      datetime64[ns]
dtype: object

In [4]:
# SQL summary: total spend per user, highest spenders first.
# Adjacent string literals are concatenated by Python into a single query string.
spend_per_user_sql = (
    'SELECT user_id, name, SUM(amount) as total_spend '
    'FROM master '
    'GROUP BY user_id, name '
    'ORDER BY total_spend DESC'
)
spend_per_user = pd.read_sql(spend_per_user_sql, con=conn)
spend_per_user.head()

Unnamed: 0,user_id,name,total_spend
0,4,Mallory,200.0
1,1,Alice,195.7
2,3,Eve,150.0
3,2,Bob,85.0


In [5]:
# Weekly revenue: sum of `amount` grouped by calendar year and week number.
# NOTE: SQLite's '%W' is the week of the year (00-53) where week 01 starts on the
# first Monday; days before that Monday fall into week 00, so a calendar week that
# straddles New Year is reported as two separate (year, week) rows.
weekly_revenue_sql = "SELECT strftime('%Y', timestamp) as year,\
                                     strftime('%W', timestamp) as week,\
                                     SUM(amount) as revenue\
                                     FROM master\
                              GROUP BY year, week\
                              ORDER BY year, week"
weekly_revenue = pd.read_sql(weekly_revenue_sql, con=conn)
weekly_revenue.head()

Unnamed: 0,year,week,revenue
0,2025,21,120.5
1,2025,22,310.2
2,2025,23,200.0


In [6]:
# Load the cleaned users file and write it to the SQLite `users` table.
# if_exists="replace" drops any previous `users` table, so this cell can be re-run safely.
users = pd.read_csv("cleaned/users_clean.csv")
users.to_sql(name="users", con=conn, if_exists="replace", index=False)

5

In [7]:
# Users with no transactions
# Anti-join: LEFT JOIN keeps every row of `users`; rows without a match in `master`
# come back with all `m.*` columns NULL. The filter tests the join key (m.user_id),
# which can never be NULL on a matched row, instead of m.amount: a user whose
# transaction merely has a missing amount must not be reported as "no transactions".
# COALESCE keeps the `amount` column (always 0 for these rows) so the exported
# result retains the same two columns: name, amount.
users_no_txn_sql = """
    SELECT u.name, COALESCE(m.amount, 0) AS amount
    FROM users AS u
    LEFT JOIN master AS m ON u.user_id = m.user_id
    WHERE m.user_id IS NULL
"""
sql_result = pd.read_sql(users_no_txn_sql, conn)
sql_result.head()

In [8]:
import os

# Ensure the export folder exists (no error if it already does), then write the
# query result as CSV without the DataFrame index column.
output_dir = "output_summary/"
os.makedirs(output_dir, exist_ok=True)
sql_result.to_csv("output_summary/users_no_txn.csv", index=False)