In [1]:

!pip install mysql-connector-python cassandra-driver SQLAlchemy pandas


import os
import uuid
from urllib.parse import quote_plus

import pandas as pd
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
from sqlalchemy import create_engine


# MySQL connection settings. Every value can be overridden with an environment
# variable so that real credentials do not have to be stored in the notebook;
# the defaults reproduce the original local Docker configuration.
user = os.environ.get("MYSQL_USER", "analyst")
password = os.environ.get("MYSQL_PASSWORD", "password")
host = os.environ.get("MYSQL_HOST", "host.docker.internal")
port = int(os.environ.get("MYSQL_PORT", "3306"))
database = os.environ.get("MYSQL_DATABASE", "analytics_db")

# User and password are URL-escaped: characters such as '@', ':' or '/' would
# otherwise be parsed as part of the URL structure.
uri = (
    f"mysql+mysqlconnector://{quote_plus(user)}:{quote_plus(password)}"
    f"@{host}:{port}/{database}"
)
engine = create_engine(uri)

# Cassandra connection: contact point 'cassandra', session bound to the 'ad_data' keyspace
cluster = Cluster(['cassandra'])
session = cluster.connect('ad_data')




Ad campaign performance over time (CTR, total impressions, total clicks per day)

In [2]:

# Aggregate impressions, clicks and CTR per campaign name in MySQL.
# The LEFT JOIN keeps campaigns that have no events: their COUNT is 0, and
# COALESCE/NULLIF turn the NULL click sum and the division by zero into 0.
query = """
SELECT
    cam.name AS campaign_name,
    COUNT(fe.event_id) AS total_impressions,
    COALESCE(SUM(fe.was_clicked), 0) AS total_clicks,
    COALESCE(SUM(fe.was_clicked) / NULLIF(COUNT(fe.event_id), 0), 0) AS ctr
FROM analytics_db.dim_campaign AS cam
LEFT JOIN analytics_db.fact_event AS fe
ON cam.campaign_id = fe.campaign_id
GROUP BY cam.name
"""

df = pd.read_sql(query, engine)

# Rows without a campaign name are skipped
# (presumably campaign_name is a key column of the target table - confirm against the schema).
df = df.dropna(subset=['campaign_name'])

insert_stmt = session.prepare("""
    INSERT INTO campaign_performance (campaign_name, total_impressions, total_clicks, ctr)
    VALUES (?, ?, ?, ?)
""")

# Insert row by row so that a single bad row is reported and skipped instead of
# aborting the load. Failures are counted so the final message reflects the outcome.
failed_rows = 0
for _, row in df.iterrows():
    try:
        # Explicit int/float casts convert the pandas/DB scalar types to plain Python values
        session.execute(insert_stmt, (
            row['campaign_name'],
            int(row['total_impressions']),
            int(row['total_clicks']),
            float(row['ctr'])
        ))
    except Exception as e:
        failed_rows += 1
        print(f"Помилка при вставці рядка {row.to_dict()}: {e}")

if failed_rows == 0:
    print("Дані успішно імпортовано у campaign_performance")
else:
    print(f"Імпорт у campaign_performance завершено з помилками: не вставлено {failed_rows} із {len(df)} рядків")


Дані успішно імпортовано у campaign_performance


Top advertisers by total ad spend

In [3]:

# Monthly ad spend per advertiser. Only clicked events contribute: the joined
# subquery keeps fact_event rows with was_clicked = 1. Campaigns without clicked
# events produce a NULL month through the LEFT JOIN and are removed by dropna below.
query = """
    SELECT
         DATE_FORMAT(`timestamp`, '%Y-%m')  AS mnth,
         ad.name AS advertiser_name,
         SUM(ad_cost) AS total_spend
    FROM analytics_db.dim_campaign AS cam
    LEFT JOIN (SELECT * FROM analytics_db.fact_event WHERE was_clicked = 1) AS fe
        ON cam.campaign_id = fe.campaign_id
    LEFT JOIN analytics_db.dim_advertiser AS ad
        ON cam.advertiser_id = ad.advertiser_id
    GROUP BY
         mnth,
         ad.name
"""
df = pd.read_sql(query, engine)

# Keep only complete rows: all three columns are written to Cassandra
df = df.dropna(subset=['mnth', 'advertiser_name', 'total_spend'])

insert_stmt = session.prepare("""
    INSERT INTO advertisers (mnth, advertiser_name, total_spend)
    VALUES (?, ?, ?)
""")

# Insert row by row so that a single bad row is reported and skipped instead of
# aborting the load. Failures are counted so the final message reflects the outcome.
failed_rows = 0
for _, row in df.iterrows():
    try:
        session.execute(insert_stmt, (
            row['mnth'],
            row['advertiser_name'],
            float(row['total_spend'])  # plain Python float instead of the DB/pandas scalar
        ))
    except Exception as e:
        failed_rows += 1
        print(f" Помилка в рядку {row.to_dict()}: {e}")

if failed_rows == 0:
    print("Дані успішно імпортовано у таблицю advertisers.")
else:
    print(f"Імпорт у таблицю advertisers завершено з помилками: не вставлено {failed_rows} із {len(df)} рядків")


Дані успішно імпортовано у таблицю advertisers.


User engagement history (ads seen, clicked, and timestamps)

In [4]:
# One row per event: calendar date of the event, the user, the ad slot and the click flag.
query = """
SELECT
    DATE(`timestamp`) AS date_event,
    user_id,
    ad_slot_id,
    was_clicked
FROM analytics_db.fact_event
"""

df = pd.read_sql(query, engine)

# Rows missing the user, the date or the ad slot are skipped; the id conversion
# below needs non-null values.
df = df.dropna(subset=['user_id', 'date_event', 'ad_slot_id'])

# Derive deterministic UUIDs from the numeric ids (uuid5 over a fixed namespace and
# the strings "user_<id>" / "ad_<id>"), so a re-run produces the same keys.
df['user_id'] = df['user_id'].apply(lambda x: uuid.uuid5(uuid.NAMESPACE_DNS, f"user_{int(x)}"))
df['ad_slot_id'] = df['ad_slot_id'].apply(lambda x: uuid.uuid5(uuid.NAMESPACE_DNS, f"ad_{int(x)}"))

# A plain astype(bool) turns NaN into True, which would record a missing flag as a
# click. Missing values are therefore stored as "not clicked".
clicked = df['was_clicked']
df['was_clicked'] = clicked.notna() & clicked.astype(bool)

# The ad slot UUID is written to the ad_id column of user_history
insert_stmt = session.prepare("""
    INSERT INTO user_history (user_id, date_event, ad_id, was_clicked)
    VALUES (?, ?, ?, ?)
""")

# Insert row by row so that a single bad row is reported and skipped instead of
# aborting the load. Failures are counted so the final message reflects the outcome.
failed_rows = 0
for _, row in df.iterrows():
    try:
        session.execute(insert_stmt, (
            row['user_id'],
            row['date_event'],
            row['ad_slot_id'],
            row['was_clicked']
        ))
    except Exception as e:
        failed_rows += 1
        print(f"Помилка у рядку {row.to_dict()}: {e}")

if failed_rows == 0:
    print("Дані успішно імпортовано у user_history")
else:
    print(f"Імпорт у user_history завершено з помилками: не вставлено {failed_rows} із {len(df)} рядків")

Дані успішно імпортовано у user_history


Most active users 

In [5]:

# Number of clicked events per user and month (only rows with was_clicked = 1 are counted).
query = """
    SELECT
        DATE_FORMAT(`timestamp`, '%Y-%m')  AS mnth,
        user_id,
        COUNT(event_id) AS total_clicks
    FROM analytics_db.fact_event
    WHERE was_clicked = 1
    GROUP BY
        mnth,
        user_id
"""

df = pd.read_sql(query, engine)

# Rows without a user or a month are skipped; the id conversion below needs a non-null user_id
df = df.dropna(subset=['user_id', 'mnth'])

# Derive a deterministic UUID from the numeric id (uuid5 over a fixed namespace and
# the string "user_<id>"), so a re-run produces the same keys.
df['user_id'] = df['user_id'].apply(lambda x: uuid.uuid5(uuid.NAMESPACE_DNS, f"user_{int(x)}"))

insert_stmt = session.prepare("""
    INSERT INTO most_active_users (mnth, user_id, total_clicks)
    VALUES (?, ?, ?)
""")

# Insert row by row so that a single bad row is reported and skipped instead of
# aborting the load. Failures are counted so the final message reflects the outcome.
failed_rows = 0
for _, row in df.iterrows():
    try:
        session.execute(insert_stmt, (
            row['mnth'],
            row['user_id'],
            int(row['total_clicks'])  # plain Python int instead of the pandas scalar
        ))
    except Exception as e:
        failed_rows += 1
        print(f"Помилка для рядка {row.to_dict()}: {e}")

if failed_rows == 0:
    print("Дані успішно імпортовано у most_active_users")
else:
    print(f"Імпорт у most_active_users завершено з помилками: не вставлено {failed_rows} із {len(df)} рядків")


Дані успішно імпортовано у most_active_users


High-spending advertisers by region

In [6]:

# Monthly ad spend per advertiser and targeting country.
# NOTE(review): this query sums ad_cost over all events, while the monthly
# "advertisers" load only sums events with was_clicked = 1 - confirm which is intended.
query = """
    SELECT
        DATE_FORMAT(`timestamp`, '%Y-%m')  AS mnth,
        ad.name AS advertiser_name,
        tg.country,
        SUM(ad_cost) AS total_spend
    FROM analytics_db.dim_campaign AS cam
    LEFT JOIN analytics_db.fact_event AS fe
        ON cam.campaign_id = fe.campaign_id
    LEFT JOIN analytics_db.dim_advertiser AS ad
        ON cam.advertiser_id = ad.advertiser_id
    LEFT JOIN analytics_db.dim_targeting AS tg
        ON fe.campaign_id = tg.campaign_id
    GROUP BY
        mnth,
        ad.name,
        tg.country
"""

df = pd.read_sql(query, engine)

# Keep only complete rows; the LEFT JOINs produce NULLs for campaigns without
# events or without a matching targeting/advertiser row.
df = df.dropna(subset=['mnth', 'advertiser_name', 'country', 'total_spend'])

insert_stmt = session.prepare("""
    INSERT INTO advertisers_by_regions (mnth, advertiser_name, country, total_spend)
    VALUES (?, ?, ?, ?)
""")

# Insert row by row so that a single bad row is reported and skipped instead of
# aborting the load. Failures are counted so the final message reflects the outcome.
failed_rows = 0
for _, row in df.iterrows():
    try:
        session.execute(insert_stmt, (
            row['mnth'],
            row['advertiser_name'],
            row['country'],
            float(row['total_spend'])  # plain Python float instead of the DB/pandas scalar
        ))
    except Exception as e:
        failed_rows += 1
        print(f"Помилка при вставці рядка {row.to_dict()}: {e}")

if failed_rows == 0:
    print("Дані успішно імпортовано у advertisers_by_regions")
else:
    print(f"Імпорт у advertisers_by_regions завершено з помилками: не вставлено {failed_rows} із {len(df)} рядків")


Дані успішно імпортовано у advertisers_by_regions


CQL queries

Query 1: CTR per campaign per day

In [7]:
# Read back every row of campaign_performance and print one line per campaign,
# with the CTR rounded to four decimal places.
query1 = """
SELECT campaign_name, total_impressions, total_clicks, ctr
FROM campaign_performance;
"""

rows = session.execute(query1)
print("CTR per campaign per day:")
for record in rows:
    ctr_rounded = round(record.ctr, 4)
    print(
        record.campaign_name,
        record.total_impressions,
        record.total_clicks,
        ctr_rounded,
    )


CTR per campaign per day:
Campaign_369 136 2 0.0147
Campaign_244 277 13 0.0469
Campaign_873 436 28 0.0642
Campaign_17 166 6 0.0361
Campaign_416 120 3 0.025
Campaign_785 152 7 0.0461
Campaign_216 536 29 0.0541
Campaign_389 150 7 0.0467
Campaign_133 190 10 0.0526
Campaign_264 142 7 0.0493
Campaign_339 285 10 0.0351
Campaign_363 378 15 0.0397
Campaign_666 332 16 0.0482
Campaign_55 135 8 0.0593
Campaign_482 141 6 0.0426
Campaign_763 155 2 0.0129
Campaign_92 569 34 0.0598
Campaign_100 150 11 0.0733
Campaign_618 111 4 0.036
Campaign_282 152 7 0.0461
Campaign_1007 301 11 0.0365
Campaign_778 245 9 0.0367
Campaign_914 161 5 0.0311
Campaign_426 157 4 0.0255
Campaign_891 163 10 0.0613
Campaign_255 126 6 0.0476
Campaign_269 150 11 0.0733
Campaign_751 154 8 0.0519
Campaign_179 181 10 0.0552
Campaign_800 137 7 0.0511
Campaign_866 137 8 0.0584
Campaign_579 127 6 0.0472
Campaign_48 491 19 0.0387
Campaign_86 118 4 0.0339
Campaign_610 317 19 0.0599
Campaign_396 489 28 0.0573
Campaign_526 151 6 0.0397
Ca

Query 2: Top 5 advertisers by total ad spend in the past 30 days

In [8]:
# Five advertisers with the largest total_spend among the rows with mnth = '2024-12'.
query2 = """
SELECT mnth, advertiser_name, total_spend
FROM advertisers
WHERE mnth = '2024-12'
ORDER BY total_spend DESC
LIMIT 5;
"""

rows = session.execute(query2)
print("\nTop 5 advertisers in Dec 2024:")
for advertiser in rows:
    # !s applies str(), the same conversion print() uses for its arguments
    print(f"{advertiser.advertiser_name!s} {advertiser.total_spend!s}")



Top 5 advertisers in Dec 2024:
Advertiser_58 267.38
Advertiser_10 204.65
Advertiser_68 174.99
Advertiser_30 165.54
Advertiser_32 163.8


Query 3: Last 10 ads seen by user 465518 in the past 30 days

In [12]:

# Same uuid5 derivation that is applied to user_id when user_history is loaded,
# so the lookup key matches the stored key.
user_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, f"user_{465518}")

# The UUID is bound as a query parameter (%s placeholder) instead of being
# formatted into the CQL text; the driver takes care of encoding the value.
query3 = """
SELECT date_event, ad_id, was_clicked
FROM user_history
WHERE user_id = %s
  AND date_event >= '2024-12-01' AND date_event <= '2024-12-30'
ORDER BY date_event DESC
LIMIT 10
ALLOW FILTERING;
"""

rows = session.execute(query3, (user_uuid,))

# Print the results
print(f"\nLast 10 ads seen by user {user_uuid} in the past 30 days:")
for row in rows:
    print(row.date_event, row.ad_id, row.was_clicked)



Last 10 ads seen by user 74a298f8-3ed7-59d9-aec7-468b4eedebbf in the past 30 days:
2024-12-30 d4795a0d-cdb3-5e66-9c82-a4f0e5e3f166 True


Query 4: Top 10 users with most clicks in the last 30 days

In [13]:
# Ten users with the highest total_clicks among the rows with mnth = '2024-12'.
query4 = """
SELECT mnth, user_id, total_clicks
FROM most_active_users
WHERE mnth = '2024-12'
ORDER BY total_clicks DESC
LIMIT 10;
"""

rows = session.execute(query4)
print("\nTop 10 users by clicks in the last 30 days:")
for entry in rows:
    # !s applies str(), the same conversion print() uses for its arguments
    print(f"{entry.user_id!s} {entry.total_clicks!s}")



Top 10 users by clicks in the last 30 days:
0d7aec6a-782f-544e-88df-ebce0c62110d 2
0f539266-faea-505c-889b-e096462e6502 2
661bca3b-f559-53ce-baf1-3a100a1917cd 2
68d4e3e4-c0ed-5fe2-be87-164334276272 2
7ce51918-9453-5dc0-a2fe-ad865591c877 2
99eee6c1-1170-5fb2-8ec3-2d7f66ea2bfb 2
be5b9e9c-e95b-5a40-a7f3-f7ee1eecab5f 2
bfd679ab-178d-5e24-bbe4-d0872da8bb30 2
d6f5201b-98ea-5fd4-bce0-f70ec9c8d9d3 2
ddc8c365-4c0b-5b57-a815-8c0cd53f8e1e 2


Query 5: Top 5 advertisers by spend in USA over the last 30 days

In [14]:
# Five advertisers with the largest total_spend among the rows with
# mnth = '2024-12' and country = 'USA'.
query5 = """
SELECT mnth, advertiser_name, country, total_spend
FROM advertisers_by_regions
WHERE mnth = '2024-12' AND country = 'USA'
ORDER BY total_spend DESC
LIMIT 5;
"""

rows = session.execute(query5)
print("\nTop 5 advertisers in USA over the last 30 days:")
for advertiser in rows:
    name, spend = advertiser.advertiser_name, advertiser.total_spend
    print(name, spend)



Top 5 advertisers in USA over the last 30 days:
