In [None]:
# STEP 13 — Daily conversation spikes + matching to the latest event on or before each spike day (Option B: from topics_per_doc.csv)

import pandas as pd
from pathlib import Path
from IPython.display import display


# Configuration

# Input files — adjust the paths if the data lives elsewhere.
TOPICS_PER_DOC_PATH = Path("topic0_per_doc_filtered.csv")  # per-document topic table
EVENT_PATH = Path("timeline_FULL_Done.csv")  # event timeline

# Overall date range kept from the per-document table (both ends inclusive).
DATE_MIN, DATE_MAX = "2021-01-01", "2025-09-01"

# Sub-periods to inspect, as (start date, end date, label).
ZOOM_WINDOWS = [
    ("2021-04-01", "2021-08-31", "2021_window"),
    ("2023-10-01", "2025-09-01", "post_2023_window"),
]

# Number of highest-volume days reported for each window.
N_SPIKES_PER_WINDOW = 10

# Topic id to restrict the analysis to (e.g. 0); None keeps all conversation.
FILTER_TOPIC = None

# Largest allowed gap, in days, between a spike day and its matched event.
# Matching only looks at events on or before the spike day, so this caps how
# far back the matched event may lie. None keeps every match.
MAX_EVENT_DISTANCE_DAYS = None

# Folder for the result CSVs; created up front if it does not exist yet.
OUT_DIR = Path("results")
OUT_DIR.mkdir(parents=True, exist_ok=True)


# Read the per-document topic table and roll it up to one row per day
tpd = pd.read_csv(TOPICS_PER_DOC_PATH, sep=";")
tpd.columns = tpd.columns.str.strip()  # tolerate padded header cells

# Parse timestamps as UTC, discard rows whose timestamp cannot be parsed,
# then drop the timezone so the values compare against naive timestamps.
tpd["dt"] = pd.to_datetime(tpd["dt"], errors="coerce", utc=True)
tpd = tpd.dropna(subset=["dt"])
tpd["dt"] = tpd["dt"].dt.tz_convert("UTC").dt.tz_localize(None)

# Calendar day (midnight) of every document.
tpd["date"] = tpd["dt"].dt.normalize()

# Keep only documents dated inside [DATE_MIN, DATE_MAX], bounds included.
tpd = tpd.loc[
    tpd["date"].between(pd.to_datetime(DATE_MIN), pd.to_datetime(DATE_MAX))
].copy()

# Optional restriction to a single topic.
if FILTER_TOPIC is not None:
    tpd = tpd.loc[tpd["topic"].eq(FILTER_TOPIC)].copy()

# Documents per day.
daily_totals = (
    tpd.groupby("date")
       .size()
       .reset_index(name="count")
       .sort_values("date")
)

# Documents per (day, topic) pair.
day_topic_counts = (
    tpd.groupby(["date", "topic"])
       .size()
       .reset_index(name="topic_count")
)

# Per day, the topic with the highest count: order each day's topics by
# descending count and keep the first row of every day.
dominant = (
    day_topic_counts.sort_values(by=["date", "topic_count"], ascending=[True, False])
                    .drop_duplicates("date")
                    .rename(columns={"topic": "dominant_topic", "topic_count": "dominant_topic_count"})
)


def _most_frequent_name(names):
    """Return the most frequent non-null name as a string, or "" if there is none."""
    present = names.dropna()
    if present.empty:
        return ""
    return present.astype(str).mode().iloc[0]


# Topic id -> display name, using the most common name seen for each topic.
topic_name_map = tpd.groupby("topic")["topic_name"].agg(_most_frequent_name).to_dict()
dominant["dominant_topic_name"] = dominant["dominant_topic"].map(topic_name_map)

# Daily totals joined with the dominant topic and its share of the day's volume.
daily_df = daily_totals.merge(dominant, on="date", how="left")
daily_df["dominant_topic_share"] = daily_df["dominant_topic_count"].div(daily_df["count"]).round(4)

print("Daily summary preview:")
display(daily_df.head())


# Load timeline events
#
# Reads the ";"-separated, cp1252-encoded timeline file and leaves `events`
# with two columns, `event_date` (timezone-naive timestamp) and `event`,
# sorted by date with a fresh 0..n-1 index.

events = pd.read_csv(EVENT_PATH, sep=";", encoding="cp1252")
events.columns = events.columns.str.strip()  # tolerate padded header cells

# Dates are parsed day-first; anything that cannot be parsed becomes NaT.
events["event_date"] = pd.to_datetime(events["event_date"], dayfirst=True, errors="coerce", utc=True)

# Rows without a usable date cannot be matched to a spike and are removed.
# Say so when it happens: a silently thinned timeline makes spikes fall back
# to older events without any visible sign of why.
n_bad_event_dates = int(events["event_date"].isna().sum())
if n_bad_event_dates:
    print(f"WARNING: dropping {n_bad_event_dates} timeline row(s) with a missing or unparseable event_date")
events = events.dropna(subset=["event_date"])

# Drop the timezone so the dates compare against the naive daily dates.
events["event_date"] = events["event_date"].dt.tz_convert("UTC").dt.tz_localize(None)

# Stable sort: events sharing a date keep their file order, so code that takes
# the last row on or before a given day gets a well-defined event.
events = (
    events[["event_date", "event"]]
    .sort_values("event_date", kind="mergesort")
    .reset_index(drop=True)
)

print("\nTimeline preview:")
display(events.head())

# For each window: top spikes + match to the latest event dated on or before the spike day
#
# Per window this displays the matched table and writes it to OUT_DIR as a
# ";"-separated CSV. The file name is derived from N_SPIKES_PER_WINDOW and the
# window's start/end months, so every window gets its own file and the name
# always reflects the number of spikes actually requested.

# Day-level event dates; loop-invariant, so computed once instead of per spike.
event_days = events["event_date"].dt.normalize()

for start_date, end_date, label in ZOOM_WINDOWS:
    start_dt = pd.to_datetime(start_date)
    end_dt = pd.to_datetime(end_date)

    # Daily rows inside the window (both bounds inclusive).
    zoom = daily_df[(daily_df["date"] >= start_dt) & (daily_df["date"] <= end_dt)].copy()

    # Keep the N highest-volume days, then list them chronologically.
    spikes = (
        zoom.sort_values("count", ascending=False)
            .head(N_SPIKES_PER_WINDOW)
            .sort_values("date")
            .reset_index(drop=True)
    )

    matched_rows = []
    for _, row in spikes.iterrows():
        spike_day = pd.to_datetime(row["date"]).normalize()

        # ONLY consider events on/before the spike day
        past_events = events[event_days <= spike_day]

        # Blank placeholders are written whenever there is no usable match.
        closest_event_date = ""
        closest_event_text = ""
        day_diff_out = ""

        if not past_events.empty:
            # `events` is sorted by date, so the last remaining row is the
            # most recent event on or before the spike day.
            closest = past_events.iloc[-1]
            day_diff = int((closest["event_date"].normalize() - spike_day).days)  # 0 or negative only

            # NOTE: with MAX_EVENT_DISTANCE_DAYS = None a spike is paired with
            # the latest earlier event no matter how long before it happened.
            if MAX_EVENT_DISTANCE_DAYS is None or abs(day_diff) <= MAX_EVENT_DISTANCE_DAYS:
                closest_event_date = closest["event_date"].date()
                closest_event_text = closest["event"]
                day_diff_out = day_diff

        matched_rows.append({
            "date": spike_day.date(),
            "count": int(row["count"]),
            "dominant_topic": int(row["dominant_topic"]) if pd.notna(row["dominant_topic"]) else "",
            "dominant_topic_name": row["dominant_topic_name"],
            "dominant_topic_count": int(row["dominant_topic_count"]) if pd.notna(row["dominant_topic_count"]) else "",
            "dominant_topic_share": float(row["dominant_topic_share"]) if pd.notna(row["dominant_topic_share"]) else "",
            "matched_event_date": closest_event_date,
            "matched_event": closest_event_text,
            "days_event_minus_spike": day_diff_out,  # now 0 or negative only
        })

    out_df = pd.DataFrame(matched_rows)

    print(f"\n=== Top {N_SPIKES_PER_WINDOW} DAILY spikes + event (past-only) — {label} ===")
    display(out_df)

    # Previously the name was picked by `label == "2021_window"` with every
    # other label sharing one file, and "top_10" was fixed text. Deriving it
    # yields the same two names for the configured windows.
    filename = (
        f"top_{N_SPIKES_PER_WINDOW}_daily_spikes_with_events_pastonly_"
        f"{start_dt:%Y_%m}_to_{end_dt:%Y_%m}.csv"
    )

    out_path = OUT_DIR / filename
    out_df.to_csv(out_path, index=False, sep=";", encoding="utf-8-sig")
    print(f"Saved → {out_path.resolve()}")

Daily summary preview:


Unnamed: 0,date,count,dominant_topic,dominant_topic_count,dominant_topic_name,dominant_topic_share
0,2021-01-13,9,0,8,0_hamas_gaza_genocide_palestinians,0.8889
1,2021-01-14,5,0,4,0_hamas_gaza_genocide_palestinians,0.8
2,2021-01-15,4,4,2,4_idf_soldiers_civilians_moral,0.5
3,2021-01-20,3,5,1,5_racist_racism_white_black,0.3333
4,2021-01-21,1,0,1,0_hamas_gaza_genocide_palestinians,1.0



Timeline preview:


Unnamed: 0,event_date,event
0,2020-01-01,"On 1 January 2020, Israeli forces opened fire ..."
1,2020-01-02,"On 2 January 2020, Israeli police forces assau..."
2,2020-01-06,"On 6 January 2020, Israeli forces opened fire ..."
3,2020-01-12,"On 12 January 2020, two men were seriously wou..."
4,2020-01-14,"On 14 January 2020, an Israeli settler ran ove..."



=== Top 10 DAILY spikes + event (past-only) — 2021_window ===


Unnamed: 0,date,count,dominant_topic,dominant_topic_name,dominant_topic_count,dominant_topic_share,matched_event_date,matched_event,days_event_minus_spike
0,2021-05-12,214,0,0_hamas_gaza_genocide_palestinians,125,0.5841,2021-05-12,"On 12 May 2021, Israeli aircraft have struck t...",0
1,2021-05-13,140,0,0_hamas_gaza_genocide_palestinians,100,0.7143,2021-05-13,"On 13 May 2021, Palestinian militants in the G...",0
2,2021-05-15,125,0,0_hamas_gaza_genocide_palestinians,83,0.664,2021-05-15,"On 15 May 2021, a rocket launched by Hamas, pa...",0
3,2021-05-16,194,0,0_hamas_gaza_genocide_palestinians,130,0.6701,2021-05-16,"On 16 May 2021, the Israeli forces opened fire...",0
4,2021-05-17,230,0,0_hamas_gaza_genocide_palestinians,150,0.6522,2021-05-17,"On 17 May 2021, Hamas targeted the southern Is...",0
5,2021-05-18,186,0,0_hamas_gaza_genocide_palestinians,144,0.7742,2021-05-18,"On 18 May 2021, a barrage of rockets fired by ...",0
6,2021-05-21,183,0,0_hamas_gaza_genocide_palestinians,124,0.6776,2021-05-21,"On 21 May 2021, Israeli settlers, under the pr...",0
7,2021-05-26,139,0,0_hamas_gaza_genocide_palestinians,90,0.6475,2021-05-26,"On 26 May 2021, an incendiary balloon that had...",0
8,2021-05-30,141,0,0_hamas_gaza_genocide_palestinians,85,0.6028,2021-05-30,"On 30 May 2021, a Palestinian from Gaza Strip,...",0
9,2021-06-09,140,0,0_hamas_gaza_genocide_palestinians,101,0.7214,2021-06-09,"On 9 June 2021, unidentified gunmen shot sever...",0


Saved → \\SAXOLAPXXXX\Users\Matilde\Desktop\Thesis_project_2.0_EDIT\results\top_10_daily_spikes_with_events_pastonly_2021_04_to_2021_08.csv

=== Top 10 DAILY spikes + event (past-only) — post_2023_window ===


Unnamed: 0,date,count,dominant_topic,dominant_topic_name,dominant_topic_count,dominant_topic_share,matched_event_date,matched_event,days_event_minus_spike
0,2023-10-18,768,0,0_hamas_gaza_genocide_palestinians,575,0.7487,2023-10-18,"On 18 October 2023, Palestinian health authori...",0
1,2023-10-30,705,0,0_hamas_gaza_genocide_palestinians,494,0.7007,2023-10-30,"On 30 October 2023, UNICEF’s chief warned of a...",0
2,2025-07-30,761,0,0_hamas_gaza_genocide_palestinians,577,0.7582,2025-07-28,IDF says troops uncovered weapons and tunnel n...,-2
3,2025-08-04,696,0,0_hamas_gaza_genocide_palestinians,515,0.7399,2025-08-04,Dozens killed as Palestinians scramble for aid...,0
4,2025-08-07,719,0,0_hamas_gaza_genocide_palestinians,532,0.7399,2025-08-04,Dozens killed as Palestinians scramble for aid...,-3
5,2025-08-19,698,0,0_hamas_gaza_genocide_palestinians,502,0.7192,2025-08-04,Dozens killed as Palestinians scramble for aid...,-15
6,2025-08-20,770,0,0_hamas_gaza_genocide_palestinians,538,0.6987,2025-08-04,Dozens killed as Palestinians scramble for aid...,-16
7,2025-08-22,785,0,0_hamas_gaza_genocide_palestinians,600,0.7643,2025-08-04,Dozens killed as Palestinians scramble for aid...,-18
8,2025-08-25,800,0,0_hamas_gaza_genocide_palestinians,570,0.7125,2025-08-04,Dozens killed as Palestinians scramble for aid...,-21
9,2025-08-26,720,0,0_hamas_gaza_genocide_palestinians,523,0.7264,2025-08-04,Dozens killed as Palestinians scramble for aid...,-22


Saved → \\SAXOLAPXXXX\Users\Matilde\Desktop\Thesis_project_2.0_EDIT\results\top_10_daily_spikes_with_events_pastonly_2023_10_to_2025_09.csv
