In [12]:
import csv
import math

# Collect the distinct event IDs found in the 'event_id' column of the dataset.
event_ids = set()

# newline='' lets the csv module handle line endings itself, so quoted fields
# containing embedded newlines are parsed correctly.
with open('cleaned_full_dataset.csv', 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # DictReader pads short rows with None and always exposes every header
        # key, so a plain "key in row" check does not filter anything. Skip
        # missing/blank values explicitly: a None would make sorted() below
        # fail, and a blank would be written out as an empty line.
        event_id = (row.get('event_id') or '').strip()
        if event_id:
            event_ids.add(event_id)

# Convert to a sorted list. The IDs are strings, so the order is
# lexicographic ('10' sorts before '9'), not numeric.
unique_event_ids = sorted(event_ids)
total_ids = len(unique_event_ids)

# Split into chunks of `chunk_size` IDs; the last chunk may be shorter.
chunk_size = 2000
num_chunks = math.ceil(total_ids / chunk_size)

for i in range(num_chunks):
    chunk = unique_event_ids[i * chunk_size:(i + 1) * chunk_size]
    # Output files are numbered from 1, one event ID per line.
    filename = f'unique_event_ids_part_{i+1}.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{eid}\n" for eid in chunk)
    print(f"Saved {len(chunk)} event IDs to {filename}")

print(f"Total unique event IDs: {total_ids}")
print(f"Total text files created: {num_chunks}")


Saved 2000 event IDs to unique_event_ids_part_1.txt
Saved 2000 event IDs to unique_event_ids_part_2.txt
Saved 2000 event IDs to unique_event_ids_part_3.txt
Saved 2000 event IDs to unique_event_ids_part_4.txt
Saved 2000 event IDs to unique_event_ids_part_5.txt
Saved 2000 event IDs to unique_event_ids_part_6.txt
Saved 2000 event IDs to unique_event_ids_part_7.txt
Saved 2000 event IDs to unique_event_ids_part_8.txt
Saved 2000 event IDs to unique_event_ids_part_9.txt
Saved 2000 event IDs to unique_event_ids_part_10.txt
Saved 2000 event IDs to unique_event_ids_part_11.txt
Saved 2000 event IDs to unique_event_ids_part_12.txt
Saved 2000 event IDs to unique_event_ids_part_13.txt
Saved 2000 event IDs to unique_event_ids_part_14.txt
Saved 2000 event IDs to unique_event_ids_part_15.txt
Saved 2000 event IDs to unique_event_ids_part_16.txt
Saved 2000 event IDs to unique_event_ids_part_17.txt
Saved 2000 event IDs to unique_event_ids_part_18.txt
Saved 2000 event IDs to unique_event_ids_part_19.txt
Sa

In [1]:
# goals_only_scraper.py ----------------------------------------------------
"""
Reads event IDs from EVENT_FILE, fetches the incidents of type 'goal',
and writes JSON Lines: one line = {event_id, goals:[…]}.
"""
import json, pathlib, requests, time

# ─── CONFIG ───────────────────────────────────────────────────────────────
EVENT_FILE = "unique_event_ids_part_17.txt"   # input file of event IDs, one per line
OUT_FILE   = pathlib.Path("event_goals17.json")        # output path (content is JSON Lines)
HEADERS    = {"User-Agent": "Mozilla/5.0"}  # HTTP headers sent with every API request
DELAY_S    = 0.3                                      # pause between requests, in seconds (API throttle)
# ──────────────────────────────────────────────────────────────────────────

def read_event_ids(path: str) -> list[int]:
    """Read integer event IDs from a UTF-8 text file, one ID per line.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    that are not made up solely of decimal digits (e.g. text, signed or
    fractional numbers) are skipped.

    Args:
        path: Path of the text file to read.

    Returns:
        The IDs as integers, in file order (duplicates are kept).

    Raises:
        OSError: If the file cannot be opened.
    """
    ids = []
    with open(path, encoding="utf-8") as f:
        for raw_line in f:
            token = raw_line.strip()
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as "²" that int() cannot parse, which would
            # abort the whole load with a ValueError.
            if token.isdecimal():
                ids.append(int(token))
    return ids

# ------------------------------------------------------------------------
def fetch_goals(event_id: int) -> list[dict]:
    """Return the list of goals for one match (empty if there are none).

    Requests the incidents of `event_id` from the API and keeps only those
    whose "incidentType" is "goal".

    Args:
        event_id: Identifier of the event, interpolated into the request URL.

    Returns:
        One dict per goal with the keys incident_id, minute, is_home,
        home_score, away_score, scorer ({id, name}), assists (list of
        {id, name}) and goal_type.

    Raises:
        requests.HTTPError: Raised by raise_for_status() on an HTTP error
            status. Other exceptions from requests.get() propagate as well.
    """

    def _person(obj: dict) -> dict:
        # Compact {id, name} view of a player; falls back to the short name.
        return {"id": obj.get("id"), "name": obj.get("name") or obj.get("shortName")}

    response = requests.get(
        f"http://api.sofascore.com/api/v1/event/{event_id}/incidents",
        headers=HEADERS,
        timeout=20,
    )
    response.raise_for_status()

    goals = []
    for incident in response.json().get("incidents", []):
        if incident.get("incidentType") == "goal":
            # Empty list, or one or two entries depending on which assist
            # fields are present and non-empty.
            assists = []
            for assist_key in ("assist1", "assist2"):
                assistant = incident.get(assist_key)
                if assistant:
                    assists.append(_person(assistant))

            goal = {
                "incident_id": incident.get("id"),
                "minute": incident.get("time"),
                "is_home": incident.get("isHome"),
                "home_score": incident.get("homeScore"),
                "away_score": incident.get("awayScore"),
                # A missing or null "player" yields {"id": None, "name": None}.
                "scorer": _person(incident.get("player") or {}),
                "assists": assists,
                "goal_type": incident.get("goalType", "regular"),
            }
            goals.append(goal)
    return goals

# ------------------------------------------------------------------------
def main():
    """Fetch the goals of every event in EVENT_FILE and write them to OUT_FILE.

    OUT_FILE is opened in "w" mode, so any previous content is replaced. One
    JSON object {"event_id": ..., "goals": [...]} is written per line. An
    event whose processing raises is reported on stdout and produces no
    output line; the loop then continues with the next ID. DELAY_S seconds
    are slept after every event, successful or not.
    """
    event_ids = read_event_ids(EVENT_FILE)
    print(f"Fetching goals for {len(event_ids)} events…")

    with OUT_FILE.open("w", encoding="utf-8") as sink:
        for event_id in event_ids:
            try:
                event_goals = fetch_goals(event_id)
                record = json.dumps(
                    {"event_id": event_id, "goals": event_goals},
                    ensure_ascii=False,
                )
                sink.write(f"{record}\n")
                print(f"✓ {event_id}: {len(event_goals)} goals")
            except Exception as err:
                # Best effort: log the failure and move on to the next event.
                print(f"⚠ {event_id}: {err}")
            # Throttle requests regardless of the outcome.
            time.sleep(DELAY_S)

    print(f"\n✅ Fichier généré → {OUT_FILE.resolve()}")

# ─── EXECUTION ────────────────────────────────────────────────────────────
# Run the scraper only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Fetching goals for 2000 events…
✓ 235443: 0 goals
✓ 235444: 0 goals
✓ 235735: 0 goals
✓ 236428: 0 goals
✓ 2372341: 0 goals
✓ 2372353: 0 goals
✓ 2372355: 0 goals
✓ 2372357: 0 goals
✓ 2385013: 0 goals
✓ 2387029: 0 goals
✓ 2387031: 0 goals
✓ 2387033: 0 goals
✓ 2389475: 0 goals
✓ 2391661: 0 goals
✓ 2391689: 0 goals
✓ 2391699: 0 goals
✓ 2392083: 0 goals
✓ 2392085: 0 goals
✓ 2392223: 0 goals
✓ 2393393: 0 goals
✓ 239376: 0 goals
✓ 239381: 0 goals
✓ 239382: 0 goals
✓ 2394017: 0 goals
✓ 2395162: 0 goals
✓ 2395164: 0 goals
✓ 2404024: 0 goals
✓ 2404026: 0 goals
✓ 2404028: 0 goals
✓ 2405994: 0 goals
✓ 2405996: 0 goals
✓ 2407762: 0 goals
✓ 2407764: 0 goals
✓ 2407766: 0 goals
✓ 2407770: 0 goals
✓ 2412784: 0 goals
✓ 2412844: 0 goals
✓ 2416728: 0 goals
✓ 2417564: 0 goals
✓ 2417566: 0 goals
✓ 2421188: 0 goals
✓ 2430396: 0 goals
✓ 2436896: 0 goals
✓ 2436898: 0 goals
✓ 2436900: 0 goals
✓ 2436902: 0 goals
✓ 2436942: 0 goals
✓ 2436944: 0 goals
✓ 2436946: 0 goals
✓ 2436948: 0 goals
✓ 2436952: 0 goals
✓ 2436