# Explore downloaded season event data

In [None]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import numpy as np
# tqdm
from tqdm import tqdm
from glob import glob

In [None]:
# Names of the flattened event-data columns to retain. Entries that are
# commented out are currently excluded from the list but kept here for reference.
# NOTE(review): no cell visible in this part of the notebook reads
# `columns_to_keep` — presumably it is applied elsewhere; confirm before relying on it.
columns_to_keep = [
    # Envelope / fixture-level identifiers
    # "clientId",
    # "clientType",
    "fixtureId",
    "organizationId",
    # "received",
    # "sport",
    # "topic",
    "type",
    # Event payload (data.*)
    "data.class",
    "data.eventId",
    "data.eventTime",
    "data.eventType",
    # "data.options.attendance",
    # "data.options.numberOfPeriods",
    # "data.options.periodLength",
    # "data.status",
    "data.subType",
    "data.timestamp",
    "data.entityId",
    # "data.options.active",
    "data.options.bib",
    # "data.options.captain",
    # "data.options.name",
    "data.options.position",
    "data.options.starter",
    # "data.personId",
    # "data.options.number",
    "data.periodId",
    "data.sequence",
    "score_home",
    "score_away",
    "data.playId",
    "data.clock",
    "data.options.goalKeeperId",
    "data.options.location",
    "data.success",
    "data.x",
    "data.y",
    "data.options.failureReason",
    "data.options.attackType",
    "data.options.value",
    "data.options.emptyNet",
    # Team metadata (team.*)
    # "team.added",
    # "team.ageGroup",
    # "team.alternateVenueIds",
    # "team.codeLatin",
    "team.codeLocal",
    # "team.defaultVenueId",
    # "team.discipline",
    # "team.entityGroupId",
    "team.entityId",
    "team.externalId",
    # "team.gender",
    # "team.grade",
    # "team.historicalNames",
    # "team.internationalReference",
    # "team.nameFullLatin",
    "team.nameFullLocal",
    # "team.organizationId",
    # "team.representing",
    # "team.standard",
    # "team.status",
    # "team.updated",
    # "team.additionalNames.namePlaceLatin",
    # "team.additionalNames.namePlaceLocal",
    # "team.additionalNames.nameShortLatin",
    # "team.additionalNames.nameShortLocal",
    "team.colors.primary",
    "team.colors.secondary",
    "team.colors.tertiary",
    # "team.contacts.email",
    # "team.contacts.fax",
    # "team.contacts.phone",
    # "team.entityGroup.id",
    # "team.entityGroup.resourceType",
    # "team.organization.id",
    # "team.organization.resourceType",
    # Player metadata (player.*)
    # "player.added",
    # "player.deceased",
    "player.dob",
    "player.externalId",
    "player.gender",
    # "player.historicalNames",
    # "player.languageLocal",
    # "player.nameAbbreviated",
    # "player.nameFamilyLatin",
    "player.nameFamilyLocal",
    # "player.nameFullLatin",
    # "player.nameFullLocal",
    # "player.nameGivenLatin",
    "player.nameGivenLocal",
    "player.nationality",
    # "player.organizationId",
    "player.personId",
    # "player.representing",
    # "player.status",
    # "player.updated",
    "player.additionalDetails.height",
    "player.additionalDetails.weight",
    # "player.organization.id",
    # "player.organization.resourceType",
    # Home/away, gameday and attacking-team columns
    "team_home_abbr",
    "team_away_abbr",
    "team_home_id",
    "team_away_id",
    "team_home_name",
    "team_away_name",
    "gameday",
    "team_attacking_id",
    "team_attacking_name",
    "team_attacking_side",
]

In [None]:
# Load every fixture CSV and stack the rows into a single DataFrame `df`.
#
# The per-file frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration, which grows quadratically with the number of files.
# Paths are sorted so the row order of `df` is reproducible between runs.
fixture_files = sorted(glob("../data/fixtures/*.csv"))

frames = []
for file in tqdm(fixture_files, desc="Processing files"):
    df_temp = pd.read_csv(file)
    # Tag each row with its originating file name (text before the first ".").
    df_temp["source"] = os.path.basename(file).split(".")[0]
    frames.append(df_temp)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no fixture files were found.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


In [None]:
# Show the column index of the combined DataFrame.
# NOTE(review): `display` is not imported in the visible cells — presumably the
# IPython/Jupyter built-in; this line will fail outside a notebook session.
display(df.columns)

In [None]:
# Side-by-side horizontal bar charts showing how often each value of
# 'data.eventType' and 'data.subType' occurs.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# (column to count, y-axis label) for the left and right panel respectively;
# each panel is titled with its column name.
panels = (
    ("data.eventType", "Event Type"),
    ("data.subType", "Sub Type"),
)
for axis, (column, y_label) in zip(ax, panels):
    df[column].value_counts().plot(kind="barh", ax=axis)
    axis.set_title(column)
    axis.set_ylabel(y_label)
    axis.set_xlabel("Count")

plt.tight_layout()
# plt.savefig("eventType_subType.png")
plt.show()

In [None]:
# Display unique values of fixtureId per gameday
def unique_fixtureid_per_gameday(df):
    """Count the distinct fixtures recorded on each gameday.

    Args:
        df: DataFrame containing "gameday" and "fixtureId" columns.

    Returns:
        Series indexed by "gameday" whose values are the number of distinct
        "fixtureId" values observed for that gameday.
    """
    fixtures_by_gameday = df.groupby("gameday")["fixtureId"]
    return fixtures_by_gameday.nunique()

# Turn the per-gameday Series into a two-column frame
# ("data.gameday", "unique_fixtureid") ready for plotting.
fixtureid_counts = (
    unique_fixtureid_per_gameday(df)
    .rename_axis("data.gameday")
    .reset_index(name="unique_fixtureid")
)

# Bar chart: one bar per gameday, bar height = number of distinct fixtures.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    fixtureid_counts["data.gameday"],
    fixtureid_counts["unique_fixtureid"],
    color="skyblue",
)
ax.set_xlabel("Gameday", fontsize=12)
ax.set_ylabel("Unique Fixture ID", fontsize=12)
ax.set_title("Unique Fixture ID per Gameday", fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go

# Sankey diagram of the data.class → data.eventType → data.subType hierarchy.

# Count occurrences of every (class, eventType, subType) combination.
# groupby drops rows in which any of the three keys is NaN (pandas default), so
# only events carrying all three fields contribute to the diagram.
counts = df.groupby(["data.class", "data.eventType", "data.subType"]).size().reset_index(name="count")

# One node per (column, value) pair. Keying on the column as well as the value
# keeps a string that occurs at two hierarchy levels from collapsing into a
# single node, which would merge unrelated flows and could create self-links.
levels = ["data.class", "data.eventType", "data.subType"]
node_keys = [(level, value) for level in levels for value in counts[level].unique().tolist()]
labels = [value for _, value in node_keys]
label_indices = {key: i for i, key in enumerate(node_keys)}

# First flow: class → eventType. Summing over subType yields exactly one link
# per (class, eventType) pair instead of one parallel link per subType row.
flow_type_event = counts.groupby(["data.class", "data.eventType"], as_index=False)["count"].sum()
sources_type_event = flow_type_event["data.class"].map(lambda v: label_indices[("data.class", v)])
targets_type_event = flow_type_event["data.eventType"].map(lambda v: label_indices[("data.eventType", v)])
values_type_event = flow_type_event["count"]

# Second flow: eventType → subType. Summing over class merges links for an
# (eventType, subType) pair that appears under more than one class.
flow_event_sub = counts.groupby(["data.eventType", "data.subType"], as_index=False)["count"].sum()
sources_event_sub = flow_event_sub["data.eventType"].map(lambda v: label_indices[("data.eventType", v)])
targets_event_sub = flow_event_sub["data.subType"].map(lambda v: label_indices[("data.subType", v)])
values_event_sub = flow_event_sub["count"]

# Combine both flows into the parallel source/target/value sequences Sankey expects.
sources = pd.concat([sources_type_event, sources_event_sub], ignore_index=True)
targets = pd.concat([targets_type_event, targets_event_sub], ignore_index=True)
values = pd.concat([values_type_event, values_event_sub], ignore_index=True)

# Plot Sankey
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        label=labels,
        line=dict(color="black", width=0.5)
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values
    )
)])

fig.update_layout(
    # The first level is data.class, so the title says "Class" (the original
    # title labelled it "Type").
    title_text="Class → EventType → SubType Flow",
    font_size=10,
    height=800,
    width=700
)
fig.show()


In [None]:
import plotly.express as px

# Draw the class → eventType → subType hierarchy three ways (sunburst, treemap,
# icicle) from the rows that have all three fields populated.
hierarchy_path = ["data.class", "data.eventType", "data.subType"]
df_filtered = df[hierarchy_path].dropna()

# Each entry pairs a plotly.express constructor with its figure title.
hierarchy_charts = (
    (px.sunburst, "Class → EventType → SubType Hierarchy"),
    (px.treemap, "Class → EventType → SubType Treemap"),
    (px.icicle, "Class → EventType → SubType Icicle"),
)

for make_chart, chart_title in hierarchy_charts:
    fig = make_chart(
        df_filtered,
        path=hierarchy_path,
        values=None,  # or use a 'count' column if the data is pre-aggregated
        title=chart_title,
    )
    fig.update_layout(margin=dict(t=40, l=10, r=10, b=10))
    fig.show()



In [None]:
# Treat a missing 'data.success' as False. This overwrites the column in `df`,
# so the filled values are what any later use of the column sees.
df["data.success"] = df["data.success"].fillna(False)

# One colour per row: green where data.success is True, red where it is False.
success_palette = {True: 'green', False: 'red'}
colors = df["data.success"].map(success_palette)

# Scatter of event coordinates coloured by success, drawn on fixed 0–100 axes.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(df["data.x"], df["data.y"], alpha=0.5, s=10, c=colors)
scatter_ax.set_title("Scatter Plot of data.x vs data.y")
scatter_ax.set_xlabel("data.x (scaled from 0 to 100)")
scatter_ax.set_ylabel("data.y (scaled from 0 to 100)")
scatter_ax.set_xlim(0, 100)
scatter_ax.set_ylim(0, 100)
scatter_ax.grid()
plt.tight_layout()
plt.show()