# Overview

We will create 4 visualizations about winners of the FIFA soccer World Cup.

In [201]:
# start with the setup

# suppress warnings about future deprecations
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None

import altair as alt
import numpy as np
import pprint
import datetime as dt
from vega_datasets import data
import matplotlib.pyplot as plt

# Solve a javascript error by explicitly setting the renderer
alt.renderers.enable('jupyterlab')

RendererRegistry.enable('jupyterlab')

In [202]:
# Load the goals, tournaments and matches tables (in that order) from GitHub.
# Hint: The fields are described here: https://github.com/jfjelstul/worldcup/blob/master/codebook/csv/variables.csv
df1, df2, df3 = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/jfjelstul/worldcup/refs/heads/master/data-csv/goals.csv",
        "https://raw.githubusercontent.com/jfjelstul/worldcup/refs/heads/master/data-csv/tournaments.csv",
        "https://raw.githubusercontent.com/jfjelstul/worldcup/refs/heads/master/data-csv/matches.csv",
    )
)

In [203]:
# Build the list of World Cup (WC) winners

# Ids of every WC from 1950 through 2022 (one tournament every four years)
tournament_ids = [f"WC-{year}" for year in range(1950, 2023, 4)]

# Keep those tournaments and the three columns needed later;
# titles won as 'West Germany' are counted under 'Germany'
df_wc_winners = (
    df2.loc[df2["tournament_id"].isin(tournament_ids), ["tournament_id", "year", "winner"]]
    .replace(['West Germany'], 'Germany')
    .reset_index(drop=True)
)

# Distinct winners in alphabetical order
wc_winners = sorted(set(df_wc_winners["winner"].to_list()))

wc_winners

['Argentina',
 'Brazil',
 'England',
 'France',
 'Germany',
 'Italy',
 'Spain',
 'Uruguay']

In [204]:
# Trim the matches table to the selected tournaments and the columns used below
match_cols = [
    "tournament_id", "match_id",
    "home_team_name", "away_team_name",
    "home_team_score", "away_team_score",
    "home_team_win", "away_team_win", "draw",
]
df4 = (
    df3.loc[df3["tournament_id"].isin(tournament_ids), match_cols]
    .replace(['West Germany'], 'Germany')
)

In [205]:
# Prepare data for Part 1

# One row per (match, WC-winning team) with the goals that team scored.
# df5 takes the away side's perspective, df6 the home side's.
df5 = (
    df4.loc[df4["away_team_name"].isin(wc_winners)]
    .drop(columns=["home_team_name", "home_team_score"])
    .rename(columns={'away_team_name': 'team', 'away_team_score': 'score'})
)
df6 = (
    df4.loc[df4["home_team_name"].isin(wc_winners)]
    .drop(columns=["away_team_name", "away_team_score"])
    .rename(columns={'home_team_name': 'team', 'home_team_score': 'score'})
)

# Stack both perspectives, order by tournament and match, drop the result flags
df7 = (
    pd.concat([df5, df6])
    .sort_values(by=['tournament_id', 'match_id'])
    .drop(columns=["home_team_win", "away_team_win", "draw"])
)

# Attach the year and eventual winner of each tournament
df_goals_winners_per_match = df7.merge(df_wc_winners, on="tournament_id")

df_goals_winners_per_match.head()

Unnamed: 0,tournament_id,match_id,team,score,year,winner
0,WC-1950,M-1950-01,Brazil,4,1950,Uruguay
1,WC-1950,M-1950-03,England,2,1950,Uruguay
2,WC-1950,M-1950-04,Spain,3,1950,Uruguay
3,WC-1950,M-1950-05,Italy,2,1950,Uruguay
4,WC-1950,M-1950-06,Brazil,2,1950,Uruguay


In [206]:
# Prepare data for Part 2

# Matches in which both the home and the away team are WC winners
both_are_winners = df4["home_team_name"].isin(wc_winners) & df4["away_team_name"].isin(wc_winners)
df8 = df4[both_are_winners].reset_index(drop=True)

# For the two halves of the heatmap, append a mirrored copy of every match
# (i.e. home and away team, score and win flag are swapped)
swap_home_away = {
    'home_team_name': 'away_team_name', 'away_team_name': 'home_team_name',
    'home_team_score': 'away_team_score', 'away_team_score': 'home_team_score',
    'home_team_win': 'away_team_win', 'away_team_win': 'home_team_win',
}
df_wc_winner_matchups = pd.concat([df8, df8.rename(columns=swap_home_away)])

# Add a year field parsed from the tournament id
df_wc_winner_matchups['year'] = pd.to_numeric(
    df_wc_winner_matchups['tournament_id'].str.replace("WC-", ''),
    downcast='integer',
    errors='coerce',
)

df_wc_winner_matchups.head()

Unnamed: 0,tournament_id,match_id,home_team_name,away_team_name,home_team_score,away_team_score,home_team_win,away_team_win,draw,year
0,WC-1950,M-1950-13,Spain,England,1,0,1,0,0,1950
1,WC-1950,M-1950-18,Uruguay,Spain,2,2,0,0,1,1950
2,WC-1950,M-1950-19,Brazil,Spain,6,1,1,0,0,1950
3,WC-1950,M-1950-22,Uruguay,Brazil,2,1,1,0,0,1950
4,WC-1954,M-1954-20,Uruguay,England,4,2,1,0,0,1954


In [207]:
# Prepare data for Part 3 and 4

# Simplify DataFrame so it lists at which minute teams scored a goal
# (own goals are ignored in this exercise).
# Built as a single chain with .assign so no column is written onto a slice of df1.
# Labels that are not a plain number once the apostrophes are removed become NaN
# (errors='coerce').
df_goals = (
    df1[["goal_id", "tournament_id", "match_id", "team_name", "minute_label"]]
    .assign(
        minute=lambda goals: pd.to_numeric(
            goals['minute_label'].str.replace("'", ''),
            downcast='integer',
            errors='coerce',
        )
    )
    .drop(columns=["minute_label"])
)

df_goals.head()

Unnamed: 0,goal_id,tournament_id,match_id,team_name,minute
0,G-0001,WC-1930,M-1930-01,France,19.0
1,G-0002,WC-1930,M-1930-01,France,40.0
2,G-0003,WC-1930,M-1930-01,France,43.0
3,G-0004,WC-1930,M-1930-01,Mexico,70.0
4,G-0005,WC-1930,M-1930-01,France,87.0


# Visualization 1: Goals of World Cup Winners since 1950

We will replicate the following visualization: <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/line.png?raw=true" alt="drawing" width="500"/>

**Description of the visualization (interactivity):**
1. When hovering over bars, the associated average score will show up as tooltips. <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/line_tooltip.gif?raw=true" alt="drawing" width="500"/>
2. Brushing over the bars will change the opacity of the bars.
3. Brushing over the bars will generate different average score value lines. <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/line_hover.gif?raw=true" alt="drawing" width="500"/>

In [208]:
## Replicate vis 1

# Keep only the matches played by the team that won that tournament
df_goals_winners_per_match_filtered = df_goals_winners_per_match.loc[
    lambda matches: matches['team'] == matches['winner']
]

## Selection: a brush along the y axis; the condition maps brushed marks to
## opacity 1.0 and the rest to 0.6
selection1 = alt.selection_interval(encodings=["y"])
condition1 = alt.condition(selection1, alt.value(1.0), alt.value(0.6))

## Bars: mean goals per match for every (year, winner)
barchart1_1 = (
    alt.Chart(df_goals_winners_per_match_filtered)
    .transform_aggregate(
        mean_goals='mean(score)',
        groupby=['winner', 'year'],
    )
    .transform_calculate(
        winner_year='datum.year + " " + datum.winner'
    )
    .mark_bar(opacity=0.6, height=15)
    .encode(
        x=alt.X('mean_goals:Q', axis=alt.Axis(labels=True, title=None)),
        y=alt.Y('winner_year:N', axis=alt.Axis(labels=True, title=None)),
        tooltip=alt.Tooltip('mean_goals:Q', format='.2f'),
    )
    .properties(
        title='Goals of World Cup Winner since 1950',
        width=700,
        height=500,
    )
)

## Vertical rule: mean score over the brushed rows
vline1_1 = (
    alt.Chart(df_goals_winners_per_match_filtered)
    .transform_calculate(
        winner_year='datum.year + " " + datum.winner'
    )
    # keep only the rows covered by the brush
    .transform_filter(selection1)
    .transform_aggregate(
        mean_score='mean(score)'
    )
    .mark_rule(size=3, color="firebrick")
    .encode(
        x=alt.X('mean_score:Q')
    )
)

## Text label next to the rule showing the same mean
text1_1 = (
    vline1_1
    .mark_text(
        color='firebrick',
        fontSize=12,
        align='left',
        dx=7,
    )
    .encode(
        text=alt.Text('mean_score:Q', format='.2f')
    )
)

## Put all together: bars carry the brush and the opacity condition
barchart1_2 = barchart1_1.add_params(selection1).encode(opacity=condition1)

final_viz1 = barchart1_2 + vline1_1 + text1_1
final_viz1

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


# Visualization 2: Matchups of World Cup winners

We will replicate the following visualization: <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/heat.png?raw=true" alt="drawing" width="500"/>

**Description of the visualization (static):**
*   Use *df_wc_winner_matchups* for this exercise
*   This visualization has 2 components: **heatmap** and **text charts** 
*   Look in the example gallery for inspiration for how to build the 1st and 2nd component (especially in sections "Simple Charts", "Advanced Calculations" and "Case Studies"): https://altair-viz.github.io/gallery/

**Description of the visualization (interactivity):**
1. When brushing over colored boxes in the heatmap, the associated text will be filtered/updated. <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/heat_interaction.gif?raw=true" alt="drawing" width="500"/>

In [209]:
soccer_ball = "⚽"

# --- Normalise every match to an alphabetically ordered (teamA, teamB) pair ---
df = df_wc_winner_matchups.copy()
df["teamA"] = df[["home_team_name", "away_team_name"]].min(axis=1)
df["teamB"] = df[["home_team_name", "away_team_name"]].max(axis=1)

# Pick the score / win flag belonging to teamA and teamB respectively
home_is_a = df["home_team_name"] == df["teamA"]
home_is_b = df["home_team_name"] == df["teamB"]
df["teamA_score"] = df["home_team_score"].where(home_is_a, df["away_team_score"])
df["teamB_score"] = df["home_team_score"].where(home_is_b, df["away_team_score"])
df["teamA_win"] = df["home_team_win"].where(home_is_a, df["away_team_win"])
df["teamB_win"] = df["home_team_win"].where(home_is_b, df["away_team_win"])

df = df.drop(columns=["home_team_name", "away_team_name",
                      "home_team_score", "away_team_score",
                      "home_team_win", "away_team_win", "draw"])

# df_wc_winner_matchups contains each match twice (as played and mirrored).
# After normalisation both copies are identical, so keep one row per match;
# otherwise every win below would be counted twice.
df = df.drop_duplicates(subset="match_id").reset_index(drop=True)

# --- Wins per matchup ---
win_counts = df.groupby(["teamA", "teamB"], as_index=False).agg(
    TotalWin_A=("teamA_win", "sum"),
    TotalWin_B=("teamB_win", "sum")
)
win_counts["matchup"] = win_counts["teamA"] + " vs " + win_counts["teamB"]
win_counts["win_diff"] = win_counts["TotalWin_B"] - win_counts["TotalWin_A"]

# Mirrored view of the same counts: the teams AND their win totals swap sides,
# so matchup and win_diff are rebuilt from the swapped columns (win_diff flips sign).
win_counts_symmetric = win_counts.rename(columns={
    "teamA": "teamB", "teamB": "teamA",
    "TotalWin_A": "TotalWin_B", "TotalWin_B": "TotalWin_A",
})
win_counts_symmetric["matchup"] = win_counts_symmetric["teamA"] + " vs " + win_counts_symmetric["teamB"]
win_counts_symmetric["win_diff"] = win_counts_symmetric["TotalWin_B"] - win_counts_symmetric["TotalWin_A"]

win_counts_full = pd.concat([win_counts_symmetric, win_counts], ignore_index=True)

# --- Match-level rows for both orientations of every pair ---
# temp is the mirrored orientation: teamA/teamB and their per-match columns swap.
temp = df.rename(columns={
    "teamA": "teamB", "teamB": "teamA",
    "teamA_score": "teamB_score", "teamB_score": "teamA_score",
    "teamA_win": "teamB_win", "teamB_win": "teamA_win",
})
df = df.merge(win_counts, on=["teamA", "teamB"], how="left")
temp = temp.merge(win_counts_symmetric, on=["teamA", "teamB"], how="left")

df_full = pd.concat([df, temp], ignore_index=True)
df_full["match_info"] = df_full.apply(lambda row: f"{row['year']} ({row['teamA_score']} : {row['teamB_score']})", axis=1)

# Hovering a heatmap cell selects that (teamA, teamB) orientation;
# nothing is selected until the mouse is over a cell.
selection = alt.selection_point(
    fields=["teamA", "teamB"], 
    on = 'mouseover',
    empty="none",
)

# Heatmap: colour encodes win_diff = wins of the y-axis team minus wins of the x-axis team
heatmap = (
    alt.Chart(df_full)
    .mark_rect()
    .encode(
        x=alt.X('teamA:N', sort='x', title=None, axis=alt.Axis(labelAngle=-50)),  
        y=alt.Y('teamB:N', sort='y', title=None),  
        color=alt.Color(
            'win_diff:Q',
            title = 'win_ratio',
            scale=alt.Scale(domainMid=0, range=['red', 'white', 'green']),
            legend=alt.Legend(
                orient='none',
                titleAnchor='start',
                legendX=730,
                legendY=30,
            ),
        ),
        tooltip= [alt.Tooltip('matchup:N', title=None)],
    )
    .add_params(selection)
    .properties(width=700, height=300)
)

heatmap

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [210]:
# All four charts below read df_full and show only the rows of the matchup
# currently selected in the heatmap.

# Headline with the matchup name, horizontally centred at x = 350
Title = (
    alt.Chart(df_full)
    .transform_filter(selection)
    .mark_text(size=35, color="white", align="center", baseline="middle")
    .encode(
        x=alt.value(350),
        text=alt.Text("matchup:N"),
    )
    .properties(width=700, height=50)
)

# Left panel: one ball per match placed at teamA's score (axis reversed)
left_goals = (
    alt.Chart(df_full)
    .transform_filter(selection)
    .mark_text(text=soccer_ball, size=30, baseline="middle")
    .encode(
        x=alt.X(
            "teamA_score:Q",
            scale=alt.Scale(domain=[0, 8], reverse=True),
            title=None,
            axis=alt.Axis(grid=True),
        ),
        y=alt.Y("match_id:N", title=None, axis=None),
    )
    .properties(width=300, height=250)
)

# Middle column: the match_info label of each match
middle_text = (
    alt.Chart(df_full)
    .transform_filter(selection)
    .mark_text(size=12, color="white", align="center")
    .encode(
        y=alt.Y(
            "match_id:N",
            axis=alt.Axis(domain=False, ticks=False, labels=False, title=None),
        ),
        text=alt.Text("match_info:N"),
    )
    .properties(width=50, height=250)
)

# Right panel: one ball per match placed at teamB's score
right_goals = (
    alt.Chart(df_full)
    .transform_filter(selection)
    .mark_text(text=soccer_ball, size=30, baseline="middle")
    .encode(
        x=alt.X(
            "teamB_score:Q",
            scale=alt.Scale(domain=[0, 8]),
            title=None,
            axis=alt.Axis(grid=True),
        ),
        y=alt.Y("match_id:N", title=None, axis=None),
    )
    .properties(width=300, height=250)
)

In [211]:
# Headline on top of the three side-by-side panels; the panels share their
# y scale and keep independent x scales.
text_chart = alt.vconcat(
    Title,
    (left_goals | middle_text | right_goals).resolve_scale(x='independent', y='shared'),
).resolve_scale(x='independent', y='independent')

# Heatmap above the text chart, without view borders
final_viz2 = (
    alt.vconcat(heatmap, text_chart)
    .resolve_scale(x='independent', y='independent')
    .configure_view(strokeWidth=0)
)

final_viz2

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


# Visualization 3: Timing of World Cup Goals

We will replicate the following visualization: <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/linegoal.png?raw=true" alt="drawing" width="500"/>

**Description of the visualization (static):**
*   Use *df_goals* for this exercise
*   This visualization has 4 components: **line chart**, **vertical line**, **points** and **texts** 

**Description of the visualization (interactive):**
1. Enable zooming and panning along the x-axis. (The gif below only displays the line chart.) <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/linegoal_zoom.gif?raw=true" alt="drawing" width="500"/>
2. Display a vertical line that moves with the mouse. This will require you to add an additional chart component (let's call it **vLine**). <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/linegoal_moveline.gif?raw=true" alt="drawing" width="500"/>
3. Display the intersection of the **vLine** with the **line chart** as 1 circle (let's call this circle **intersection dot**). 
4. When hovering over this **intersection dot**, display *how many goals were scored this minute* in a text label.   <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/linegoal_points.gif?raw=true" alt="drawing" width="500"/>

In [223]:
# Replicate vis 3

# Number of goals per minute, with the minute stored as an integer
df_minute_agg = (
    df_goals.groupby("minute")
    .size()
    .reset_index(name="Goals")
    .astype({"minute": int})
)

# Selection bound to the x scale for zooming and panning
scale = alt.selection_interval(bind='scales', encodings=['x'])

# Selection that follows the mouse along x; it drives the vertical line,
# the intersection dot and the text label
nearest = alt.selection_point(on='mouseover', encodings=['x'], nearest=True, empty=False)
opacityCondition = alt.condition(nearest, alt.value(1), alt.value(0))

# Line chart of goals per minute (carries the zoom/pan selection)
base3 = (
    alt.Chart(df_minute_agg)
    .mark_line(size=2.5)
    .encode(
        x=alt.X('minute:Q', axis=alt.Axis(labels=True, title="Minute", grid=True, tickCount=alt.TickCount(10))),
        y=alt.Y('Goals:Q'),
    )
    .add_params(scale)
    .properties(title='Timing of World Cup Goals')
)

# Invisible points that carry the `nearest` selection
selectors = (
    alt.Chart(df_minute_agg)
    .mark_point()
    .encode(
        x='minute:Q',
        opacity=alt.value(0),
    )
    .add_params(nearest)
)

# Vertical line at the selected minute
rules = (
    alt.Chart(df_minute_agg)
    .transform_filter(nearest)
    .mark_rule(size=4, color='lightgray')
    .encode(x='minute:Q')
)

# Intersection dot: opacity 1 for the selected minute, 0 otherwise
points = base3.mark_point(size=90).encode(opacity=opacityCondition)

# Text label with the goal count of the selected minute
text = base3.mark_text(
    fontSize=14, align='left', dx=7, color="lightgray"
).encode(
    text=alt.condition(nearest, 'Goals:Q', alt.value(' '), format='.2f')
)

# Put them all together
final_viz3 = alt.layer(base3, selectors, points, rules, text).properties(
    width=800, height=300
)

final_viz3

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


# Visualization 4: Zooming in on Goals of World Cup winners since 1950

We will replicate the following visualization: <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/zoom.png?raw=true" alt="drawing" width="800"/>

**Description of the visualization (static):**
*   Use *df_goals* for this exercise
*   This visualization has 2 components: **scatter chart original** and **line chart zoomed in** 

**Description of the visualization (interactivity):**
1. Build drop down selections for the home and away team. Theoretically, two teams will be shown at any given time. <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/zoom_select.gif?raw=true" alt="drawing" width="400"/>
2. Brushing over the scatter chart will change the color of the points. <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/zoom_mouse_select.gif?raw=true" alt="drawing" width="400"/>
3. Brushing over the scatter chart will filter out the associated time interval to create a line chart. <br>
<img src="https://raw.githubusercontent.com/grill/SI649-hw-interaction/main/zoom_interaction.gif?raw=true" alt="drawing" width="800"/>

In [224]:
# Goals scored by teams that have won a World Cup.
# .copy() makes df_4 an independent frame, so columns added to it later are not
# written onto a filtered slice of df_goals.
df_4 = df_goals[df_goals['team_name'].isin(wc_winners)].copy()
# NOTE(review): the chart title says "since 1950", but no year filter is applied
# here - confirm whether goals from tournaments before 1950 should be excluded.

In [225]:
# Goals = number of goals the same team scored in the same minute (all matches pooled).
# .assign builds a new frame instead of writing a column onto the filtered slice.
df_4 = df_4.assign(
    Goals=df_4.groupby(['minute', "team_name"])['team_name'].transform('count')
)
# NOTE(review): df4 held the trimmed matches table in the data-preparation cells;
# from here on it refers to the per-goal data instead.
df4 = df_4.copy()

In [215]:
# Team names available in the two dropdowns, in alphabetical order
teams = sorted(df4['team_name'].unique())

# Dropdown for the first team; starts on the first name
team1_param = alt.param(
    name='Team1',
    bind=alt.binding_select(options=teams, name='Team 1'),
    value=teams[0],
)

# Dropdown for the second team; starts on the second name when there is one
team2_param = alt.param(
    name='Team2',
    bind=alt.binding_select(options=teams, name='Team 2'),
    value=teams[1 if len(teams) > 1 else 0],
)

# Expression that is true for rows of either selected team
team_filter = alt.expr("datum.team_name === Team1 || datum.team_name === Team2")

# Rectangular brush over both axes
brush = alt.selection_interval(encodings=['x','y'], name="time_brush")


In [None]:
# Scatter of goals per minute for the two teams chosen in the dropdowns;
# this chart carries the dropdown parameters and the brush.
scatter = (
    alt.Chart(df4)
    .transform_filter("datum.team_name == Team1 || datum.team_name == Team2")
    .mark_circle(size=50, opacity=0.8)
    .encode(
        x=alt.X('minute:Q', title='Minute'),
        y=alt.Y(
            'Goals:Q',
            title='Goals',
            scale=alt.Scale(domain=[0,15]),
            axis=alt.Axis(values=[0,5,10,15]),
        ),
        color=alt.Color('team_name:N', title='team_name', scale=alt.Scale(range=['blue', 'orange'])),
        tooltip=['team_name', 'minute', 'Goals'],
    )
    .add_params(team1_param, team2_param, brush)
    .properties(
        title='Zooming in on Goals of World Cups winner since 1950',
        width=400,
        height=200,
    )
)

scatter

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [217]:
# Line chart of the same two teams, restricted to the region brushed in the scatter
line = (
    alt.Chart(df4)
    .transform_filter("datum.team_name == Team1 || datum.team_name == Team2")
    .transform_filter(brush)
    .mark_line(strokeWidth=2.5)
    .encode(
        x=alt.X('minute:Q', title='Minute'),
        y=alt.Y(
            'Goals:Q',
            title='Goals',
            scale=alt.Scale(domain=[0,15]),
            axis=alt.Axis(values=[0,5,10,15]),
        ),
        color=alt.Color('team_name:N', title='Team'),
        tooltip=['team_name', 'minute', 'Goals'],
    )
    .add_params(team1_param, team2_param)
    .properties(width=400, height=200)
)

# Scatter and line side by side, each with its own colour scale
final_viz4 = alt.hconcat(scatter, line).resolve_scale(color='independent')
final_viz4

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


# Final part

Export all of your visualizations to HTML, then put them all into a single HTML file (as we covered in the lab this week).

Upload this .html file to canvas, along with this notebook

In [226]:
# Write each visualization to its own standalone HTML file
for html_path, viz in (
    ('chart1.html', final_viz1),
    ('chart2.html', final_viz2),
    ('chart3.html', final_viz3),
    ('chart4.html', final_viz4),
):
    viz.save(html_path)

In [227]:
# Master page that embeds the four exported charts as iframes.
# This is a plain string (not an f-string and never passed to .format), so the CSS
# must use single braces; doubled braces would be written out literally and
# invalidate every rule.
# With the stylesheet in effect the page background is black, so the text colour is
# set to white to keep the headings readable, and no CSS height is set on the
# iframes so their height="600" attribute is honoured.
html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>All Altair Charts</title>
    <style>
        body {
            text-align: center;
            font-family: Arial, sans-serif;
            background-color: black;
            color: white;
        }
        .chart-container {
            width: 100%;  /* Ensure full width */
            border: none; /* Remove iframe border */
        }
    </style>
</head>
<body>
    <h1>All Visualizations</h1>

    <h2>Visualization 1</h2>
    <iframe class="chart-container" src="chart1.html" width="100%" height="600"></iframe>

    <h2>Visualization 2</h2>
    <iframe class="chart-container" src="chart2.html" width="100%" height="600"></iframe>

    <h2>Visualization 3</h2>
    <iframe class="chart-container" src="chart3.html" width="100%" height="600"></iframe>

    <h2>Visualization 4</h2>
    <iframe class="chart-container" src="chart4.html" width="100%" height="600"></iframe>

</body>
</html>
"""

# Save the master HTML file
with open("all_charts.html", "w", encoding="utf-8") as f:
    f.write(html_content)

print("Master HTML file saved as 'all_charts.html'. Open it in a browser.")


Master HTML file saved as 'all_charts.html'. Open it in a browser.
