**DATA 5310: Data Visualization | Seattle University | Fall 2023**

Samikshya Pandey, McKenzie Maidl, and Emma Oriol

This file uses the cleaned 'goals' data to create visualizations focused on three topics: general game trends, team trends, and player trends.

In [1]:
# library imports
import pandas as pd
import numpy as np
import altair as alt
import json
import pycountry_convert as pc

# enable Altair plots for more than 5,000 data points
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

### Load Data

In [2]:
# read the cleaned goal-level table (each row carries a scorer and a minute)
# and preview the first rows
goals_csv = 'Data/goals.csv'
goals = pd.read_csv(goals_csv)
goals.head()

Unnamed: 0,date,home_team,away_team,scorer_team,scorer,minute,own_goal,penalty,point_earned,home_goals,away_goals,winner,winner_team,penalty_point,home_penalties,away_penalties
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False,1,0,4,Uruguay,Away,0,0,0
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False,1,0,4,Uruguay,Away,0,0,0
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False,1,0,4,Uruguay,Away,0,0,0
3,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,False,1,0,4,Uruguay,Away,0,0,0
4,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,2.0,False,False,1,6,1,Argentina,Home,0,2,0


### Set Theme

In [3]:
# set custom theme
def custom_theme():
    """Return the Altair theme dictionary used by the charts in this notebook.

    The theme fixes the default view size, the axis/title/legend font sizes
    and the default mark fill colour.
    """
    # axis and legend share the same label/title font sizes
    label_size, title_size = 12, 13
    text_sizes = dict(labelFontSize=label_size, titleFontSize=title_size)

    config = dict(
        view=dict(height=400, width=600),
        axis=dict(text_sizes),
        title=dict(fontSize=14),
        legend=dict(text_sizes),
        mark=dict(fill='#005391'),
    )
    return dict(config=config)

# make the theme known to Altair under one name, then activate it
theme_name = 'custom_theme'
alt.themes.register(theme_name, custom_theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('custom_theme')

### Game Plots

The following plots explore game-level trends.

#### Plot 1: Average Goals per Game

##### Prepare Data for Plot

In [4]:
# Work on a copy of the goal-level table with two derived columns:
# the home/away pairing and the combined score of the match.
total_goal = goals.assign(
    team_pair=goals['home_team'] + '_' + goals['away_team'],
    total_goals=goals['home_goals'] + goals['away_goals'],
)

# keep a single row per (team pair, date), i.e. one row per game played
total_goal = total_goal.groupby(['team_pair', 'date']).first().reset_index()

# average goals per game = all goals / number of games
n_games = len(total_goal['total_goals'])
avg_goal_per_game = total_goal['total_goals'].sum() / n_games

# one-row frame that feeds the "average" rule and its label in the plot
average_score = pd.DataFrame({'target_value': [avg_goal_per_game]})

##### Create Plot

In [5]:
# Histogram of total goals per game with the overall average marked.
# The three layers share one x domain so the rule and label line up with the bars.
goal_domain = [-0.5, 13.5]

# read the average from the data so the label can never drift from the rule
mean_goals = average_score['target_value'].iloc[0]

# distribution of goals per game
avg_goals = alt.Chart(total_goal).mark_bar(size=20, clip=True
    ).encode(
      alt.X('total_goals:Q', title='Number of Goals',
            scale=alt.Scale(domain=goal_domain)),
      alt.Y('count():Q', title='Number of Matches')
    ).properties(title='Distribution of Goals per Game', height=300)

# add average line
line_chart = alt.Chart(average_score).mark_rule(color='darkred'
    ).encode(
        alt.X('target_value:Q', scale=alt.Scale(domain=goal_domain))
    )

# label line (text is computed, not hard-coded)
text_chart = alt.Chart(average_score).mark_text(
        text=f'Average Score {mean_goals:.2f}', color='darkred',
        align='left', baseline='middle', dx=5, dy=15
    ).encode(
        alt.X('target_value:Q', scale=alt.Scale(domain=goal_domain)),
        alt.YValue(0)
    )

# combine charts
games1 = alt.layer(avg_goals, line_chart, text_chart)
games1

#### Plot 2: Time Between Goals

After a team scores a goal, how quickly does the next goal follow?

##### Prepare Data for Plot

In [6]:
goals_timediff = goals.copy()

# a match is identified by its date and the two teams
match_keys = ['date', 'home_team', 'away_team']
by_match = goals_timediff.groupby(match_keys)

# minutes since the previous goal of the same match (NaN for a match's first goal);
# relies on the rows of a match being in scoring order, as in the source file
goals_timediff['time_between_goals'] = by_match['minute'].diff()

# team that scored the previous goal *within the same match*; shifting the
# whole column instead would compare a match's first goal with the last goal
# of the preceding match
previous_scorer_team = by_match['scorer_team'].shift()

# 'Same' / 'Opposite' relative to the previous goal; left missing when there
# is no previous goal in the match
goals_timediff['next_goal_team'] = (
    goals_timediff['scorer_team']
    .eq(previous_scorer_team)
    .map({True: 'Same', False: 'Opposite'})
    .where(previous_scorer_team.notna())
)

# determine teams in match
goals_timediff['team_pair'] = goals_timediff['home_team'] + '_' + goals_timediff['away_team']

# calculate total goals per game
goals_timediff['total_goals'] = goals_timediff['home_goals'] + goals_timediff['away_goals']

##### Create Plot

In [7]:
# Histogram of the minutes between consecutive goals, faceted by whether the
# same team or the opposing team scored the later goal.
games2 = alt.Chart(goals_timediff).mark_bar(clip = True).transform_filter(
        # time_between_goals is null for the first goal of a match
        'datum.time_between_goals != null'
    ).transform_filter(
        # keep only rows that carry a Same/Opposite classification
        'datum.next_goal_team != null'
    ).encode(
        alt.X('time_between_goals:Q',title='Minutes Between Goals', scale=alt.Scale(domain=[0,80])),
        # each remaining row is one goal, so the bars count goals, not matches
        alt.Y('count():Q', title = 'Number of Goals'),
        alt.Color('next_goal_team:N', scale=alt.Scale(range=['#005391', '#f58518']),
                  title='Scoring Team of Next Goal', 
                  legend=alt.Legend(orient='bottom')),
        alt.Column('next_goal_team:N', header=None)  
    ).properties(width=300, height=300,
                 title=alt.Title(
                           text='Time Between Consecutive Goals by the Same and Opposite Teams',
                           anchor='middle')
                    )

games2

#### Plots 3 and 4 : Impacts of Rule Changes to Goals and Penalties

Three major rule changes in football:
1. 1970: Introduction of red and yellow cards. A player shown a red card is sent off and their team continues with 10 players; a player who receives two yellow cards also receives a red card.
2. 2012: Goal-line technology was introduced: the use of an electronic aid to determine whether a goal has been scored.
3. 2018: VAR was introduced: officials use video footage to make match decisions such as goal/no goal and penalty/no penalty, among others.

##### Prepare Data for Plots

In [8]:
# add year field
total_goal['date'] = pd.to_datetime(total_goal['date'])
total_goal['year'] = total_goal['date'].dt.year

# total penalties
total_goal['total_penatly'] = total_goal['home_penalties'] + total_goal['away_penalties']

##### Create Plots

In [9]:
def _rule_for_year(year):
    """Dark-red vertical rule drawn from the games of `total_goal` played in `year`."""
    return alt.Chart(total_goal).mark_rule(color='darkred').transform_filter(
        f'datum.year == "{year}"').encode(alt.X('year:O'))


def _rule_label(label, dx):
    """Data-less text mark pinned to the top of the view and shifted `dx` pixels sideways."""
    return alt.Chart().mark_text(
        text=label, color='darkred', align='right',
        baseline='middle', lineBreak='\n', dx=dx, dy=10, size=12).encode(alt.YValue(0))


# lines marking the years in which new rules took effect
year_1970 = _rule_for_year(1970)
year_2012 = _rule_for_year(2012)
year_2018 = _rule_for_year(2018)

# captions for the lines; dx offsets are hand-tuned pixel positions
text_1970 = _rule_label('1970 \nCard', dx=-215)
text_2012 = _rule_label('2012 \nGoal Line', dx=185)
text_2018 = _rule_label('2018 \nVAR', dx=245)

# pair every line with its caption
final_1970 = year_1970 + text_1970
final_2012 = year_2012 + text_2012
final_2018 = year_2018 + text_2018

In [10]:
# years for axis labels
# every fifth year from 1960 through 2020
years = list(range(1960, 2021, 5))

In [11]:
# plot 3: total goals
# Yearly sum of goals (over all games after 1960) with the rule-change markers.
# The title states "Total ... per Year" because the aggregate is a sum, not an average.
base_totalgoal = alt.Chart(total_goal).mark_line(fill=None).transform_aggregate(
    total_goals='sum(total_goals)',
    groupby=['year']
).transform_filter(
    'datum.year > 1960'
).encode(
    alt.X('year:O',title = 'Year', axis=alt.Axis(labelAngle=-45, values=years)),
    alt.Y('total_goals:Q', title='Total Goals'),
).properties(title = 'Impact of Rule Changes on Total Goals per Year', height=350)

# rule markers first, trend line last (same layer order as before)
games3 = final_1970 + final_2012 + final_2018 + base_totalgoal
games3

In [12]:
# plot 4: penalties
# Yearly sum of penalties (over all games after 1960) with the rule-change markers.
# The title states "Total ... per Year" because the aggregate is a sum, not an average.
base_chart_penalty = alt.Chart(total_goal).mark_line(fill=None).transform_aggregate(
        total_penatly='sum(total_penatly)',
        groupby=['year']
    ).transform_filter(
        'datum.year > 1960'
    ).encode(
        alt.X('year:O',title = 'Year', axis=alt.Axis(labelAngle=-45, values=years)),
        alt.Y('total_penatly:Q', title = 'Total Penalties'),
    ).properties(title = 'Impact of Rule Changes on Total Penalties per Year', height=350)

games4 = alt.layer(base_chart_penalty, final_1970 , final_2012 , final_2018)
games4

### Team Plots

The following plots explore team-level trends.

##### Prepare Data for Plots

All team plots use the same datasets.

In [13]:
# columns that are constant within a game; de-duplicating them turns the
# goal-level table into one row per game
game_cols = ['date', 'home_team', 'away_team', 'home_goals', 'away_goals', 
        'winner', 'winner_team', 'home_penalties', 'away_penalties']

games = (
    goals[game_cols]
    .drop_duplicates()
    # calendar year of the game, taken from its date
    .assign(year=lambda frame: frame['date'].astype('datetime64[ns]').dt.year)
    .reset_index(drop=True)
)

In [14]:
# columns of the team-level table: one row per (game, participating team)
team_cols = ['date', 'year', 'team', 'opponent', 'goals', 'opponent_goals', 
        'penalties_awarded', 'penalties_committed', 'won', 'home']


def _team_view(side, home_flag):
    """Return `games` seen from one side ('home' or 'away') in `team_cols` layout."""
    other = 'away' if side == 'home' else 'home'
    view = games.copy().rename(columns={
        f'{side}_team': 'team',
        f'{other}_team': 'opponent',
        f'{side}_goals': 'goals',
        f'{other}_goals': 'opponent_goals',
        f'{side}_penalties': 'penalties_awarded',
        f'{other}_penalties': 'penalties_committed',
    })
    # 1 when this side is the recorded winner, else 0
    view['won'] = np.where(view['team'] == view['winner'], 1, 0)
    view['home'] = home_flag
    return view[team_cols]


# each game contributes one row for the home side and one for the away side
home_teams = _team_view('home', 1)
away_teams = _team_view('away', 0)

# stack both perspectives under a fresh 0..n-1 index
teams = pd.concat([home_teams, away_teams], axis=0, ignore_index=True)

# get continent from country
def get_continent(name):
    """Return the continent name for a country name, or None when it cannot be resolved.

    The name is converted to an ISO alpha-2 code and then to a continent code
    with `pycountry_convert`. Names the library does not know, and continent
    codes missing from the table below, yield None so callers can fill them in
    afterwards.
    """
    continents = {'SA': 'South America', 'EU': 'Europe', 'NA': 'North America', 
                  'AF': 'Africa', 'AS': 'Asia', 'OC': 'Oceania', None: None}
    try:
        code = pc.country_name_to_country_alpha2(name)
        cont = pc.country_alpha2_to_continent_code(code)
    except (KeyError, TypeError):
        # unknown or unusable country name: deliberate best-effort, but only
        # lookup failures are swallowed rather than every exception
        return None
    # .get keeps the "unmapped continent code -> None" behaviour explicit
    return continents.get(cont)

# look up each team's continent; names the lookup cannot resolve come back empty
teams['team_continent'] = teams['team'].apply(get_continent)

# manual continent assignments for specific team names; these take precedence
# over whatever the lookup returned
continent_overrides = {
    'Europe': ['Republic of Ireland','Northern Ireland','Wales', 
               'Scotland','England', 'Kosovo'],
    'Asia': ['China PR', 'Timor-Leste'],
    'Africa': ['DR Congo'],
    'Oceania': ['Tahiti'],
}
for continent, team_names in continent_overrides.items():
    listed = teams['team'].isin(team_names)
    teams.loc[listed, 'team_continent'] = continent

In [15]:
# per-team summary table that feeds the world map
teams_sum = teams.copy()

# first and most recent year in which each team appears
years_by_team = teams_sum.groupby(['team'])['year']
teams_sum['min_year'] = years_by_team.transform('min')
teams_sum['latest_year'] = years_by_team.transform('max')

def bin_years(year):
    """Map a year to the label of the period it falls in.

    Each period includes its upper bound, so the result agrees with the label
    text (1940 -> '1916-1940', 2000 -> '1981-2000'). A missing year (None or
    NaN) returns None instead of being pushed into the latest period.
    """
    # NaN is the only value that is not equal to itself
    if year is None or year != year:
        return None
    periods = (
        (1940, '1916-1940'),
        (1960, '1941-1960'),
        (1980, '1961-1980'),
        (2000, '1981-2000'),
    )
    for last_year, label in periods:
        if year <= last_year:
            return label
    return '2001-2023'
    
# period label of each team's first appearance
teams_sum['min_year_binned'] = teams_sum['min_year'].map(bin_years)

# number of rows (games) per team
teams_sum['count_games'] = teams_sum.groupby(['team'])['team'].transform('count')

# ISO 3166-1 table, joined on the country name; unmatched teams are kept
iso = pd.read_csv('Data/iso_3166.csv')

# game-level columns (plus the join key) that the per-team summary does not need
game_level_cols = ['date','year', 'opponent','goals','opponent_goals','penalties_awarded',
                   'penalties_committed','won','home','name']

teams_sum = (
    teams_sum
    .merge(iso, left_on='team', right_on='name', how='left')
    .drop(columns=game_level_cols)
    # after dropping the game-level columns the rows of a team are identical
    .drop_duplicates()
    .reset_index(drop=True)
)

In [16]:
# world topo, edited to add Kosovo
world_topo = alt.topo_feature(url='Data/world_topo.json', feature='countries')

#### Plot 1: First Appearance by Team

##### Create Plot

In [17]:
# plot 1: world map coloured by the period of each team's first appearance
domain = ['1916-1940', '1941-1960', '1961-1980', '1981-2000', '2001-2023', 'No Data']
colors = ['#005391', '#0074cc', '#1a9cff', '#66bdff', '#b3deff', '#E0E0E0']

# countries with a period get the period colour, all others a neutral grey
appearance_color = alt.condition(
    'datum.min_year_binned != null', 'min_year_binned:N', alt.value('#E0E0E0'), 
    scale=alt.Scale(domain=domain, range=colors),
    title='Appearance Year')

# attach the period to each map feature through the shared 'id' field
period_lookup = alt.LookupData(data=teams_sum, key='id', fields=['min_year_binned'])

teams1 = (
    alt.Chart(world_topo)
    .mark_geoshape(stroke='white', strokeWidth=0.5)
    # remove Antarctica
    .transform_filter('datum.id!=10')
    .project(type='naturalEarth1')
    .transform_lookup(lookup='id', from_=period_lookup)
    .encode(color=appearance_color)
    .properties(
        height=350,
        width=750,
        title="First Appearance in Men's International Football (1916-2023)")
)

teams1

#### Plot 2: Teams with Highest Percent of Wins by Year

##### Create Plot

In [18]:
# Plot 2: for every year, the team with the highest win rate among the teams
# that played at least half as many games as the busiest team of that year.
domain = ['Africa', 'Asia', 'Europe', 'North America', 'South America', 'Multiple']
colors = ['#4c78a8', '#f58518', '#e45756', '#72b7b2', '#54a24b', '#b279a2']

base = alt.Chart(teams).mark_bar(
    ).transform_aggregate(
        # count of games and wins by team/year
        count_games='count(won)',
        count_wins='sum(won)',
        groupby=['team', 'year', 'team_continent']
    ).transform_window(
        # max game count by year
        max_games='max(count_games)',
        frame=[None, None],
        groupby=['year']
    ).transform_calculate(
        # half of the maximum game count (this is the cutoff, not a median)
        mid='datum.max_games/2',
        # win-rate
        prop_win='datum.count_wins/datum.count_games'
    ).transform_filter(
        # teams that played at least half the maximum number of games that year
        'datum.count_games >= datum.mid'
    ).transform_window(
        # team(s) with highest win-rate
        max_prop='max(prop_win)',
        frame=[None, None],
        groupby=['year']
    ).transform_filter(
        # filter out teams without highest win-rate
        'datum.prop_win == datum.max_prop'
    ).transform_joinaggregate(
        # max games for teams with highest win-rates
        count_games_max='max(count_games)',
        groupby=['year']
    ).transform_filter(
        # when there is a tie, take the team who played the most games
        'datum.count_games == datum.count_games_max'
    ).transform_calculate(
        # 1996 and 2018 still have a tie; logical && rather than bitwise &
        team_continent_calc='(datum.year != 1996 && datum.year != 2018) ? datum.team_continent : "Multiple"',
    ).encode(
        alt.Y('year:O', title='Year', 
              axis=alt.Axis(values=[1920, 1925, 1930, 1935, 1939, 1945, 1950, 1955, 1960, 1965,
                                    1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020])),
        alt.X('mean(max_prop):Q', title='Percent Games Won',
              axis=alt.Axis(format='%')),
        alt.Color('team_continent_calc:N', title='Team Continent', 
                  scale=alt.Scale(domain=domain, range=colors),
                  legend=alt.Legend(orient='right'))
    ).properties(
        height=500,
        width=200
    )

# split into two columns for better sizing
bars1 = base.transform_filter('datum.year <= 1974')
bars2 = base.transform_filter('datum.year > 1974').encode(alt.Y('year:O', title=None))

# add team names to chart; tied years show both names joined by "/"
countries = base.mark_text(align='left', fontSize=11).transform_window(
        t = 'min(team)',
        t2 = 'max(team)',
        frame=[None, None],
        groupby=['year']
    ).transform_calculate(
        text = 'datum.t == datum.t2 ? datum.t : datum.t + "/" + datum.t2'
    ).encode(x=alt.value(202), text=alt.Text('text:N'))

countries1 = countries.transform_filter('datum.year <= 1974')
countries2 = countries.transform_filter('datum.year > 1974')

teams2 = alt.hconcat(bars1 + countries1, 
                     bars2 + countries2,  
                     title=alt.Title(
                           text='Teams with Highest Percent of Wins by Year',
                           # wording matches the cutoff actually applied above
                           subtitle='Including Only Teams that Played at Least Half the Maximum Games per Year',
                           anchor='middle')
                    )
teams2

#### Plot 3: Penalties by Team

##### Create Plot

In [19]:
# per-team means of penalties committed/awarded (plus game and win totals)
base = alt.Chart(teams).mark_bar().transform_aggregate(
        mean_penalties_committed = 'mean(penalties_committed)',
        mean_penalties_awarded = 'mean(penalties_awarded)',
        tot_games = 'count(won)',
        tot_wins = 'sum(won)',
        groupby=['team']
    ).transform_calculate(
        win_percentage = 'datum.tot_wins / datum.tot_games'
    ).properties(width=200)


def _top20_bars(field, x_title, y_title):
    """Bars for the 20 highest-ranked teams on `field`, sorted descending."""
    ranked = base.transform_window(
        rank=f'rank({field})',
        sort=[alt.SortField(field, order='descending')],
        frame = [None, None]
    ).transform_filter(
        'datum.rank <= 20'
    )
    return ranked.encode(
        alt.Y('team:N', title=y_title,
              sort=alt.EncodingSortField(field=field, order='descending')),
        alt.X(f'{field}:Q', title=x_title, 
              scale=alt.Scale(domain=[0, 0.6]))
    )


# left panel: penalties committed; right panel: penalties awarded
c1 = _top20_bars('mean_penalties_committed', 'Mean Penalties Committed per Game', 'Team')
c2 = _top20_bars('mean_penalties_awarded', 'Mean Penalties Awarded per Game', None)

teams3 = alt.hconcat(c1,c2, 
                     title=alt.Title(
                           text='Top 20 Teams for Most Penalties Committed and Awarded',
                           subtitle='Normal Time Minutes Only',
                           anchor='middle')
                    )

teams3

#### Plot 4: Goals and Opponent Goals by Team

##### Create Plot

In [20]:
# one circle per team: mean goals scored (x) against mean goals conceded (y)
base = alt.Chart(teams).mark_circle(opacity=0.2).transform_aggregate(
        mean_goals = 'mean(goals)',
        mean_opp_goals = 'mean(opponent_goals)',
        tot_games = 'count(won)',
        tot_wins = 'sum(won)',
        groupby=['team', 'team_continent']
    ).transform_calculate(
        win_percentage = 'datum.tot_wins / datum.tot_games',
    ).encode(
        alt.X('mean_goals:Q', title='Mean Team Goals', 
              scale=alt.Scale(domain=[0,3]),
              axis=alt.Axis(values=[0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0])),
        alt.Y('mean_opp_goals:Q', title='Mean Opponent Goals')
    ).properties(title='Team and Opponent Goals per Game', width=700)

# circle area encodes the share of games won
points = base.encode(
    alt.Size('win_percentage:Q', title='Win Percentage',
         legend=alt.Legend(format='%')))

# get teams often considered the best in the world
highlighted_teams = ['Brazil', 'Germany', 'Italy', 'Argentina', 'England', 'Spain',
                     'France', 'Netherlands', 'Uruguay', 'Sweden']
top10 = base.mark_point(color='black', fill=None, size=500).transform_filter(
    {'field': 'team', 'oneOf': highlighted_teams}
)

top10text_base = base.mark_text().encode(alt.Text('team:N'))

# hand-tuned label placement for each highlighted team: (teams, text-mark options)
label_specs = [
    (['Italy', 'Sweden'], dict(align='right', baseline='top', dy=8, dx=-8)),
    (['Uruguay'], dict(align='right', baseline='bottom', dy=-6, dx=-8)),
    (['Germany'], dict(align='left', baseline='bottom', dy=-8, dx=8)),
    (['England'], dict(align='left', baseline='top', dy=10, dx=10)),
    (['Argentina'], dict(align='right', baseline='bottom', dy=-12, dx=0)),
    (['Brazil'], dict(align='right', baseline='bottom', dy=-10, dx=-2)),
    (['Netherlands'], dict(align='left', baseline='top', dy=4, dx=12)),
    (['France'], dict(align='right', baseline='top', dy=12, dx=6)),
    (['Spain'], dict(align='right', baseline='top', dy=12, dx=0)),
]
label_layers = [
    top10text_base.mark_text(fontSize=12, **placement
        ).transform_filter({'field': 'team', 'oneOf': names})
    for names, placement in label_specs
]

# stack the label layers in the listed order
top10text = label_layers[0]
for label_layer in label_layers[1:]:
    top10text += label_layer

# combine
teams4 = points + top10 + top10text

teams4

### Player Plots

The following plots explore player-level trends.

#### Plot 1: Top Players of All Time

##### Prepare Data for Plot

In [21]:
# parse the goal date and derive its calendar year (both stored on `goals`)
goals['date'] = pd.to_datetime(goals['date'])
goals['year'] = goals['date'].dt.year

# goals per player, team and year, highest totals first
total_goals_by_player = (
    goals.groupby(['year', 'scorer', 'scorer_team'])['point_earned']
    .sum()
    .reset_index()
    .rename(columns={'point_earned': 'total_goals'})
    .sort_values(by='total_goals', ascending=False)
)

# the same totals collapsed over all years, highest totals first
total_goals_by_player_alltime = (
    total_goals_by_player.groupby(['scorer', 'scorer_team'])['total_goals']
    .sum()
    .reset_index()
    .sort_values(by='total_goals', ascending=False)
)

# get starting year
def bin_years(year):
    """Map a year to the label of the period it falls in.

    Each period includes its upper bound, so the result agrees with the label
    text (1940 -> '1916-1940', 2000 -> '1981-2000'). A missing year (None or
    NaN) returns None instead of being pushed into the latest period.

    NOTE(review): this repeats the `bin_years` definition in the team-summary
    cell; keep the two in sync or define it once.
    """
    # NaN is the only value that is not equal to itself
    if year is None or year != year:
        return None
    periods = (
        (1940, '1916-1940'),
        (1960, '1941-1960'),
        (1980, '1961-1980'),
        (2000, '1981-2000'),
    )
    for last_year, label in periods:
        if year <= last_year:
            return label
    return '2001-2023'
    
# First scoring year per player. The lookup is keyed by scorer name and mapped
# onto the per-player table; assigning a goal-row-indexed transform() result
# here would align on row numbers and attach other players' years.
first_year_by_scorer = goals.groupby('scorer')['year'].min()
total_goals_by_player_alltime['min_year'] = (
    total_goals_by_player_alltime['scorer'].map(first_year_by_scorer)
)
total_goals_by_player_alltime['min_year_binned'] = total_goals_by_player_alltime['min_year'].apply(bin_years)

##### Create Plot

In [22]:
# Horizontal bars for the ten highest all-time scorers, coloured by the period
# of the player's first goal and labelled with the player's team.
domain = ['1916-1940', '1941-1960', '1961-1980', '1981-2000', '2001-2023', 'No Data']
colors = ['#005391', '#0074cc', '#1a9cff', '#66bdff', '#b3deff', '#E0E0E0']

base = alt.Chart(total_goals_by_player_alltime.head(10)).mark_bar().encode(
        alt.X('total_goals:Q', title="Total lifetime goals", scale=alt.Scale(domain=[0, 125])),
        alt.Y('scorer:N', title = None, sort='-x'),
        text = 'scorer_team:N'
    ).properties(
        title='Top Ten Scorers of All Time'
    ).encode(
            color = alt.condition('datum.min_year_binned != null', 'min_year_binned:N', alt.value('#E0E0E0'), 
                                  scale=alt.Scale(domain=domain, range=colors),
                                  title='Appearance Year')
    )

# add text
players1 = base.mark_bar() + base.mark_text(align='left', dx=2, color='black')

# keep axis titles horizontal; configure_axis returns a new chart, so the
# result is stored to make the saved figure identical to the displayed one
players1 = players1.configure_axis(titleAngle=0)
players1

#### Plot 2: Cristiano Ronaldo Goals

##### Prepare Data for Plot

In [23]:
# yearly goal totals of Cristiano Ronaldo
is_ronaldo = total_goals_by_player['scorer'] == 'Cristiano Ronaldo'
ronaldo = total_goals_by_player[is_ronaldo]

# mean yearly goal total over all player/team rows of that year
average_scorer_by_year = (
    total_goals_by_player.groupby('year')['total_goals']
    .mean()
    .reset_index()
    .rename(columns={'total_goals': 'avg_goals_per_player'})
)

# put the yearly average next to Ronaldo's total for every year he appears in
ronaldo = ronaldo.merge(average_scorer_by_year, on='year', how='left')

# long format: one row per (year, series), with readable series names
ronaldo = ronaldo.melt(id_vars=['year', 'scorer'],
                       value_vars=['total_goals', 'avg_goals_per_player'],
                       var_name='type', value_name='goal count')
ronaldo['type'] = ronaldo['type'].replace({
    'total_goals': 'Cristiano Ronaldo',
    'avg_goals_per_player': 'Average Player',
})

##### Create Plot

In [24]:
# even years 2000..2022 used as x-axis tick labels
years = list(range(2000, 2024, 2))

goal_axis_title = 'Total Goals Per Season'

# one line per series (Ronaldo / average player)
base = alt.Chart(ronaldo).mark_line(fill = None).encode(
        x=alt.X('year:O', title='Year', axis=alt.Axis(labelAngle=-45, values=years)),
        y=alt.Y('goal count:Q', title=goal_axis_title),
        color=alt.Color('type:N', legend=None),
    ).properties(
        title='Cristiano Ronaldo: How does he compare to his peers?'
    )

# the 2013 data points serve only as anchors for the in-chart series labels
series_palette = alt.Scale(range=['#005391', '#f58518'])
c2 = alt.Chart(ronaldo).mark_point().transform_filter(
        alt.datum.year == 2013
    ).encode(
        x=alt.X('year:O', title="Year", axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('goal count:Q', title=goal_axis_title),
        color=alt.Color('type:N', legend=None, scale=series_palette),
    )

# series name written just above each anchor point
c3 = c2.mark_text(dy = -10).encode(text='type:N')

players2 = base + c3
players2

#### Plot 3: Goals by Year

##### Prepare Data for Plot

In [25]:
# Goals scored versus number of distinct years played, one row per scorer.
# Work on a copy: plain assignment would only alias `goals`, and the helper
# columns below would then be written into the shared goal-level table.
goalyearratio = goals.copy()
by_scorer = goalyearratio.groupby('scorer')
goalyearratio['scorer_goals'] = by_scorer['point_earned'].transform('sum')
goalyearratio['years_played'] = by_scorer['year'].transform('nunique')
goalyearratio['goals_to_year_ratio'] = goalyearratio['scorer_goals'] / goalyearratio['years_played']

# the values are constant per scorer, so de-duplicating leaves one row each
goalyearratio = goalyearratio[['scorer', 'scorer_goals', 'years_played','goals_to_year_ratio']].drop_duplicates()

##### Create Plot

In [26]:
# predicate reused by the highlight colour and the label
ronaldo_row = alt.datum.scorer == 'Cristiano Ronaldo'

# one circle per scorer; Ronaldo in orange, everyone else in blue
ratio = alt.Chart(goalyearratio).mark_circle().encode(
        x=alt.X('years_played:Q', title = 'Years Played'),
        y=alt.Y('scorer_goals:Q', title = 'Total Goals Scored'),
        fill=alt.condition(ronaldo_row, alt.value('#f58518'), alt.value('#005391')),
    ).properties(
        title='Goal Frequency'
    )

# name label shown for Ronaldo only (empty text elsewhere), shifted left of his point
label_style = dict(align='left', baseline='middle', dx=-105, fontSize=12)
text = ratio.mark_text(**label_style).encode(
        x='years_played:Q',
        y='scorer_goals:Q',
        text=alt.condition(ronaldo_row, 'scorer', alt.value('')),
        color=alt.value('black'),
    )

players3 = ratio + text
players3

#### Plot 4: Does Ronaldo Score Early?

##### Prepare Data for Plot

In [27]:
# label every goal as scored by Ronaldo or by anyone else; this is the x
# category of the box plot (a boolean version of this column used to be
# assigned first and was immediately overwritten, so it has been dropped)
goals['ronaldo'] = np.where(goals['scorer'] == 'Cristiano Ronaldo', 'Cristiano Ronaldo', 'All other players')

##### Create Plot

In [28]:
# Box plots of the minute in which goals were scored: Ronaldo vs. all others.
# NOTE(review): only goals from 2004 are included - confirm that restricting
# to a single year is intended for a chart with this general title.
minute_scale = alt.Scale(domain=[0, 90], clamp = True)

players4 = (
    alt.Chart(goals)
    .mark_boxplot(size = 40)
    .transform_filter(alt.datum.year == 2004)
    .encode(
        x=alt.X('ronaldo:N', axis=alt.Axis(labelAngle=0), title = None),
        y=alt.Y('minute:Q', title = 'Minute Goal Scored', scale=minute_scale),
    )
    .properties(title='Does Ronaldo score early in games?')
)

players4

### Save Plots

In [29]:
# write every figure to the Plots folder, in the same order as before:
# games, then teams, then players
plot_files = [
    (games1, 'Plots/games1.png'),
    (games2, 'Plots/games2.png'),
    (games3, 'Plots/games3.png'),
    (games4, 'Plots/games4.png'),
    (teams1, 'Plots/teams1.png'),
    (teams2, 'Plots/teams2.png'),
    (teams3, 'Plots/teams3.png'),
    (teams4, 'Plots/teams4.png'),
    (players1, 'Plots/players1.png'),
    (players2, 'Plots/players2.png'),
    (players3, 'Plots/players3.png'),
    (players4, 'Plots/players4.png'),
]

for chart, target in plot_files:
    chart.save(target)