In [1]:
# import
import pandas as pd
import numpy as np
import altair as alt
import json

# enable Altair to work with data with >5k rows: the default transformer
# refuses datasets above 5,000 rows, while the 'json' transformer stores the
# data in a separate JSON file that the chart spec references by URL
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [2]:
# load data: goal-level records (one row per goal, as the preview below shows),
# read from a path relative to the working directory
goals = pd.read_csv('goals.csv')

In [3]:
# view data: show the first five rows to check the column names and sample values
goals.head()

Unnamed: 0,date,home_team,away_team,scorer_team,scorer,minute,own_goal,penalty,point_earned,home_goals,away_goals,winner,winner_team,penalty_point,home_penalties,away_penalties
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False,1,0,4,Uruguay,Away,0,0,0
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False,1,0,4,Uruguay,Away,0,0,0
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False,1,0,4,Uruguay,Away,0,0,0
3,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,False,1,0,4,Uruguay,Away,0,0,0
4,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,2.0,False,False,1,6,1,Argentina,Home,0,2,0


In [4]:
# group data by game
# Every goal row repeats its match-level fields, so keeping only those fields
# and dropping duplicate rows leaves one row per match.
cols = ['date', 'home_team', 'away_team', 'home_goals', 'away_goals',
        'winner', 'winner_team', 'home_penalties', 'away_penalties']

games = (
    goals[cols]
    .drop_duplicates()
    # calendar year of the match, derived from the date column
    .assign(year=lambda frame: frame['date'].astype('datetime64[ns]').dt.year)
    .reset_index(drop=True)
)

print(games.shape)
games.head()

(13986, 10)


Unnamed: 0,date,home_team,away_team,home_goals,away_goals,winner,winner_team,home_penalties,away_penalties,year
0,1916-07-02,Chile,Uruguay,0,4,Uruguay,Away,0,0,1916
1,1916-07-06,Argentina,Chile,6,1,Argentina,Home,2,0,1916
2,1916-07-08,Brazil,Chile,1,1,Tie,Tie,0,0,1916
3,1916-07-10,Argentina,Brazil,1,1,Tie,Tie,0,0,1916
4,1916-07-12,Brazil,Uruguay,1,2,Uruguay,Away,0,0,1916


In [5]:
# group data by team
# Each match becomes two rows, one from the home side's point of view and one
# from the away side's, so per-team statistics can be computed with one groupby.
cols = ['date', 'year', 'team', 'opponent', 'goals', 'opponent_goals', 
        'penalties_awarded', 'penalties_caused', 'won', 'home']


def _team_perspective(games, side, columns):
    """Return `games` re-expressed from the point of view of one side.

    Parameters
    ----------
    games : pandas.DataFrame
        One row per match, with `home_*` / `away_*` columns and `winner`.
    side : {'home', 'away'}
        Which side becomes `team`; the other side becomes `opponent`.
    columns : list of str
        Columns (and their order) to keep in the result.

    Returns
    -------
    pandas.DataFrame
        A new frame; `games` itself is not modified.

    Raises
    ------
    ValueError
        If `side` is neither 'home' nor 'away'.
    """
    if side not in ('home', 'away'):
        raise ValueError("side must be 'home' or 'away', got %r" % (side,))
    other = 'away' if side == 'home' else 'home'

    # rename() returns a new frame, so no explicit copy of `games` is needed
    view = games.rename(columns={side + '_team': 'team',
                                 other + '_team': 'opponent',
                                 side + '_goals': 'goals',
                                 other + '_goals': 'opponent_goals',
                                 side + '_penalties': 'penalties_awarded',
                                 other + '_penalties': 'penalties_caused'
                                })
    # a tie has winner == 'Tie', which never equals a team name, so won is 0
    view['won'] = np.where(view['team'] == view['winner'], 1, 0)
    view['home'] = 1 if side == 'home' else 0
    return view[columns]


home_teams = _team_perspective(games, 'home', cols)
away_teams = _team_perspective(games, 'away', cols)

# A stable sort keeps rows that share a date in their concatenation order
# (home rows before away rows), so the result is the same on every run; the
# default algorithm gives no guarantee about the order of ties.
teams = (
    pd.concat([home_teams, away_teams], axis=0)
    .sort_values(by=['date'], kind='mergesort')
    .reset_index(drop=True)
)

In [6]:
# add ISO 3166-1 numeric codes to teams data
iso = pd.read_csv('iso_3166.csv')

# Drop a country_code column left by an earlier run of this cell before merging.
# Without this, re-running the cell would merge the codes a second time and
# produce country_code_x / country_code_y instead of a single country_code.
# On a fresh kernel the column does not exist yet and the drop is a no-op.
teams = (
    teams.drop(columns=['country_code'], errors='ignore')
    # left join keeps every team row, including teams with no ISO match
    .merge(iso, left_on='team', right_on='name', how='left')
    # 'name' duplicates 'team' after the join
    .drop(columns=['name'])
)

teams.head()

Unnamed: 0,date,year,team,opponent,goals,opponent_goals,penalties_awarded,penalties_caused,won,home,country_code
0,1916-07-02,1916,Chile,Uruguay,0,4,0,0,0,1,152
1,1916-07-02,1916,Uruguay,Chile,4,0,0,0,1,0,858
2,1916-07-06,1916,Argentina,Chile,6,1,2,0,1,1,32
3,1916-07-06,1916,Chile,Argentina,1,6,0,2,0,0,152
4,1916-07-08,1916,Brazil,Chile,1,1,0,0,0,1,76


In [7]:
# set custom theme
def custom_theme():
    """Build the Altair theme used by every chart in this notebook.

    Returns a new dict on each call with a single 'config' entry that sets the
    default view size, the axis/title/legend font sizes, and the default mark
    fill colour.
    """
    # axis and legend use the same label/title font sizes
    label_and_title_fonts = {"labelFontSize": 12, "titleFontSize": 13}

    config = {}
    config['view'] = {'height': 600, 'width': 600}
    config["axis"] = dict(label_and_title_fonts)
    config["title"] = {"fontSize": 14}
    config["legend"] = dict(label_and_title_fonts)
    config['mark'] = {'fill': '#005391'}

    return {'config': config}

# register the custom theme under a chosen name; the registry stores the
# function itself and calls it to obtain the config dict
# NOTE(review): recent Altair releases appear to deprecate `alt.themes` in
# favour of `alt.theme` - confirm against the installed version
alt.themes.register('custom_theme', custom_theme)

# enable the newly registered theme so charts created after this point use it
# (as the last expression in the cell, its return value is echoed as output)
alt.themes.enable('custom_theme')

ThemeRegistry.enable('custom_theme')

#### Plot 1: Map of Countries

In [8]:
# world topo, edited to add Kosovo
# references the 'countries' object inside the local TopoJSON file
world_topo = alt.topo_feature('world_topo.json', 'countries')

In [9]:
# plot
# Summarise the teams data to one row per country code before it reaches the
# chart. A lookup transform attaches only the first matching row per key, and
# an aggregate transform inside the chart would replace the map features (and
# their geometry) with summary rows, leaving nothing to draw.
# Rows with a missing country_code are dropped by groupby.
team_years = (
    teams.groupby('country_code', as_index=False)
    .agg(team=('team', 'first'),
         first_year=('year', 'min'),
         latest_year=('year', 'max'))
)

alt.layer(
    # Earth sphere base layer
    alt.Chart({'sphere': True}).mark_geoshape(fill='#e6f3ff'
        ).project(type='naturalEarth1'
    ),
    # geographic reference lines
    alt.Chart({'graticule': True}).mark_geoshape(
        stroke='#ffffff', strokeWidth=0.5, fill=None
    ),
    # world countries, each feature annotated with its team's summary row
    # NOTE(review): assumes the feature `id` values in world_topo.json compare
    # equal to the numeric country_code values - confirm against the file
    alt.Chart(world_topo).mark_geoshape(
        stroke='white', strokeWidth=0.5
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(data=team_years, key='country_code',
                             fields=['team', 'first_year', 'latest_year'])
    ).encode(
        # ordinal type so years are shown as 1916 rather than 1,916
        tooltip=['team:N', 'first_year:O', 'latest_year:O']
    )
).properties(
    height=325,
    title = 'MAP'
)

# candidate sphere fill colours:
# e6f3ff
# f2f2f2

#### Plot 2: Teams that Win Most by Year

In [10]:
# Team that won highest proportion of games by year (only include teams that played over median games) (2 pts)

alt.Chart(teams).mark_circle().transform_aggregate(
    # one row per team per year
    count_games='count()',
    count_wins='sum(won)',
    groupby=['team', 'year']
).transform_joinaggregate(
    # NOTE(review): "over median games" is read here as strictly more games
    # than the median team played in that same year - confirm. In a year where
    # every team played the same number of games this removes the whole year.
    median_games='median(count_games)',
    groupby=['year']
).transform_filter(
    'datum.count_games > datum.median_games'
).transform_calculate(
    prop_win='datum.count_wins/datum.count_games'
).transform_joinaggregate(
    highest_prop='max(prop_win)',
    groupby=['year']
).transform_filter(
    # keep only the team(s) that reached the year's best win proportion;
    # without this every team is drawn at the same highest_prop value
    'datum.prop_win == datum.highest_prop'
).encode(
    alt.X('year:O', title='Year'),
    alt.Y('highest_prop:Q', title='Highest proportion of games won'),
    alt.Color('team:N', title='Team')
)