**DATA 5310: Data Visualization | Seattle University | Fall 2023**

Samikshya Pandey, McKenzie Maidl, and Emma Oriol

Data: Goals from over 45,000 men's FIFA matches from 1872 through 2023, sourced from https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017/data

This notebook prepares the raw 'goalscorers.csv' file (saved locally as 'Data/goals_raw.csv' for clarity) for analysis and visualization, and writes the cleaned result to 'Data/goals.csv'.

In [1]:
# library imports
import pandas as pd
import numpy as np
import altair as alt

### Raw Data

In [2]:
# raw data: one row per goal, read from the locally saved copy of the source file
raw_goals_path = 'Data/goals_raw.csv'
goals = pd.read_csv(raw_goals_path)
goals.head()

Unnamed: 0,date,home_team,away_team,team,scorer,minute,own_goal,penalty
0,7/2/1916,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False
1,7/2/1916,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False
2,7/2/1916,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False
3,7/2/1916,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,False
4,7/6/1916,Argentina,Chile,Argentina,Alberto Ohaco,2.0,False,False


### Clean Data

In [3]:
# rename 'team' so it is clear the column holds the team credited with the goal
goals = goals.rename({'team': 'scorer_team'}, axis='columns')

In [4]:
# fix some country names
def fix_countries(col, df=None):
    """Replace selected country names in one column with the names used downstream.

    The column is overwritten in place on the target frame. Values that are not
    listed in the mapping (including missing values) are left untouched.

    Parameters
    ----------
    col : str
        Name of the column holding team/country names.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``goals`` frame, which
        keeps existing ``fix_countries(col)`` calls working unchanged.

    Returns
    -------
    None
    """
    # old name -> replacement name; several old names fold into one current name
    country_renames = {
        'Yugoslavia': 'Serbia',
        'Czechoslovakia': 'Slovakia',
        'German DR': 'Germany',
        'Vietnam Republic': 'Vietnam',
        'Cape Verde': 'Cabo Verde',
        'Saarland': 'Germany',
        'Yemen DPR': 'Yemen',
        'Ivory Coast': "Côte d'Ivoire",
    }

    # resolve the global lazily so the function can be defined before `goals` exists
    target = goals if df is None else df

    # exact whole-value replacement; a flat mapping replaces the nested np.where chain
    target[col] = target[col].replace(country_renames)

# apply the same name fixes to every column that holds a team name
for team_col in ('home_team', 'away_team', 'scorer_team'):
    fix_countries(team_col)

In [5]:
# reformat date to datetime (the raw file stores it as text such as 7/2/1916)
goals = goals.assign(date=pd.to_datetime(goals['date']))

In [6]:
# make sure own_goal and penalty datatypes are consistent
# A missing flag is treated as False. A plain astype(bool) would turn NaN into
# True (NaN is truthy), silently marking that goal as an own goal / penalty.
for flag_col in ['own_goal', 'penalty']:
    goals[flag_col] = goals[flag_col].notna() & goals[flag_col].astype(bool)

### Calculate New Fields

In [7]:
# calculate total goals for both teams

# A match is identified by its date plus both teams. Grouping on the full key
# (the same key the penalties calculation uses) keeps two same-day fixtures of
# one team from being merged into a single score.
match_key = ['date', 'home_team', 'away_team']

# check if point was earned on goal: own goals are given 0 points here
# NOTE(review): confirm with the data source which team 'scorer_team' credits for an own goal
goals['point_earned'] = goals['own_goal'].eq(False).astype(int)

# split the points by side: a row only counts for the side whose team scored it
home_points = goals['point_earned'].where(goals['scorer_team'] == goals['home_team'], 0)
away_points = goals['point_earned'].where(goals['scorer_team'] == goals['away_team'], 0)

# sum per match and broadcast the total to every goal row of that match;
# fillna(0) covers rows whose match key is missing and therefore has no group
goals['home_goals'] = (
    goals.assign(_side_points=home_points)
    .groupby(match_key)['_side_points']
    .transform('sum')
    .fillna(0)
    .astype(int)
)
goals['away_goals'] = (
    goals.assign(_side_points=away_points)
    .groupby(match_key)['_side_points']
    .transform('sum')
    .fillna(0)
    .astype(int)
)

In [8]:
# calculate game winners

# name of the winning team, or 'Tie' when both sides have the same goal total
away_ahead = goals['away_goals'] > goals['home_goals']
home_ahead = goals['away_goals'] < goals['home_goals']
goals['winner'] = np.select(
    [away_ahead, home_ahead],
    [goals['away_team'], goals['home_team']],
    default='Tie',
)

# which side the winner was: 'Home', 'Away', or 'Tie'
goals['winner_team'] = np.select(
    [goals['winner'] == goals['home_team'], goals['winner'] == goals['away_team']],
    ['Home', 'Away'],
    default='Tie',
)

In [9]:
# calculate penalties by home/away team (pre-overtime)

# columns that identify a single match
cols = ['date', 'home_team', 'away_team']

# row-level flags: goal marked as a penalty at minute 90 or earlier, and which side scored it
condPenalty = goals['penalty'].eq(True) & goals['minute'].le(90.0)
condHome = goals['scorer_team'].eq(goals['home_team'])
condAway = goals['scorer_team'].eq(goals['away_team'])

# 1 for a qualifying penalty goal, 0 otherwise
goals['penalty_point'] = condPenalty.astype(int)


def _per_match_total(points):
    """Sum `points` within each match and give every row of the match that total."""
    return (
        goals.assign(_points=points)
        .groupby(cols)['_points']
        .transform('sum')
        .astype(int)
    )


# rows scored by the other side contribute 0, so each total only counts one team's penalties
goals['home_penalties'] = _per_match_total(goals['penalty_point'].where(condHome, 0))
goals['away_penalties'] = _per_match_total(goals['penalty_point'].where(condAway, 0))

### Final Data

In [10]:
# preview the first rows of the cleaned and processed data
goals.head(n=5)

Unnamed: 0,date,home_team,away_team,scorer_team,scorer,minute,own_goal,penalty,point_earned,home_goals,away_goals,winner,winner_team,penalty_point,home_penalties,away_penalties
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False,1,0,4,Uruguay,Away,0,0,0
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False,1,0,4,Uruguay,Away,0,0,0
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False,1,0,4,Uruguay,Away,0,0,0
3,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,False,1,0,4,Uruguay,Away,0,0,0
4,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,2.0,False,False,1,6,1,Argentina,Home,0,2,0


In [11]:
# write the processed frame to disk, leaving out the row index
goals.to_csv(path_or_buf='Data/goals.csv', index=False)