In [1]:
import altair as alt
import pandas as pd
import numpy as np
import json 

# Enable Altair's 'json' data transformer so charts with more than 5,000 datapoints can be plotted.
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [2]:
# Load the goal-level dataset (one row per goal scored).
# The path is relative, so the notebook does not depend on one user's machine.
GOALS_CSV = 'Data/goals.csv'
goals = pd.read_csv(GOALS_CSV)


In [3]:
# set custom theme
def custom_theme():
    """Build the Altair theme configuration dict.

    Returns:
        dict: a mapping with a single ``'config'`` key holding the default
        view size, the axis/title/legend font sizes and the default mark
        fill colour.
    """
    view_size = dict(height=400, width=600)
    axis_fonts = dict(labelFontSize=12, titleFontSize=13)
    title_font = dict(fontSize=14)
    legend_fonts = dict(labelFontSize=12, titleFontSize=13)
    mark_defaults = dict(fill='#005391')

    # Insert the sections in a fixed order so the emitted spec is stable.
    config = {}
    config['view'] = view_size
    config['axis'] = axis_fonts
    config['title'] = title_font
    config['legend'] = legend_fonts
    config['mark'] = mark_defaults
    return {'config': config}

# Register the theme function under a chosen name, then make it the active theme.
THEME_NAME = 'custom_theme'
alt.themes.register(THEME_NAME, custom_theme)
alt.themes.enable(THEME_NAME)

ThemeRegistry.enable('custom_theme')

## EXPLORING THE DATASET

In [4]:
# Preview the first rows to see which columns are available.
goals.head()

Unnamed: 0,date,home_team,away_team,scorer_team,scorer,minute,own_goal,penalty,point_earned,home_goals,away_goals,winner,winner_team,penalty_point,home_penalties,away_penalties
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False,1,0,4,Uruguay,Away,0,0,0
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False,1,0,4,Uruguay,Away,0,0,0
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False,1,0,4,Uruguay,Away,0,0,0
3,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,False,1,0,4,Uruguay,Away,0,0,0
4,1916-07-06,Argentina,Chile,Argentina,Alberto Ohaco,2.0,False,False,1,6,1,Argentina,Home,0,2,0


In [5]:
# A single game spans multiple rows: one row for each goal scored in it.
on_first_match_day = goals["date"].eq("1916-07-02")
goals[on_first_match_day]

Unnamed: 0,date,home_team,away_team,scorer_team,scorer,minute,own_goal,penalty,point_earned,home_goals,away_goals,winner,winner_team,penalty_point,home_penalties,away_penalties
0,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,44.0,False,False,1,0,4,Uruguay,Away,0,0,0
1,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,55.0,False,False,1,0,4,Uruguay,Away,0,0,0
2,1916-07-02,Chile,Uruguay,Uruguay,Isabelino Gradín,70.0,False,False,1,0,4,Uruguay,Away,0,0,0
3,1916-07-02,Chile,Uruguay,Uruguay,José Piendibene,75.0,False,False,1,0,4,Uruguay,Away,0,0,0


If there are 4 goals in a match, the dataset has 4 rows for that match, so the data is in long format (one row per goal).

In [6]:
# Inspect a single row (position 8) to see every field of one goal record.
goals.iloc[8]

date                        1916-07-06
home_team                    Argentina
away_team                        Chile
scorer_team                  Argentina
scorer            Alberto Marcovecchio
minute                            67.0
own_goal                         False
penalty                          False
point_earned                         1
home_goals                           6
away_goals                           1
winner                       Argentina
winner_team                       Home
penalty_point                        0
home_penalties                       2
away_penalties                       0
Name: 8, dtype: object

In [7]:
#goals.loc[goals["date"] == "1916-07-06"]

In [8]:
# Smallest value in the date column (the earliest date in the dataset).
goals["date"].min()

'1916-07-02'

In [9]:
# Summary statistics for the numeric columns.
goals.describe()

Unnamed: 0,minute,point_earned,home_goals,away_goals,penalty_point,home_penalties,away_penalties
count,42929.0,43189.0,43189.0,43189.0,43189.0,43189.0,43189.0
mean,50.040206,0.981755,2.579175,1.562828,0.06629,0.170877,0.094376
std,26.363782,0.133839,2.417675,1.699588,0.248791,0.412225,0.311749
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,51.0,1.0,2.0,1.0,0.0,0.0,0.0
75%,73.0,1.0,4.0,2.0,0.0,0.0,0.0
max,122.0,1.0,31.0,17.0,1.0,3.0,3.0


The dataset has 43,189 rows in total and ranges from 1916-07-02 to 2023-11-21.

In [10]:
#goals[goals["date"].duplicated()]  #multiple games in same data as well

In [11]:
#goals.tail()

## PLOT: What is the average number of goals in a game?
Average = total goals scored / total games played.
Here, it is important to note that there can be multiple games on the same date and that the data is in long format (one row per goal). So I created a new column that pairs the home and away teams, which makes it easier to calculate the total goals scored in each match.

In [12]:
# Copy of the goal-level data with two match-level helper columns added.
total_goal = goals.assign(
    # home and away team joined with "_" denote the same match
    team_pair=goals['home_team'] + '_' + goals['away_team'],
    # home_goals and away_goals are each team's total for the match,
    # so adding them gives the total score of the match
    total_goals=goals["home_goals"] + goals["away_goals"],
)


In [13]:
#total_goal.head()

In [14]:
# Collapse the goal-level rows to one row per game: a game is identified by
# the playing teams plus the date, and the first value of each column is kept.
match_keys = ["team_pair", "date"]
per_match = total_goal.groupby(match_keys).first()
total_goal = per_match.reset_index()
total_goal.shape

(13986, 18)

In [15]:
# Average goals per game = all goals scored / number of games.
goals_per_match = total_goal['total_goals']
avg_goal_per_game = goals_per_match.sum() / goals_per_match.size

# One-row frame holding the average, used as the data source for the overlay marks later.
average_score = pd.DataFrame({'target_value': [avg_goal_per_game]})
average_score

Unnamed: 0,target_value
0,3.033319


In [16]:
# Matches whose date falls strictly between 2017-12-31 and 2019-01-01, i.e. in 2018.
after_2017 = total_goal['date'].gt('2017-12-31')
before_2019 = total_goal['date'].lt('2019-01-01')
total_goal[after_2017 & before_2019]

Unnamed: 0,team_pair,date,home_team,away_team,scorer_team,scorer,minute,own_goal,penalty,point_earned,home_goals,away_goals,winner,winner_team,penalty_point,home_penalties,away_penalties,total_goals
63,Albania_Israel,2018-09-07,Albania,Israel,Albania,Taulant Xhaka,55.0,False,False,1,1,0,Albania,Home,0,0,0,1
102,Albania_Scotland,2018-11-17,Albania,Scotland,Scotland,Ryan Fraser,14.0,False,False,1,0,4,Scotland,Away,0,0,1,4
247,Andorra_Georgia,2018-11-15,Andorra,Georgia,Georgia,Giorgi Chakvetadze,9.0,False,False,1,1,1,Tie,Tie,0,0,0,2
257,Andorra_Kazakhstan,2018-09-10,Andorra,Kazakhstan,Kazakhstan,Yuriy Logvinenko,68.0,False,False,1,1,1,Tie,Tie,0,0,0,2
461,Argentina_Croatia,2018-06-21,Argentina,Croatia,Croatia,Ante Rebić,53.0,False,False,1,0,3,Croatia,Away,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13450,Uruguay_France,2018-07-06,Uruguay,France,France,Raphaël Varane,40.0,False,False,1,0,2,France,Away,0,0,0,2
13495,Uruguay_Portugal,2018-06-30,Uruguay,Portugal,Uruguay,Edinson Cavani,7.0,False,False,1,2,1,Uruguay,Home,0,0,0,3
13497,Uruguay_Saudi Arabia,2018-06-20,Uruguay,Saudi Arabia,Uruguay,Luis Suárez,23.0,False,False,1,1,0,Uruguay,Home,0,0,0,1
13755,Wales_Denmark,2018-11-16,Wales,Denmark,Denmark,Nicolai Jørgensen,42.0,False,False,1,1,2,Denmark,Away,0,0,0,3


In [17]:
# Bar chart of how many matches ended with each total number of goals.
# The x domain is fixed to 0-12 and marks outside it are clipped.
goals_axis = alt.X(
    'total_goals:Q',
    title="Total number of goals",
    scale=alt.Scale(domain=[0, 12], zero=True),
)
matches_axis = alt.Y('count():Q', title="Number of match")

avg_goals = (
    alt.Chart(total_goal)
    .mark_bar(clip=True)
    .encode(goals_axis, matches_axis)
    .properties(title="Distribution of goals scored per game")
)

avg_goals

The average number of goals per game is 3.03. The data is discrete: the number of goals scored can be 1, 2 or 3, but not 1.5. The graph roughly follows the shape of a normal distribution.

In [18]:
# Overlay marks for the mean number of goals per match: a red vertical rule
# plus a text label next to it.
# The label text is built from the computed value so it cannot drift away
# from the data (a hard-coded number goes stale when the data changes).
mean_goals = average_score['target_value'].iloc[0]

line_chart = alt.Chart(average_score).mark_rule(color='red').encode(
    alt.X('target_value:Q', scale=alt.Scale(zero=True))
)

text_chart = alt.Chart(average_score).mark_text(
    text=f'Average Score {mean_goals:.2f}',
    color='red',
    align='left',
    baseline='middle',
    dx=2,   # small horizontal offset so the text does not sit on the rule
    dy=10   # push the text just below the top edge of the plot
).encode(
    alt.X('target_value:Q', scale=alt.Scale(zero=True)),
    alt.YValue(0)  # Y coordinate for the text
)

In [19]:
# Stack the histogram, the average rule and its label into one layered chart.
alt.layer(avg_goals, line_chart,text_chart)

## PLOT 2: 

After a team scores a goal, how quickly does the next goal follow?
We have data on the minute at which each goal was scored. We can use this information to observe how soon after one goal the next goal was scored.


In [20]:
# For every goal, compute the minutes elapsed since the previous goal of the
# same match and whether that previous goal was scored by the same team.
# NOTE(review): assumes the rows of a match are already in chronological order — confirm.
goals_timediff = goals.copy()
match_groups = goals_timediff.groupby(['date', 'home_team', 'away_team'])
goals_timediff['time_between_goals'] = match_groups['minute'].diff()

# Compare each goal with the previous goal *of the same match*. An ungrouped
# shift() would compare the first goal of a match with the last goal of the
# preceding match in the file.
previous_scorer = match_groups['scorer_team'].shift()
scored_by_same_team = goals_timediff['scorer_team'].eq(previous_scorer)
goals_timediff['next_goal_team'] = (
    scored_by_same_team
    .map({True: 'same', False: 'opposite'})
    # the first goal of a match has no predecessor, so leave it missing
    .where(previous_scorer.notna())
)

In [21]:
goals_timediff['team_pair'] = goals_timediff['home_team'] + '_' + goals_timediff['away_team'] #denotes same match
goals_timediff["total_goals"] = goals_timediff["home_goals"] + goals_timediff["away_goals"]  #home goals and away goals value are sum total of goal scored by each team so adding them together to get total score.

# Number of rows without a time gap. `sum` must be called (it was previously
# only referenced), and it must be the last expression of the cell to be displayed.
goals_timediff["time_between_goals"].isna().sum()


In [22]:
#goals_timediff['time_between_goals'].agg(['min', 'max'])


In [23]:
# Distribution of the time between two consecutive goals, faceted by whether
# the same or the opposite team scored the later goal.
# Each row of goals_timediff is one goal, so count() counts goals, not matches.
alt.Chart(goals_timediff).mark_bar(clip=True).transform_filter(
    'datum.time_between_goals != null'  # time_between_goals is null at the first goal
).transform_filter(
    'datum.next_goal_team != null'  # drop rows that have no next_goal_team value
).encode(
    alt.X('time_between_goals:Q', title="Time between Goals", scale=alt.Scale(domain=[0, 80])),
    alt.Y('count():Q', title="Number of goals"),
    alt.Color('next_goal_team:N', title="Team scoring next goal"),
    alt.Column('next_goal_team:N', header=None)
).properties(title="Time between two consecutive goals")

If a team scores a goal, then there is a higher chance that the same team will score again within the next 10 minutes, compared to the other team.
The average time between two consecutive goals is close to 18 minutes when both are scored by the same team, and close to 19.5 minutes when the second is scored by the opposite team.
This could be caused by many things: some teams simply score more, the scoring team may be more energized, or the other team's counter-attacks were not as successful.

In [24]:
#goals_timediff.head(20)

In [25]:

# Sanity check for the chart above: mean time between goals for each group.
# The chart is built but not displayed.
scorer_group = alt.X('next_goal_team:N')
mean_gap = alt.Y('average(time_between_goals):Q')
group_colour = alt.Color('next_goal_team:N')

average = (
    alt.Chart(goals_timediff)
    .mark_bar()
    .transform_filter('datum.time_between_goals != null')
    .encode(scorer_group, mean_gap, group_colour)
)


On average, after a team has scored a goal, the opposite team will score within the next 19 minutes and the same team will score within the next 18 minutes.

## Plot 3 & 4: Have total goals and total penalties changed after the introduction of some major soccer laws?

See the impact of 3 major law changes in football:
1. 1970: Introduction of red and yellow cards. If a player receives a red card, the team has to play with 10 players; a player who receives 2 yellow cards receives a red card.
2. 2012: Goal-line technology was introduced: the use of an electronic aid to determine whether a goal has been scored or not.
3. 2018: VAR was introduced: officials use video footage to make match decisions such as goal/no goal and penalty/no penalty, among others.


In [26]:
# dataset: total_goal
# Convert the date column to datetimes so the calendar year can be extracted.
total_goal['date'] = pd.to_datetime(total_goal['date'])

# Extract year and create a new column
total_goal['year'] = total_goal['date'].dt.year
total_goal['year'].dtype

dtype('int64')

In [27]:
# Penalties per match = home penalties + away penalties.
# The column name keeps its original spelling because later cells refer to it.
total_goal["total_penatly"] = total_goal["home_penalties"] + total_goal["away_penalties"]

In [28]:
#total_goal.head()

In [29]:
# Check the dtype of the year column; the charts below filter on it with numeric comparisons.
total_goal["year"].dtype

dtype('int64')

In [30]:
# Line chart of the total number of goals scored per year.
# The aggregate is a yearly *sum* (not an average), and only years strictly
# after 1960 are kept.
base_totalgoal = alt.Chart(total_goal).mark_line(fill=None).transform_aggregate(
    total_goals='sum(total_goals)',
    groupby=['year']
).transform_filter(
    'datum.year > 1960'
).encode(
    alt.X('year:O', title="Year", axis=alt.Axis(labelAngle=-45, values=[1960,1965,1970,1975,1980,1985,1990,1995,2000,2005,2010,2015,2020])),
    alt.Y('total_goals:Q', title=" Total Goals"),
).properties(title="Did change in laws affect total goals scored per year?")


In [31]:
def _law_year_rule(year):
    """Return a red vertical rule marking ``year`` on the ordinal year axis.

    Args:
        year: calendar year (int) of the law change to mark.

    Returns:
        An Altair chart over ``total_goal`` filtered to rows of that year.
    """
    return (
        alt.Chart(total_goal)
        .mark_rule(color="Red")
        # `year` is an integer column, so compare with a number, not a string
        .transform_filter(f'datum.year == {year}')
        .encode(alt.X('year:O'))
    )


# One rule per law change: 1970 (cards), 2012 (goal-line technology), 2018 (VAR).
year_1970 = _law_year_rule(1970)
year_2012 = _law_year_rule(2012)
year_2018 = _law_year_rule(2018)

In [32]:
year_1970 = alt.Chart(total_goal).mark_rule(color="red").transform_filter('datum.year == 1970').encode(
    alt.X('year:O')
)


def _law_label(year, label, dy=10):
    """Return a red text label anchored at ``year`` on the ordinal year axis.

    The label is positioned through the x encoding, so it stays next to its
    rule whatever the chart width is, instead of relying on a hard-coded pixel
    offset from the centre of the plot. A one-row frame is used as data so that
    exactly one text mark is drawn rather than one per row of ``total_goal``.

    Args:
        year: calendar year (int) the label belongs to.
        label: text to display.
        dy: vertical pixel offset from the top of the plot; use different
            values to keep neighbouring labels from overlapping.

    Returns:
        An Altair text chart that can be layered with the yearly line charts.
    """
    anchor = pd.DataFrame({'year': [year]})
    return alt.Chart(anchor).mark_text(
        text=label,
        color='red',
        align='left',
        baseline='middle',
        dx=2,   # start the text just to the right of the rule
        dy=dy
    ).encode(
        alt.X('year:O'),
        alt.YValue(0)      # Y coordinate for the text
    )


text_1970 = _law_label(1970, '1970:Card')
text_2012 = _law_label(2012, '2012: Goal Line')
# 2018 is only six years after 2012, so put its label one line lower.
text_2018 = _law_label(2018, '2018:VAR', dy=24)



In [33]:
# Pair each law-change rule with its text label as one layered chart.
final_1970 = alt.layer(year_1970, text_1970)
final_2012 = alt.layer(year_2012, text_2012)
final_2018 = alt.layer(year_2018, text_2018)

In [34]:
# Overlay the three law-change markers on the yearly total-goals line.
alt.layer(base_totalgoal, final_1970 , final_2012 , final_2018)

It is difficult to see any impact of the law changes on the total goals scored.

In [35]:
# Repeat the analysis for total penalties to see whether a pattern shows up there.
total_goal.head()

Unnamed: 0,team_pair,date,home_team,away_team,scorer_team,scorer,minute,own_goal,penalty,point_earned,home_goals,away_goals,winner,winner_team,penalty_point,home_penalties,away_penalties,total_goals,year,total_penatly
0,Afghanistan_Bangladesh,2019-09-10,Afghanistan,Bangladesh,Afghanistan,Farshad Noor,27.0,False,False,1,1,0,Afghanistan,Home,0,0,0,1,2019,0
1,Afghanistan_Cambodia,2015-11-12,Afghanistan,Cambodia,Afghanistan,Mustafa Zazai,42.0,False,False,1,3,0,Afghanistan,Home,0,0,0,3,2015,0
2,Afghanistan_India,2019-11-14,Afghanistan,India,Afghanistan,Zelfy Nazary,45.0,False,False,1,1,1,Tie,Tie,0,0,0,2,2019,0
3,Afghanistan_Japan,2015-09-08,Afghanistan,Japan,Japan,Shinji Kagawa,10.0,False,False,1,0,6,Japan,Away,0,0,0,6,2015,0
4,Afghanistan_Oman,2021-06-11,Afghanistan,Oman,Oman,Abdullah Fawaz,13.0,False,False,1,1,2,Oman,Away,0,0,0,3,2021,0


In [36]:
# Line chart of the yearly sum of penalties (years after 1960 only),
# layered with the three law-change markers.
penalty_year_axis = alt.X(
    'year:O',
    title="Year",
    axis=alt.Axis(labelAngle=-45, values=list(range(1960, 2021, 5))),
)
penalty_total_axis = alt.Y('total_penatly:Q', title="Total Penalty")

base_chart_penalty = (
    alt.Chart(total_goal)
    .mark_line(fill=None)
    .transform_aggregate(total_penatly='sum(total_penatly)', groupby=['year'])
    .transform_filter('datum.year > 1960')
    .encode(penalty_year_axis, penalty_total_axis)
)

alt.layer(base_chart_penalty, final_1970, final_2012, final_2018)

Penalties are volatile overall, so it is difficult to observe patterns or the impact of the rule changes in either total goals or total penalties per year.