# Data Extraction & Transformation

##### Parsing raw StatsBomb data and storing it in a Pandas DataFrame

---

In [1]:
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from statsbombpy import sb
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

- `requests` is a great library for executing HTTP requests
- `pandas` is a data analysis and manipulation package
- `tqdm` is a clean progress bar library

---

In [2]:
# Root of the raw StatsBomb open-data files on GitHub.
base_url = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"

# URL templates; the `{}` placeholders are filled in later with `.format()`:
#   comp_url  -> (competition_id, season_id)
#   match_url -> (match_id,)
comp_url = "".join((base_url, "matches/{}/{}.json"))
match_url = "".join((base_url, "events/{}.json"))

These URLs are the locations where the raw StatsBomb data lives. Notice the `{}` in there, which are dynamically replaced with IDs with `.format()`

___

In [3]:
def parse_data(competition_id, season_id):
    """Download every shot of one competition season into a DataFrame.

    The match list for the given competition/season is fetched from
    ``comp_url``; the event feed of each listed match is then fetched from
    ``match_url``. Events whose type name is ``"Shot"`` are flattened into
    one row each.

    Parameters
    ----------
    competition_id
        StatsBomb competition identifier, substituted into ``comp_url``.
    season_id
        StatsBomb season identifier, substituted into ``comp_url``.

    Returns
    -------
    pandas.DataFrame
        One row per shot with the columns ``match_id``, ``team``,
        ``player``, ``x``, ``y``, ``type``, ``end_x``, ``end_y``,
        ``technique``, ``aerial_won``, ``follows_dribble``, ``first_time``,
        ``open_goal``, ``deflected``, ``xg`` and ``outcome``.

    Raises
    ------
    requests.HTTPError
        If the match list or an event feed answers with an error status.
    requests.Timeout
        If a request does not complete within the timeout.
    KeyError
        If a shot event lacks one of the mandatory fields.
    """
    # Flags that may be absent from a shot's payload; a missing key means False.
    optional_flags = (
        "aerial_won",
        "follows_dribble",
        "first_time",
        "open_goal",
        "deflected",
    )
    request_timeout = 30  # seconds; stops a stalled download from hanging the notebook

    def fetch_json(session, url):
        """GET `url` and return the decoded JSON, failing loudly on HTTP errors."""
        response = session.get(url, timeout=request_timeout)
        # Without this, an error page would only surface later as a confusing
        # JSON-decoding or indexing error.
        response.raise_for_status()
        return response.json()

    all_events = []
    # A single session reuses the connection across the per-match requests.
    with requests.Session() as session:
        matches = fetch_json(session, comp_url.format(competition_id, season_id))
        match_ids = [m['match_id'] for m in matches]

        for match_id in tqdm(match_ids):
            events = fetch_json(session, match_url.format(match_id))

            for event in events:
                if event['type']['name'] != "Shot":
                    continue

                shot = event['shot']
                # Key insertion order fixes the column order of the result.
                attributes = {
                    "match_id": match_id,
                    "team": event["possession_team"]["name"],
                    "player": event['player']['name'],
                    "x": event['location'][0],
                    "y": event['location'][1],
                    "type": shot['type']['name'],
                    "end_x": shot['end_location'][0],
                    "end_y": shot['end_location'][1],
                    "technique": shot['technique']['name'],
                }
                attributes.update(
                    (flag, shot.get(flag, False)) for flag in optional_flags
                )
                attributes["xg"] = shot['statsbomb_xg']
                attributes["outcome"] = shot['outcome']['name']

                all_events.append(attributes)

    return pd.DataFrame(all_events)

The `parse_data` function handles the full Extract & Transform process.

The sequence of events is this:
1. The list of matches is loaded into the `matches` list.
2. Match IDs are extracted into a separate list using a list comprehension on `matches`.
3. Iterate over the match IDs, and load each match's raw data into the `events` list.
4. Shots are extracted into a separate list using a list comprehension as a filter on `events`.
5. Iterate over shots and extract individual features and store them in the `attributes` dictionary.
6. Append each shot's `attributes` into the `all_events` list.
7. Return a Pandas DataFrame from the `all_events` list.

---

In [4]:
# StatsBomb competition and season identifiers handed to `parse_data`.
competition_id, season_id = 11, 1

- `competition_id = 11` - StatsBomb's Competition ID for La Liga
- `season_id = 1` - StatsBomb's Season ID for the 2017/2018 season

The `parse_data` function is executed, and its output is placed in the variable `df`.

The progress bar is produced by `tqdm`

---

In [5]:
# Competition 11, season 1 -> 2017/2018
df = parse_data(competition_id=competition_id, season_id=season_id)

100%|██████████| 36/36 [00:19<00:00,  1.81it/s]


In [6]:
# Competition 11, season 4 -> 2018/2019
df1 = parse_data(competition_id=11, season_id=4)

100%|██████████| 34/34 [00:18<00:00,  1.89it/s]


In [7]:
# Competition 11, season 2 -> 2016/2017
df2 = parse_data(competition_id=11, season_id=2)

100%|██████████| 34/34 [00:18<00:00,  1.88it/s]


In [8]:
# Competition 11, season 27 -> 2015/2016
df3 = parse_data(competition_id=11, season_id=27)

100%|██████████| 33/33 [00:22<00:00,  1.44it/s]


In [9]:
# Competition 11, season 26 -> 2014/2015
df4 = parse_data(competition_id=11, season_id=26)

100%|██████████| 38/38 [00:24<00:00,  1.57it/s]


In [10]:
# Stack the five season frames row-wise into one frame. Each frame keeps its
# own row labels, so index values repeat across seasons in the result.
li = [df1, df, df2, df3, df4]
laligadf = pd.concat(objs=li, axis=0)


In [11]:
# Cache the raw combined shots on disk so later cells can reload them
# without downloading every match again.
laligadf.to_pickle(path='unmodifiedlaliga')

In [12]:
# Reload the cached raw shots so this cell always starts from the same input.
# NOTE: only unpickle files this notebook wrote itself; unpickling untrusted
# data can execute arbitrary code.
laligadf = pd.read_pickle('unmodifiedlaliga')
laligadf = laligadf.drop(columns='match_id')

# Binary target: 1 when the shot outcome is 'Goal', 0 otherwise.
laligadf['outcome'] = laligadf['outcome'].eq('Goal').astype('int64')

# Encode the boolean shot flags as 0/1 integers (vectorised, no row-wise apply).
flag_columns = ['aerial_won', 'follows_dribble', 'first_time', 'open_goal', 'deflected']
for flag in flag_columns:
    laligadf[flag] = laligadf[flag].eq(True).astype('int64')

# One-hot encode every remaining string column.
laligadf = pd.get_dummies(laligadf)

# `info()` prints its report itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
laligadf.info()

# reset_index() moves the (repeating, per-season) row labels into an 'index'
# column and gives the frame a fresh 0..n-1 index before saving.
laligadfmod = laligadf.reset_index()
laligadfmod.to_pickle('laligadfmod')

KeyError: "['index'] not found in axis"

In [None]:
# Target vector: the 0/1 goal indicator.
y = laligadf['outcome']

# Feature matrix: every column except the target.
# NOTE(review): this still includes `xg`, `end_x` and `end_y`; confirm they
# are intended as model inputs.
X = laligadf.drop(columns='outcome')

# 50/50 split, shuffled with a fixed seed and stratified on the target so
# both halves keep the same goal rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=123,
    shuffle=True,
    stratify=y,
)