## Convert XML files into DataFrames and load into SQLite

In [1]:
import gc
import glob
import logging
import multiprocessing
import numpy as np
import os
import pandas as pd
import re
import warnings
from dateutil import parser
from functools import partial
from lxml import etree
import requests
from bs4 import BeautifulSoup

COMPETITIONS = {
    4: 'World Cup',
    5: 'Champions League',
    6: 'Europa League',
    8: 'English Barclays Premier League',
    9: 'Dutch Eredivisie',
    10: 'Football League Championship',
    21: 'Italian Serie A',
    22: 'German Bundesliga',
    23: 'Spanish La Liga',
    24: 'French Ligue 1',
    98: 'US Major League Soccer',
    114: 'Turkish Super Lig',
    129: 'Russian Premier League',
    199: 'Mexican Liga MX - Apertura',
    214: 'Australian A-League',
    363: 'Brazilian Serie A',
    385: 'Mexican Liga MX - Clausura',
}


TIME_SLICE_EVENTS = [
    'action_areas',
    'all_passes',
    'balls_out',
    'blocked_events',
    'cards',
    'clearances',
    'corners',
    'crosses',
    'extra_heat_maps',
    'fouls',
    'goal_keeping',
    'goals_attempts',
    'headed_duals',
    'interceptions',
    'keepersweeper',
    'offside',
    'oneonones',
    'setpieces',
    'tackles',
    'takeons',
]
ALL_STATISTICS = sorted(TIME_SLICE_EVENTS + ['players', 'teams'])

logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)


class SquawkaReport:
    """Squawka match report object.

    :param path: Path to XML-file to generate match report from.
    """

    def __init__(self, path):
        self.__time_slice_events = TIME_SLICE_EVENTS
        self.path = path
        self.xml = self.read_xml(path)

    # See: https://stackoverflow.com/questions/10967551/how-do-i-dynamically-create-properties-in-python
    def __getattr__(self, name):
        if name in self.__time_slice_events:
            return self._parse_timeslice(name)
        else:
            msg = "'{0}' object has no attribute '{1}'"
            raise AttributeError(msg.format(type(self).__name__, name))

    @staticmethod
    def read_xml(path):
        """Read XML file.
        :param path: Path to XML-file.
        :return: XML tree.
        """
        with open(path, 'r') as f:
            data = f.read()
        xml = etree.fromstring(data)
        return xml

    def _parse_timeslice(self, filter_type):
        xpath = '/squawka/data_panel/filters/{filter_type}/time_slice/event'
        return self._get_elements(xpath.format(filter_type=filter_type))

    def _get_elements(self, xpath):
        elements = self.xml.xpath(xpath)
        if elements:
            return self._parse_elements(elements)
        else:
            return None

    def _parse_elements(self, elements):
        parsed = [dict({c.tag: c.text for c in
                        e.getchildren()}.items() + e.attrib.items())
                  for e in elements]
        return parsed

    @property
    def competition(self):
        return re.findall("/(.*)_\d*.xml", self.path)[0]

    @property
    def filters(self):
        filters_element = self.xml.xpath('/squawka/data_panel/filters')
        if filters_element:
            return [ch.tag for ch in filters_element[0].getchildren()]
        # Some match reports don't have data.
        else:
            return None

    @property
    def kickoff(self):
        date = self.xml.xpath("/squawka/data_panel/game/kickoff/text()")[0]
        return parser.parse(date).strftime('%Y-%m-%d %H:%M:%S %z')

    @property
    def match_id(self):
        return int(re.findall("/.*_(\d+).xml", self.path)[0])

    @property
    def name(self):
        return self.xml.xpath("/squawka/data_panel/game/name/text()")[0]

    @property
    def players(self):
        # TODO: Remove non-player elements
        xpath = '/squawka/data_panel/players/player'
        return self._get_elements(xpath)

    @property
    def teams(self):
        xpath = '/squawka/data_panel/game/team'
        return self._get_elements(xpath)

    @property
    def venue(self):
        return self.xml.xpath("/squawka/data_panel/game/venue/text()")[0]

    @property
    def match_info(self):
        info = ({
            'competition': self.competition,
            'kickoff': self.kickoff,
            'match_id': self.match_id,
            'name': self.name,
            'venue': self.venue,
        })
        for team in self.teams:
            for k in ['id', 'short_name']:
                info['_'.join((team['state'], k))] = team[k]
        return info


def stats_from_file(path, statistic, convert=True):
    """Load data for a statistic from file.

    :param path: Path to file.
    :param statistic: Statistic to load (e.g. 'goals_attempts', 'cards').
    :param convert: Process and clean the data (boolean)
    :return pd.DataFrame with data
    """
    report = SquawkaReport(path)
    return stats_from_report(report, statistic, convert)


def stats_from_report(report, statistic, convert=True):
    """Load data for a statistic from a SquawkaReport object.

    :param report: SquawkaReport object
    :param statistic: Statistic to load (e.g. 'goals_attempts', 'cards').
    :param convert: Process and clean the data (boolean)
    :return pd.DataFrame with data
    """
    stats = pd.DataFrame(getattr(report, statistic))
    stats['competition'] = report.competition
    stats['kickoff'] = report.kickoff
    stats['match_id'] = report.match_id
    if convert:
        return convert_export(stats)
    else:
        return stats


def export_all_stats(xml_dir, out_dir, statistics=ALL_STATISTICS, convert=True, n_jobs=None,
                     sequential=('all_passes', 'extra_heat_maps')):
    """Export all statistics from all XML-files in a folder to CSV.

    :param xml_dir: Path to folder containing XML-files
    :param out_dir: Path to folder to save output to
    :param statistics: Statistics to export
    :param convert: Process and clean the data (boolean)
    :param n_jobs: Number of processes to use
    :param sequential: Iterable with statistics to process sequentially (for memory-intensive stats)
    """

    xml_paths = glob.glob(os.path.join(xml_dir, '*.xml'))

    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count() - 1

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    pool = multiprocessing.Pool(n_jobs)
    for statistic in statistics:
        if statistic in sequential:
            df = pd.concat((_load_xml(p, statistic) for p in xml_paths), axis=0, ignore_index=True)
        else:
            partial_loader = partial(_load_xml, statistic=statistic)
            df = pd.concat(pool.imap(partial_loader, xml_paths), axis=0, ignore_index=True)
        if convert:
            df = convert_export(df)
        save_path = os.path.join(out_dir, '{}.csv'.format(statistic))
        df.to_csv(save_path, index=False, encoding='utf8')
        logger.debug("Exported %s to %s", statistic, save_path)


def _load_xml(path, statistic):
    """Load XML files ignoring etree.XMLSyntaxErrors.

    :param path: Path to file.
    :param statistic: Statistic to load (e.g. 'goals_attempts', 'cards').
    :return: XML tree (or None on etree.XMLSyntaxError).
    """
    try:
        return stats_from_file(path, statistic)
    except etree.XMLSyntaxError:
        msg = "XML error loading {}, skipping it...".format(path)
        warnings.warn(msg, RuntimeWarning)


def convert_export(df):
    """Convert a statistics export.
    :param df: pd.DataFrame with statistics (see e.g. stats_from_file())
    :return: processed pd.DataFrame
    """

    def parse_indicator(s, indicator):
        return s.notnull() & (s == indicator)  # Nulls are interpreted as False

    convert_cols = {
        'id': 'int',
        'match_id': 'int',
        'mins': 'int',
        'minsec': 'int',
        'secs': 'int',
        'team_id': 'int'
    }
    coordinate_cols = [
        'end',
        'loc',
        'middle',
        'start',
    ]
    indicator_cols = {
        'is_own': 'yes',
        'headed': 'true',  # Note: ignores all falses
        'shot': 'true',  # Note: ignores all falses
    }
    # Convert strings to ints.
    for col in df.columns.intersection(convert_cols):
        df[col] = df[col].replace('', -1)
        df.loc[df[col].isnull(), col] = -1
        df[col] = df[col].astype(convert_cols[col])

    # Convert indicator cols.
    for col in df.columns.intersection(indicator_cols):
        df[col] = parse_indicator(df[col], indicator_cols[col])

    # Convert coordinate cols.
    for col in df.columns.intersection(coordinate_cols):
        df[[col + '_x', col + '_y']] = split_coordinates(df[col])
        df.drop(col, axis=1, inplace=True)

    return df


def split_coordinates(s):
    """Split Series containing strings with coordinates into a DataFrame.

    :param s: pd.Series
    :return: pd.DataFrame with columns 'x' and 'y'
    """
    if s.notnull().all():
        concatenated = s
    else:
        concatenated = s.copy()
        concatenated.loc[concatenated.isnull()] = ','
    split = pd.DataFrame(concatenated.str.split(',').tolist(), columns=['x', 'y'], dtype=float)
    return split.replace('', np.nan)

In [2]:
# Functions to pre-load for following future stats dataframe

# Create function to re-format date for SQLite db - PHIL CODE
def changedob(value):
    year = value[6:10]
    month = value[3:5]
    day = value[0:2]
    new_dob = year + "-" + month + "-" + day
    return new_dob

# Create function to count goals & assists - PHIL CODE
# def goalsfunc(value):
#     goals = goals_attempts_df[(goals_attempts_df['player_id'] == value) & (goals_attempts_df['type'] == 'goal')]
#     goals = goals.shape[0]
#     return goals

# def assistsfunc(value):
#     assists = goals_attempts_df[(goals_attempts_df['assist_1'] == value) & (goals_attempts_df['type'] == 'goal')]
#     assists = assists.shape[0]
#     return assists

In [3]:
# The following is all PHIL CODE
def masterfunc(filename):
    try:
        # Load file into Squawka Report class
        report = SquawkaReport(filename)

        # Code to generate players dataframe - PHIL CODE
        players_df = pd.DataFrame(report.players)
        players_df['dob'] = players_df['dob'].apply(changedob)
        players_df['bmi'] = players_df['bmi'].astype(float)
        players_df['age'] = players_df['age'].astype(float)
        players_df['idn'] = players_df['id'].astype(float)
        players_df.drop('id', axis = 1, inplace = True)
        players_df['team_id'] = players_df['team_id'].astype(float)
        players_df['shirt_num'] = players_df['shirt_num'].astype(float)
        players_df['height'] = [float(x) if x != 'Unknown' else np.NaN for x in players_df['height']]
        players_df['weight'] = [float(x) if x != 'Unknown' else np.NaN for x in players_df['weight']]

        # Code to generate kickoff year and season code - PHIL CODE
        kickoff_time = report.kickoff
        match_year = int(kickoff_time[0:4])
        match_month = int(kickoff_time[5:7])
        if match_month <= 6:
            start_year = match_year - 1
        if match_month > 6:
            start_year = match_year
        next_year = start_year + 1
        next_year = str(next_year)[2:4]
        season_code = str(start_year)[2:4] + "/" + next_year
        #return start_year, season_code

        # Code to generate goal attempts dataframe and update column data types

        goals_attempts_df = pd.DataFrame(report.goals_attempts)
        try:
            goals_attempts_df['player_id'] = goals_attempts_df['player_id'].astype(float)
        except:
            goals_attempts_df['player_id'] = np.NaN
        try:
            goals_attempts_df['assist_1'] = goals_attempts_df['assist_1'].astype(float)
        except:
            goals_attempts_df['assist_1'] = np.NaN

        def goalsfunc(value):
            goals = goals_attempts_df[(goals_attempts_df['player_id'] == value) & (goals_attempts_df['type'] == 'goal')]
            goals = goals.shape[0]
            return goals

        def assistsfunc(value):
            assists = goals_attempts_df[(goals_attempts_df['assist_1'] == value) & (goals_attempts_df['type'] == 'goal')]
            assists = assists.shape[0]
            return assists

        # Code to pass file into BS4
        file = open(filename)
        page = file.read()
        soup = BeautifulSoup(page, "xml")

        # Code to create teams dataframe to call on team names
        teams_df = pd.DataFrame(report.teams)

        # Code to create dictionaries to store winning team & whether clean sheets
        headline = soup.find('headline').get_text()
        if teams_df.loc[0,'short_name'] in headline:
            headline = headline.replace(teams_df.loc[0,'short_name'],'H')
        if teams_df.loc[0, 'long_name'] in headline:
            headline = headline.replace(teams_df.loc[0,'long_name'],'H')
        if teams_df.loc[1,'short_name'] in headline:
            headline = headline.replace(teams_df.loc[1,'short_name'],'A')
        if teams_df.loc[1, 'long_name'] in headline:
            headline = headline.replace(teams_df.loc[1,'long_name'],'A')
        score = headline
        length = len(score)
        dash_loc = score.index('-')
        if dash_loc == 4:
            h_score = float(score[2])
        elif dash_loc == 5:
            h_score == float(score[2:4])
        away_score_len = length - dash_loc
        a_score_loc = dash_loc + 2
        if away_score_len == 4:
            a_score = float(score[a_score_loc])
        elif away_score_len == 5:
            a_score = float(score[a_score_loc:a_score_loc+2])
        if h_score > a_score:
            h_result = 'W'
            a_result = 'L'
        elif h_score == a_score:
            h_result = 'D'
            a_result = 'D'
        else:
            h_result = 'L'
            a_result = 'W'
        if a_score == 0:
            h_clean = 1
        else: 
            h_clean = 0
        if h_score == 0:
            a_clean = 1
        else:
            a_clean = 0
        result_dict = {float(teams_df.loc[0,'id']): h_result, float(teams_df.loc[1,'id']): a_result}
        clean_sheet_dict = {float(teams_df.loc[0,'id']): h_clean, float(teams_df.loc[1,'id']): a_clean}
    
        # Create function to obtain match stats and push to a DataFrame 
    
        final_stats = soup.find('time_slice', {"name": "85 - 90"}).findAllNext('player_inf_score')
        match_id = report.path.replace("XMLfiles/","").replace(".xml","")

        df_stats = pd.DataFrame(columns=["idn","attack","defense","possession","goalkeeping","score",
                                         "match_id","start_year","season_code"])

        for x in final_stats:
            
            try:
                if x['injurytime_play'] == '1':
                    idn = float(x['id'])
                    attack = float(x['attack'])
                    defense = float(x['defense'])
                    possession = float(x['possession'])
                    goalkeeping = float(x['goalkeeping'])
                    score = float(x.text)
                    df_stats.loc[len(df_stats)] = [idn,attack,defense,possession,goalkeeping,score,match_id,
                                                   start_year,season_code]
            except:
                idn = float(x['id'])
                attack = float(x['attack'])
                defense = float(x['defense'])
                possession = float(x['possession'])
                goalkeeping = float(x['goalkeeping'])
                score = float(x.text)
                df_stats.loc[len(df_stats)] = [idn,attack,defense,possession,goalkeeping,score,match_id,
                                               start_year,season_code]

        # Insert players' goals and assists from game
        df_stats['goals'] = df_stats['idn'].apply(goalsfunc)
        df_stats['assists'] = df_stats['idn'].apply(assistsfunc)

        df_stats = df_stats[df_stats['idn'] != 0]

        # Insert team id
        df_stats = df_stats.merge(players_df[['team_id','idn']], on=['idn'])

        # Insert whether the player started
        df_stats = df_stats.merge(players_df[['state','age','idn']], on=['idn'])
        df_stats['state'] = [1 if x == 'playing' else 0 for x in df_stats['state']]
        df_stats['sub'] = [1 if x == 0 else 0 for x in df_stats['state']]
        df_stats['result'] = df_stats['team_id'].map(result_dict)
        df_stats['clean_sheet'] = df_stats['team_id'].map(clean_sheet_dict)

        # Remove columns not needed before uploaded to SQLite
        players_df = players_df.drop(['photo','profile_url','x_loc','y_loc','total_influence'], axis = 1)

        # Insert players dataframe into SQLite
        players_df.to_sql(name = 'playertable', con = connection, if_exists = 'append', index = False)

        # Insert stats dataframe into SQLite
        df_stats.to_sql(name = 'stats', con = connection, if_exists = 'append', index = False)
    
        return players_df, df_stats
    
    except:
        
        return filename

## Create connection to SQL

In [4]:
import time
from pandas.io import sql
import sqlite3
connection = sqlite3.connect('../sql/stats_extra_games.db.sqlite')

## Create XML file list

In [10]:
list_of_XML_files = []

for filename in os.listdir("XMLfiles/"):
    if filename != '.DS_Store':
        list_of_XML_files.append("XMLfiles/"+filename)

## Create loop to take each XML file, convert into DFs, load into SQL

In [11]:
list_of_broken = []

start_time=time.time()

count = 0
total_files = len(list_of_XML_files)

for xml in list_of_XML_files:
    #print xml
    output = masterfunc(xml)
    if len(output) > 2:
        list_of_broken.append(output)
        print('File error: {}'.format(xml))
    count += 1
    total_files -= 1
    perc = round(float(count) / len(list_of_XML_files)*100,2)
    elapsed_time=time.time()-start_time
    time_remaining = (float(elapsed_time) / count) * total_files
    time_remaining = round(time_remaining / 60,2)
    if count % 10 == 0:
        print('{}/{} XML files uploaded ({}% complete), most recent file: {}. Minutes remaining: {}'
              .format(count, total_files, perc, xml, time_remaining))

10/116 XML files uploaded (7.94% complete), most recent file: XMLfiles/32933. Minutes remaining: 1.66
20/106 XML files uploaded (15.87% complete), most recent file: XMLfiles/32943. Minutes remaining: 1.43
30/96 XML files uploaded (23.81% complete), most recent file: XMLfiles/33325. Minutes remaining: 1.3
40/86 XML files uploaded (31.75% complete), most recent file: XMLfiles/33335. Minutes remaining: 1.16
50/76 XML files uploaded (39.68% complete), most recent file: XMLfiles/34252. Minutes remaining: 1.03
60/66 XML files uploaded (47.62% complete), most recent file: XMLfiles/34262. Minutes remaining: 0.89
70/56 XML files uploaded (55.56% complete), most recent file: XMLfiles/34620. Minutes remaining: 0.75
80/46 XML files uploaded (63.49% complete), most recent file: XMLfiles/34630. Minutes remaining: 0.62
90/36 XML files uploaded (71.43% complete), most recent file: XMLfiles/34934. Minutes remaining: 0.48
100/26 XML files uploaded (79.37% complete), most recent file: XMLfiles/34944. Min

In [8]:
list_of_broken

['XMLfiles/14357',
 'XMLfiles/14389',
 'XMLfiles/14436',
 'XMLfiles/15350',
 'XMLfiles/15558',
 'XMLfiles/27957',
 'XMLfiles/27958',
 'XMLfiles/27959',
 'XMLfiles/27960',
 'XMLfiles/28874',
 'XMLfiles/29636',
 'XMLfiles/29669',
 'XMLfiles/29772',
 'XMLfiles/29778',
 'XMLfiles/29941',
 'XMLfiles/30107',
 'XMLfiles/30900',
 'XMLfiles/31079',
 'XMLfiles/31086',
 'XMLfiles/31131',
 'XMLfiles/31142',
 'XMLfiles/31157',
 'XMLfiles/31355',
 'XMLfiles/32332',
 'XMLfiles/32350',
 'XMLfiles/8336']

### SQLite code to create new table with duplicates removed (execute within SQLite)

CREATE TABLE newplayers AS
SELECT DISTINCT
age, bmi, country, dob, first_name, height, last_name, name, position, shirt_num, state, surname, team_id, team_name, weight, idn
FROM
playertable;

players.head(2)

## Old code

match_stats(xml)
    df_stats.to_sql(name = 'end_game_stats', con = connection, if_exists = 'replace', index = False)
    players_df = pd.DataFrame(report.players)
    players_df.to_sql(name = 'players', con = connection, if_exists = 'replace', index = False)

In [6]:
len(list_of_XML_files)

8023

In [None]:
list_of_XML_files = ["XMLfiles/2296"]

In [6]:
result = masterfunc("XMLfiles/8445")
result

(     age   bmi         country         dob  first_name  height   last_name  \
 0   27.0  21.2           Wales  1989-02-07        Neil   175.0      Taylor   
 1   32.0  23.0           Wales  1984-08-23      Ashley   183.0    Williams   
 2   25.0  22.2         England  1991-05-22        Kyle   185.0     Bartley   
 3   31.0  23.0          Poland  1985-04-18      Lukasz   190.0   Fabianski   
 4   28.0  22.6  Korea Republic  1989-01-24  Sung-yueng   187.0          Ki   
 5   28.0  20.4     Netherlands  1988-05-27      Marvin   180.0       Emnes   
 6   24.0  23.4         England  1992-02-27       Jonjo   185.0     Shelvey   
 7   29.0  22.0         England  1987-11-29      Nathan   165.0        Dyer   
 8   28.0  25.4   Côte d'Ivoire  1988-12-10    Wilfried   182.0        Bony   
 9   32.0  22.1         England  1985-01-07       Wayne   170.0   Routledge   
 10  38.0  23.8         Germany  1978-11-16     Gerhard   190.0     Tremmel   
 11  31.0  22.7          France  1985-08-06   Baféti

goals_attempts

def test(filename):
    
    # Load file into Squawka Report class
    report = SquawkaReport(filename)
    
    # Code to generate players dataframe - PHIL CODE
    players_df = pd.DataFrame(report.players)
    players_df['dob'] = players_df['dob'].apply(changedob)
    players_df['bmi'] = players_df['bmi'].astype(float)
    players_df['age'] = players_df['age'].astype(float)
    players_df['idn'] = players_df['id'].astype(float)
    players_df.drop('id', axis = 1, inplace = True)
    players_df['team_id'] = players_df['team_id'].astype(float)
    players_df['shirt_num'] = players_df['shirt_num'].astype(float)
    players_df['height'] = [float(x) if x != 'Unknown' else np.NaN for x in players_df['height']]
    players_df['weight'] = [float(x) if x != 'Unknown' else np.NaN for x in players_df['weight']]
    
    # Code to generate kickoff year and season code - PHIL CODE
    kickoff_time = report.kickoff
    match_year = int(kickoff_time[0:4])
    match_month = int(kickoff_time[5:7])
    if match_month <= 6:
        start_year = match_year - 1
    if match_month > 6:
        start_year = match_year
    next_year = start_year + 1
    next_year = str(next_year)[2:4]
    season_code = str(start_year)[2:4] + "/" + next_year
    #return start_year, season_code
    
    # Code to generate goal attempts dataframe and update column data types
    goals_attempts = pd.DataFrame(report.goals_attempts)
    try:
        goals_attempts['player_id'] = goals_attempts['player_id'].astype(float)
    except:
        goals_attempts['player_id'] = np.NaN
    try:
        goals_attempts['assist_1'] = goals_attempts['assist_1'].astype(float)
    except:
        goals_attempts['assist_1'] = np.NaN
    return goals_attempts
    
    # Code to pass file into BS4
    file = open(filename)
    page = file.read()
    soup = BeautifulSoup(page, "xml")
    
    # Code to create teams dataframe to call on team names
    teams_df = pd.DataFrame(report.teams)
    
    # Code to create dictionaries to store winning team & whether clean sheets
    headline = soup.find('headline').get_text()
    if teams_df.loc[0,'short_name'] in headline:
        headline = headline.replace(teams_df.loc[0,'short_name'],'H')
    if teams_df.loc[0, 'long_name'] in headline:
        headline = headline.replace(teams_df.loc[0,'long_name'],'H')
    if teams_df.loc[1,'short_name'] in headline:
        headline = headline.replace(teams_df.loc[1,'short_name'],'A')
    if teams_df.loc[1, 'long_name'] in headline:
        headline = headline.replace(teams_df.loc[1,'long_name'],'A')
    score = headline
    length = len(score)
    dash_loc = score.index('-')
    if dash_loc == 4:
        h_score = float(score[2])
    elif dash_loc == 5:
        h_score == float(score[2:4])
    away_score_len = length - dash_loc
    a_score_loc = dash_loc + 2
    if away_score_len == 4:
        a_score = float(score[a_score_loc])
    elif away_score_len == 5:
        a_score = float(score[a_score_loc:a_score_loc+2])
    
    if h_score > a_score:
        h_result = 'W'
        a_result = 'L'
    elif h_score == a_score:
        h_result = 'D'
        a_result = 'D'
    else:
        h_result = 'L'
        a_result = 'W'
    if a_score == 0:
        h_clean = 1
    else: 
        h_clean = 0
    if h_score == 0:
        a_clean = 1
    else:
        a_clean = 0
    result_dict = {float(teams_df.loc[0,'id']): h_result, float(teams_df.loc[1,'id']): a_result}
    clean_sheet_dict = {float(teams_df.loc[0,'id']): h_clean, float(teams_df.loc[1,'id']): a_clean}
    
    #return players_df

In [19]:
report = SquawkaReport("XMLfiles/8445")

In [21]:
goals_attempts = pd.DataFrame(report.goals_attempts)

BROKEN MLS / ALEAGUE / EPL files:
'XMLfiles/27957',
 'XMLfiles/27958',
 'XMLfiles/27959',
 'XMLfiles/27960',
 'XMLfiles/28874',
 'XMLfiles/32332',
 'XMLfiles/32350',
 'XMLfiles/8336'