In [None]:
import json
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display_html

# Source of code: https://stackoverflow.com/questions/28763891/what-should-i-do-when-tr-has-rowspan
import html_table_to_pandas as html_pd

# Football data from wikipedia

The objective of this notebook is to extract information on British football teams from Wikipedia. It primarily uses the Wikipedia (MediaWiki) API to fetch the data; Beautiful Soup is then used to parse and manipulate the returned HTML.

## Wikipedia API

The documentation for the Wikipedia (MediaWiki) API is here:
https://www.mediawiki.org/wiki/API:Main_page

The inputs to the GET request are:
- URL: the Wikipedia API endpoint used throughout.
- TITLE: the title of the page to be scraped.
- PARAMS: the parameters passed to the API through the query string.

In [None]:
# A Session object persists certain parameters across requests, which results in better performance.
S = requests.Session()

# English Wikipedia API endpoint and the title of the page to scrape.
URL = "https://en.wikipedia.org/w/api.php"
TITLE = "List of football clubs in England"

# Query-string parameters sent with the request:
#   action=parse -> the action is to parse the content of the page
#   page         -> the page to parse (TITLE)
#   format=json  -> the API can return other formats but is standardising on JSON
# Open question kept from the original cell: should 'UTF8' also be passed to
# convert to UTF-8 encoding, or is the response already UTF-8?
PARAMS = dict(action='parse', page=TITLE, format='json')

### Requesting data from a page
Next, we request the data from the wiki page and turn it into a Python object that we can easily manipulate. The response body is interpreted as JSON, which converts it into a Python `dict`.

The scraped data are also stored in a JSON file for future use so that we do not need to make the same request multiple times.

In [None]:
# Disabled on purpose: the triple-quoted string below is not executed. It holds the
# code that requested the club-list page and wrote the response to
# 'list_clubs_in_england.json'. Remove the surrounding quotes to refresh that file.
'''
# Get the response data as a python object. 
R = S.get(url=URL, params=PARAMS)
DATA = R.json() # Interpret the request data as json: print(type(DATA)) --> <class 'dict'>

# Write to JSON file
with open('list_clubs_in_england.json', 'w') as f:
    json.dump(DATA, f)
'''

In [None]:
# Load the cached API response for the club list from disk.
with open('list_clubs_in_england.json') as data_file:
    DATA = json.loads(data_file.read())

### The data structure within the dictionary object:

In [None]:
# Show the top-level keys of the cached API response.
DATA.keys()

In [None]:
# Take the 'parse' entry of the response and show the keys it contains.
parse=DATA['parse']
parse.keys()

In [None]:
# The 'text' entry is indexed by '*'; show the type of the value stored there
# (this is the value later handed to BeautifulSoup as the page HTML).
text = parse['text']
type(text['*'])

### Manipulating the html

#### Defining the function(s)
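The table-parsing functions used below live in the local `html_table_to_pandas` module (imported as `html_pd`). They come from this Stack Overflow answer: https://stackoverflow.com/questions/28763891/what-should-i-do-when-tr-has-rowspan

They are particularly useful when a table uses `rowspan` or `colspan`.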

#### Producing the pandas dataframe

In [None]:
# Parse the page HTML (stored under parse -> text -> '*') using Beautiful Soup
soup = BeautifulSoup(DATA['parse']['text']['*'], "html.parser")
# print(soup.prettify())

# Extract all of the tables from the soup and put them into separate elements of a list.
# find_all is the current bs4 name; findAll is a deprecated alias of it.
alltables = soup.find_all('table')
print("Number of tables found : " , len(alltables))

In [None]:
# Convert tables 1-24 of the page to DataFrames; table 0 is skipped.
# Slicing (rather than indexing over range(1, 25)) avoids an IndexError when the
# page holds fewer than 25 tables.
alltab_parse = [html_pd.main(table) for table in alltables[1:25]]

result = pd.concat(alltab_parse)

# Remove every row labelled 0 (presumably each table's header row - confirm against
# html_pd.main) and renumber. The previous form passed a DataFrame to drop(), which
# treats it as an iterable of its column labels instead of the rows it holds, so the
# wrong row labels were dropped.
result = result.loc[result.index != 0].reset_index(drop=True)
result.columns = ['Club', 'League/Division', 'Level','Nickname','Change 2017-2018']
result.head()

In [None]:
# Keep only the clubs whose division is the Premier League (there are 20 such teams)
is_premier_league = result['League/Division'].eq('Premier League')
prem_teams = result.loc[is_premier_league]
prem_teams.head()

## Extract team information for Premier League clubs

In [None]:
# Disabled on purpose: the triple-quoted string below is not executed. It holds the
# code that requested one page per row of prem_teams and wrote the list of responses
# to 'premier_league_club_data.json'. Remove the surrounding quotes to refresh that file.
'''
# Extract data on all Premier League teams, convert from JSON to dictionary structure. Store each dictionary structure in a list
club_data_json = [(S.get(url=URL, params={'action': "parse",'page': row.Club,'format': "json"})).json() for row in prem_teams.itertuples()]

# Write to a file. Each teams data is written on a single line
with open('premier_league_club_data.json', 'w') as f:
    json.dump(club_data_json, f)
'''

In [None]:
# Load the cached Premier League club pages from disk.
with open('premier_league_club_data.json') as f:
    club_data_json = json.loads(f.read())

In [None]:
# Place the first team page data into bs4
soup_club1 = BeautifulSoup(club_data_json[0]['parse']['text']['*'], "html.parser")

# find_all is the current bs4 name; findAll is a deprecated alias of it.
alltables_club1 = soup_club1.find_all("table")

# Render one table of the page as raw HTML. The index 2 is hard-coded
# (presumably the squad table for this club - confirm if the page layout changes).
html_content = str(alltables_club1[2])
display_html(html_content, raw=True)

In [None]:
# Passing literal HTML text straight to read_html is deprecated in recent pandas;
# wrapping the string in StringIO hands it over as a file-like object instead.
dfs = pd.read_html(StringIO(html_content))
dfs[1]

In [None]:
# Stack the two player tables, remove every row labelled 0 and renumber, then
# discard column 1. The row filter is built from this frame's own index: the previous
# version built the mask from `result.index` (a different DataFrame) and passed a
# DataFrame to drop(), which interprets it as column labels rather than rows.
players_club1 = (
    pd.concat([dfs[1], dfs[2]])
    .loc[lambda frame: frame.index != 0]
    .reset_index(drop=True)
    .drop(columns=1)
)
players_club1.columns = ['No.', 'Position', 'Player']

# Remove link references and roles
# NOTE(review): the pattern keeps exactly two whitespace-separated words, so
# single-word names become NaN and longer names are truncated - confirm this is intended.
players_club1['Player'] = players_club1['Player'].str.extract(r"^(\w+\s\w+)", expand = False)

players_club1.head()

#### Club official info

In [None]:
# Must check the number of columns
# parse_html_table is neither defined nor imported in this notebook, so the call failed
# on a fresh kernel. Use the table parser from html_pd, as the club-list cell does.
# NOTE(review): confirm html_pd.main gives the layout the old helper produced.
club1_officials = html_pd.main(alltables_club1[7])
club1_officials

## Player data

Retrieve data on each individual player.

What kind of data and what format?

### Check for disambiguation page

In [None]:
TITLE_PLAYER = 'Simon Francis'

PARAMS_DISAMB = {
    'action': 'query',     # query information about the page (not a parse of its content)
    'titles': TITLE_PLAYER,
    'format': 'json',      # the API can return other formats but they are standardising to JSON
    'prop': 'categories'   # ask for the categories the page belongs to
}

# Request the categories and fail loudly on an HTTP error status instead of
# trying to decode an error body as JSON.
R = S.get(url=URL, params=PARAMS_DISAMB)
R.raise_for_status()
player_1_query = R.json() # Interpret the response body as JSON --> <class 'dict'>

In [None]:
def f_disambiguation(player_1_query, TITLE_PLAYER):
    """Return the page title to use for a player, allowing for disambiguation pages.

    The query result is serialised to a JSON string and searched for the text
    'Category:All disambiguation pages'. When that text is present the title is
    returned with ' (footballer)' appended; otherwise it is returned unchanged.

    Parameters:
        player_1_query: JSON-serialisable API response for the title.
        TITLE_PLAYER: the page title that was queried.

    Returns:
        The title, with ' (footballer)' appended if the marker text was found.
    """
    marker = 'Category:All disambiguation pages'
    serialised = json.dumps(player_1_query)
    return TITLE_PLAYER + ' (footballer)' if marker in serialised else TITLE_PLAYER

In [None]:
# Rebind TITLE_PLAYER to the title returned by f_disambiguation and display it.
# Caution: this cell is not idempotent. If the query result is flagged as a
# disambiguation page, re-running it without first re-running the cell that sets
# TITLE_PLAYER appends ' (footballer)' a second time.
TITLE_PLAYER = f_disambiguation(player_1_query, TITLE_PLAYER)
TITLE_PLAYER

### Retrieve player info


In [None]:
# Disabled on purpose: the triple-quoted string below is not executed. It holds the
# code that requested the page named by TITLE_PLAYER and wrote the response to
# 'player_data.json'. Remove the surrounding quotes to refresh that file.
'''
PARAMS_PLAYER = {
    'action': 'parse', # the action is to parse the content of the page
    'page': TITLE_PLAYER,
    'format': 'json' # the API can return other formats but they are standardising to JSON
}


# Get the response data as a python object. 
R = S.get(url=URL, params=PARAMS_PLAYER)
DATA_PLAYER = R.json() # Interpret the request data as json: print(type(DATA)) --> <class 'dict'>

# Write to JSON file
with open('player_data.json', 'w') as f:
    json.dump(DATA_PLAYER, f)
'''

In [None]:
# Load the cached single-player API response from disk.
with open('player_data.json') as data_file:
    DATA_PLAYER = json.loads(data_file.read())

In [None]:
# Place the cached player page data into bs4
soup_player1 = BeautifulSoup(DATA_PLAYER['parse']['text']['*'], "html.parser")

# find_all is the current bs4 name; findAll is a deprecated alias of it.
alltables_player1 = soup_player1.find_all("table")

In [None]:
table = alltables_player1[1]
# Extract the table with the helpers from the html_table_to_pandas module. They are
# only reachable through the `html_pd` alias; the bare names are not defined in this
# notebook, so the unqualified calls failed on a fresh kernel.
rows, num_rows, num_cols = html_pd.pre_process_table(table)
df = html_pd.process_rows(rows, num_rows, num_cols)

# Re-do the formatting
df = df[~df[1].isin(['Total', 'Career total', 'Season'])] # Remove rows containing totals and headers
df = df.drop([11, 12], axis = 1) # Remove columns containing totals

# Re-do the headers
player_cols = ['Club', 'Season','Division', 'League - Apps', 'League - Goals', 'FA Cup - Apps', 'FA Cup - Goals', 'League Cup - Apps', 'League Cup - Goals', 'Other - Apps', 'Other - Goals']
df.columns = player_cols # Apply the correct headers

# Remove link references from 'Season' column (keeps the leading "YYYY–YY" part only)
df['Season'] = df['Season'].str.extract(r"^(\d{4}–\d{2})", expand = False)

# TO DO: convert data type of Season column to date/year

df = df.reset_index(drop=True)
df.head()

### Multiple players
Retrieve player stats given a list of player names.

In [None]:
# Disabled on purpose: the triple-quoted string below is not executed. It holds the
# code that requested a page for every name in players_club1['Player'], retried with
# the title returned by f_disambiguation when that differed, and wrote the collected
# responses to 'players_data_test.json'. Remove the surrounding quotes to refresh that file.
# NOTE(review): here f_disambiguation receives an action=parse response, whereas the
# earlier single-player check passes an action=query / prop=categories response -
# confirm the disambiguation marker text actually appears in parse output.
'''
DATA_PLAYERS_TEAM = []

for index in players_club1.index:
    PLAYER = players_club1.at[index,'Player']
    PARAMS_PLAYER = {
    'action': 'parse', # the action is to parse the content of the page
    'page': PLAYER,
    'format': 'json' # the API can return other formats but they are standardising to JSON
    }
    
    # Get the response data as a python object. 
    R = S.get(url=URL, params=PARAMS_PLAYER)
    DATA_PLAYER = R.json() # Interpret the request data as json: print(type(DATA)) --> <class 'dict'>
    TITLE_PLAYER = f_disambiguation(DATA_PLAYER, PLAYER)
    
    if (TITLE_PLAYER == PLAYER):
        DATA_PLAYERS_TEAM.append(DATA_PLAYER)
    else:
        PARAMS_PLAYER = {
        'action': 'parse', # the action is to parse the content of the page
        'page': TITLE_PLAYER,
        'format': 'json' # the API can return other formats but they are standardising to JSON
        }

        # Get the response data as a python object. 
        R = S.get(url=URL, params=PARAMS_PLAYER)
        DATA_PLAYER = R.json() # Interpret the request data as json: print(type(DATA)) --> <class 'dict'>
        DATA_PLAYERS_TEAM.append(DATA_PLAYER)

# Write to JSON file
with open('players_data_test.json', 'w') as f:
    json.dump(DATA_PLAYERS_TEAM, f)
'''

In [None]:
# Load the cached responses for the whole squad from disk.
# NOTE(review): this rebinds DATA_PLAYER, which earlier held one player's response,
# to the contents of the file written from the DATA_PLAYERS_TEAM list.
with open('players_data_test.json') as data_file:
    DATA_PLAYER = json.loads(data_file.read())