# **Time Series Reports of the Premier League**
**Report on how Premier League players and teams changed over the 2019/2020 season**

Notebook made by:
> Marcelo Landivar & Adam Svenson 

Open this notebook in Google Colaboratory:   
>[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/1_RjRfLW-Hw29PT3PpX0mxZE6TdkHE4z8/view?usp=sharing)


### **Notebook description:**

The goal of this notebook is to scrape the https://www.fifaindex.com/ website for player indexes over time using BeautifulSoup. The steps are as follows:

1. Scrape the website for the information we want.
2. Save that information in JSON format so it is easy to access.
3. Aggregate the data based on teams to get some interesting metrics.
4. Generate plots to be able to see the development of teams over time and team vs. team.
5. Save these plots as a pdf output. 



### **Setup**

In [None]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import os
import json
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
!pip install PyPDF2
from PyPDF2 import PdfFileMerger

Collecting PyPDF2
[?25l  Downloading https://files.pythonhosted.org/packages/b4/01/68fcc0d43daf4c6bdbc6b33cc3f77bda531c86b174cac56ef0ffdb96faab/PyPDF2-1.26.0.tar.gz (77kB)
[K     |████████████████████████████████| 81kB 2.2MB/s 
[?25hBuilding wheels for collected packages: PyPDF2
  Building wheel for PyPDF2 (setup.py) ... [?25l[?25hdone
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-cp36-none-any.whl size=61086 sha256=379018652d818349cbcc00f34ce9067e0a7a84ef4447fcdd04f39f0f34336e63
  Stored in directory: /root/.cache/pip/wheels/53/84/19/35bc977c8bf5f0c23a8a011aa958acd4da4bbd7a229315c1b7
Successfully built PyPDF2
Installing collected packages: PyPDF2
Successfully installed PyPDF2-1.26.0


### **Web Scraping and Saving the Results to a JSON File**

In [None]:
#building a function that gets all the links that we need to do the scraping
def get_links(base_url, url_to_search):
  """Collect the team page links listed on a fifaindex team-overview page.

  Args:
    base_url: Site root that each relative team link is appended to.
    url_to_search: URL of the overview page that lists the teams.

  Returns:
    A DataFrame with one row per team and the columns ``links`` (the href
    found on the page), ``teams`` (the text of the table cell) and
    ``final_url`` (``base_url`` + href). The three columns are present even
    when no team was found.
  """
  response = requests.get(url_to_search)
  soup = BeautifulSoup(response.content, "html.parser")

  rows = []
  for cell in soup.find_all('td', {'data-title': 'Name'}):
    anchor = cell.find('a', {'class': 'link-team'}, href=True)
    if anchor is None:
      # A "Name" cell without a team link cannot be followed; skip it
      # instead of failing on None.get(...).
      continue
    href = anchor.get('href')
    rows.append({
        'links': href,
        'teams': cell.text,
        'final_url': base_url + str(href),
    })

  # Build the frame once: DataFrame.append no longer exists in pandas >= 2.0,
  # and re-deriving final_url on every iteration was quadratic.
  return pd.DataFrame(rows, columns=['links', 'teams', 'final_url'])

In [None]:
# Team listing to scrape (league=13, descending order) — presumably the
# Premier League, per the notebook title; confirm against the site.
url_to_search = "https://www.fifaindex.com/teams/?league=13&order=desc"
# Site root; the relative hrefs found on the listing page are appended to it.
base_url = "https://www.fifaindex.com"
# One row per team with its relative link, name and absolute URL.
df_teams_links = get_links(base_url, url_to_search)

In [None]:
# Builds the nested, JSON-serialisable structure consumed by the reporting
# cells further down (it is called right after this definition).
def get_json_data(links: pd.DataFrame, base_url):
  """Scrape every dated FIFA 20 snapshot of every team.

  Args:
    links: DataFrame with the columns ``teams``, ``links`` and ``final_url``
      (the shape returned by ``get_links``).
    base_url: Site root that the relative snapshot links are appended to.

  Returns:
    A dict keyed by team name. Each value is a dict with four parallel
    lists: ``Dates`` (snapshot labels), ``links`` (absolute snapshot URLs),
    ``Player`` (one list of player names per snapshot) and ``Score`` (one
    list of integer scores per snapshot).
  """
  # Each team is a key in the JSON with its dates, links, players and scores.
  dates = {
      str(team_name): {'Dates': [], 'links': [], 'Player': [], 'Score': []}
      for team_name in links.teams
  }

  for _, team in links.iterrows():
    team_entry = dates[str(team['teams'])]
    snapshot_prefix = str(team['links']) + 'fifa20'

    # The team page carries a dropdown with one link per dated snapshot.
    team_page = BeautifulSoup(requests.get(team['final_url']).content, 'html.parser')
    for anchor in team_page.find_all('a', class_='dropdown-item', href=True):
      if anchor['href'].startswith(snapshot_prefix):
        team_entry['Dates'].append(anchor.text)
        team_entry['links'].append(base_url + str(anchor['href']))

    for snapshot_url in team_entry['links']:
      snapshot_page = BeautifulSoup(requests.get(snapshot_url).content, "html.parser")

      # Name of each player listed in this snapshot.
      name_cells = snapshot_page.find_all("td", {"data-title": "Name"})
      team_entry['Player'].append([cell.text for cell in name_cells])

      # Score of each player: the cell text minus its last two characters.
      # Presumably that drops the POT half of "OVR / POT" — TODO confirm
      # against the page markup.
      score_cells = snapshot_page.find_all("td", {"data-title": "OVR / POT"})
      team_entry['Score'].append([int(cell.text[:-2]) for cell in score_cells])

  return dates
      


In [None]:
# Runs the full scrape: one request per team page plus one per dated snapshot.
dates = get_json_data(df_teams_links, base_url)

### **Saving the Data and Preparing It for Reporting**

In [None]:
def save_json_file(path, file_name, data=None):
  """Write scraped data to ``path/file_name`` as JSON.

  Args:
    path: Directory to write into.
    file_name: Name of the JSON file to create (overwritten if it exists).
    data: JSON-serialisable object to store. When omitted, the notebook-level
      ``dates`` variable is written, which is what this function always did
      before the parameter existed.
  """
  if data is None:
    # Backward-compatible fallback to the global produced by the scrape cell.
    data = dates
  with open(os.path.join(path, file_name), 'w') as f:
    json.dump(data, f)

def import_json_dates(path):
  """Load previously saved scrape data from a JSON file.

  Args:
    path: Path of the JSON file to read.

  Returns:
    The decoded JSON content. (The loaded object used to be bound to a local
    name and dropped, so the function always returned None.)
  """
  with open(path) as data:
    return json.load(data)

In [None]:
# Reload a previously saved scrape from disk; this rebinds `dates`, replacing
# whatever the scrape cell above produced.
with open('/content/data (1).json') as j:
  dates =json.load(j)

In [None]:
# Order of teams in json
# 1. 'Manchester City',
# 2. 'Liverpool',
# 3. 'Tottenham Hotspur',
# 4. 'Manchester United',
# 5. 'Chelsea'
# 6. 'Arsenal',
# 7. 'Leicester City',
# 8. 'West Ham United',
# 9. 'Everton',
# 10. 'Wolverhampton Wanderers',
# 11. 'Watford',
# 12. 'Newcastle United',
# 13. 'Crystal Palace'
# 14. 'AFC Bournemouth'
# 15. 'Burnley'
# 16. 'Aston Villa',
# 17. 'Southampton'
# 18. 'Brighton & Hove Albion'
# 19. 'Sheffield United',
# 20. 'Norwich City'

In [None]:
#getting the average for each date for each team
def get_timeseries_table(dates, df_links:pd.DataFrame, reference_team='Manchester United'):
  """Build a date-indexed table of average squad score per team.

  Args:
    dates: Dict keyed by team name; each value holds a ``Dates`` list and a
      parallel ``Score`` list (one list of player scores per date).
    df_links: DataFrame whose ``teams`` column names the teams to include,
      in column order.
    reference_team: Team whose ``Dates`` define the rows of the table.

  Returns:
    A float DataFrame indexed by a DatetimeIndex named ``Date``, with one
    column per team (mean score rounded to 5 decimals) and a final
    ``global_avg`` column holding the row-wise mean across teams.

  Raises:
    KeyError: If ``reference_team`` or a team from ``df_links`` is missing
      from ``dates``.
  """
  # Create the time-series index from the reference team's snapshot dates.
  reference_index = pd.to_datetime(dates[reference_team]['Dates']).rename('Date')

  # Populate one column per team.
  columns = {}
  for team in df_links.teams:
    team = str(team)
    team_dates = dates[team]['Dates']
    team_scores = dates[team]['Score']
    paired = min(len(team_dates), len(team_scores))

    # An empty squad list yields NaN rather than a ZeroDivisionError.
    averages = [
        round(sum(scores) / len(scores), 5) if scores else float('nan')
        for scores in team_scores[:paired]
    ]
    series = pd.Series(averages, index=pd.to_datetime(team_dates[:paired]), dtype='float64')
    # Reindexing needs unique labels; keep the last value for a repeated date.
    series = series[~series.index.duplicated(keep='last')]
    # Align on the team's own dates instead of assuming that position i of
    # every team is the same date as position i of the reference team.
    columns[team] = series.reindex(reference_index).to_numpy()

  df_timeseries = pd.DataFrame(columns, index=reference_index)
  df_timeseries['global_avg'] = df_timeseries.mean(axis=1)

  return df_timeseries

In [None]:
# Average score per team for each date, plus a 'global_avg' column.
df_timeseries = get_timeseries_table(dates, df_teams_links)

### **Generating the Report**

In [None]:
#generating plot for all teams over time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def generate_report(df_timeseries:pd.DataFrame, path, file_name):
  """Render the three report figures into a single PDF.

  The pages are, in order: (1) every team's average score over time with the
  global average as a dashed black line, (2) a bar chart of each column's
  mean over the whole period with the global average highlighted in red,
  (3) one small time-series panel per column.

  Args:
    df_timeseries: Date-indexed frame with one column per team and the
      global average as its LAST column, named ``global_avg``.
    path: Directory the report is written to.
    file_name: File name of the resulting PDF.

  Returns:
    None. A confirmation message is printed once the file is written.
  """
  output_path = os.path.join(path, file_name)
  figures = []

  try:
    # Page 1: all teams over time, global average on top.
    fig1, ax = plt.subplots(1, figsize=(20,15))
    figures.append(fig1)
    ax.plot(df_timeseries.iloc[:,:-1])
    ax.plot(df_timeseries.iloc[:,-1:], linewidth=4.0, color='k', linestyle='dashed')  # global average line
    ax.yaxis.set_major_locator(ticker.MultipleLocator(2))
    ax.set_ylim(bottom=65, top=83)
    ax.set_ylabel('Score')
    ax.set_title('Average score over time')
    ax.legend(df_timeseries.columns, loc='upper right')

    # Page 2: mean of every column over the period, highest first.
    period_means = df_timeseries.mean(axis=0).sort_values(ascending=False)
    labels = list(period_means.index)
    fig2, ax = plt.subplots(1, figsize=(15,10))
    figures.append(fig2)
    bars = ax.bar(labels, period_means.to_numpy())
    ax.tick_params(axis='x', labelrotation=90)
    # Highlight the average by name: its rank depends on the data, so a
    # fixed bar index colours the wrong bar (or raises IndexError).
    for bar, label in zip(bars, labels):
      if label == 'global_avg':
        bar.set_color('r')

    # Page 3: one panel per column.
    axes = df_timeseries.plot(alpha=1, linestyle='solid', figsize=(25, 35), subplots=True)
    figures.append(plt.gcf())  # the figure DataFrame.plot just created
    for ax in axes:
      ax.set_ylabel('Score')
      ax.set_ylim(bottom=65, top=80)
      ax.yaxis.set_major_locator(ticker.MultipleLocator(3))

    # PdfPages stores one figure per page, so everything goes straight into
    # the final file: no temporary PDFs in the working directory and no
    # dependency on PyPDF2's deprecated PdfFileMerger.
    with PdfPages(output_path) as pdf:
      for fig in figures:
        pdf.savefig(fig)
  finally:
    # Release the figures even when plotting or saving fails.
    for fig in figures:
      plt.close(fig)

  print('Report Generated and Saved')



In [None]:
# Writes the report PDF to /content/sample_data/Report_PL_2020.pdf.
generate_report(df_timeseries, path='/content/sample_data', file_name='Report_PL_2020.pdf')

Report Generated and Saved
