In [2]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import pandas as pd
import pandasql as ps
from datetime import datetime

In [3]:
from IPython.display import display, HTML

# Get historical FIFA rankings

## 1 - Get ranking dates

### 1.1 Get ranking dates from fifa.com and store in a csv

In [4]:
# Scrape the FIFA men's ranking landing page for the list of all published
# ranking snapshots: display date, relative URL and rank id of each one.

fifa_rankings_url = "https://www.fifa.com/fifa-world-ranking/ranking-table/men/"

try:
    page_response = requests.get(fifa_rankings_url, timeout=5)

    if page_response.status_code == 200:
        page_content = BeautifulSoup(page_response.content, 'lxml')
        ranking_dates = page_content.find('ul', attrs={'class': "fi-ranking-schedule__nav dropdown-menu"})

        # Walk the anchors once: each one carries both the date label and the URL.
        date_links = ranking_dates.find_all('a', href=True)
        dates = [link.get_text() for link in date_links]
        urls = [link['href'] for link in date_links]
        # The rank id is read from the data-value attribute of each <li>.
        rank_ids = [item['data-value'] for item in ranking_dates.find_all('li')]

        rank_dates = pd.DataFrame({
            "date": dates,
            "url": urls,
            "rank_id": rank_ids
        })

    else:
        # rank_dates is deliberately left undefined here so the following
        # cells fail loudly instead of overwriting the CSV backup with nothing.
        print(page_response.status_code)

except requests.Timeout as e:
    # Report the URL that was actually requested; the previous code referenced
    # an undefined name here and raised NameError instead of printing this.
    print('Timeout occurred for requested page: ' + fifa_rankings_url)
    print(str(e))

In [5]:
#rank_dates.head()

In [6]:
# Keep a local snapshot of the scraped dates so the notebook can still be
# rebuilt if the FIFA web site changes.
rank_dates.to_csv(path_or_buf='rank_dates.csv')

### 1.2 Clean up and augment ranking dates data

In [7]:
# Parse the display date (e.g. "26 May 2010") into a timestamp column.
parsed_dates = pd.to_datetime(rank_dates['date'], format='%d %B %Y')
rank_dates['from_date'] = parsed_dates

In [8]:
#rank_dates.head()

In [9]:
# Order chronologically: the expiration date of each row is derived from the
# row that follows it.
rank_dates = rank_dates.sort_values(by='from_date')

In [10]:
#rank_dates.head()

In [11]:
# A ranking stays valid until the next one is published, so take the
# following row's from_date as this row's to_date (last row gets NaT).
rank_dates['to_date'] = rank_dates['from_date'].shift(periods=-1)

In [12]:
#rank_dates.head()

In [13]:
# End each ranking period the day before the next one starts so the date
# ranges don't overlap. The value is recomputed from from_date (instead of
# decrementing to_date in place) so re-running this cell does not move the
# dates back by another day. Relies on rank_dates being sorted by from_date.
rank_dates['to_date'] = rank_dates['from_date'].shift(-1) - pd.Timedelta(days=1)

In [14]:
# Spot-check the earliest ranking periods (frame is sorted by from_date).
rank_dates.iloc[:5]

Unnamed: 0,date,url,rank_id,from_date,to_date
82,26 May 2010,/fifa-world-ranking/ranking-table/men/rank/id9...,id9276,2010-05-26,2010-08-10
81,11 August 2010,/fifa-world-ranking/ranking-table/men/rank/id9...,id9353,2010-08-11,2010-11-16
80,17 November 2010,/fifa-world-ranking/ranking-table/men/rank/id9...,id9451,2010-11-17,2011-01-11
79,12 January 2011,/fifa-world-ranking/ranking-table/men/rank/id9...,id9507,2011-01-12,2011-05-17
78,18 May 2011,/fifa-world-ranking/ranking-table/men/rank/id9...,id9633,2011-05-18,2011-08-23


In [15]:
# Spot-check the latest ranking periods; the last row has no to_date yet.
rank_dates.iloc[-5:]

Unnamed: 0,date,url,rank_id,from_date,to_date
4,17 May 2018,/fifa-world-ranking/ranking-table/men/rank/id1...,id12189,2018-05-17,2018-06-06
3,07 June 2018,/fifa-world-ranking/ranking-table/men/rank/id1...,id12210,2018-06-07,2018-08-15
2,16 August 2018,/fifa-world-ranking/ranking-table/men/rank/id1...,id12280,2018-08-16,2018-09-19
1,20 September 2018,/fifa-world-ranking/ranking-table/men/rank/id1...,id12315,2018-09-20,2018-10-24
0,25 October 2018,/fifa-world-ranking/ranking-table/men/rank/id1...,id12350,2018-10-25,NaT


In [16]:
# Put the identifier and the validity range first, raw scraped fields last.
rank_dates_cols = ['rank_id', 'from_date', 'to_date', 'date', 'url']
rank_dates = rank_dates.loc[:, rank_dates_cols]

In [17]:
# Preview the frame after reordering the columns.
rank_dates.head(n=5)

Unnamed: 0,rank_id,from_date,to_date,date,url
82,id9276,2010-05-26,2010-08-10,26 May 2010,/fifa-world-ranking/ranking-table/men/rank/id9...
81,id9353,2010-08-11,2010-11-16,11 August 2010,/fifa-world-ranking/ranking-table/men/rank/id9...
80,id9451,2010-11-17,2011-01-11,17 November 2010,/fifa-world-ranking/ranking-table/men/rank/id9...
79,id9507,2011-01-12,2011-05-17,12 January 2011,/fifa-world-ranking/ranking-table/men/rank/id9...
78,id9633,2011-05-18,2011-08-23,18 May 2011,/fifa-world-ranking/ranking-table/men/rank/id9...


## 2 - Get rankings for all historical dates

In [18]:
# this function gets rankings from FIFA.com for a given rank_id (obtained in #1 above)
def get_fifa_rankings(rank_id):
    """Fetch one FIFA men's ranking table from fifa.com.

    Parameters
    ----------
    rank_id : str
        Identifier of the ranking snapshot; it is appended to the
        ranking-table base URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``rank_id``, ``team``, ``team_abbr``, ``rank`` and ``points``
        holding the scraped text, one row per team. An empty frame with the
        same columns is returned when the request times out or the server
        answers with a non-200 status (the problem is printed in both cases).
    """
    fifa_rankings_url = "https://www.fifa.com/fifa-world-ranking/ranking-table/men/rank/" + rank_id

    # Fallback result: without it the final `return` raised UnboundLocalError
    # whenever the page could not be fetched.
    rank_table = pd.DataFrame(columns=["rank_id", "team", "team_abbr", "rank", "points"])

    try:
        page_response = requests.get(fifa_rankings_url, timeout=5)

        if page_response.status_code == 200:
            page_content = BeautifulSoup(page_response.content, 'lxml')
            standings_table = page_content.find('table', attrs={'id': "rank-table"}).find('tbody')

            def cell_texts(css_selector):
                # Text of every element in the table body matching the selector.
                return [cell.get_text() for cell in standings_table.select(css_selector)]

            rank_table = pd.DataFrame({
                "rank_id": rank_id,
                "team": cell_texts('.fi-t__nText'),
                "team_abbr": cell_texts('.fi-t__nTri'),
                "rank": cell_texts('.fi-table__rank'),
                "points": cell_texts('.fi-table__points')
            })

        else:
            print(page_response.status_code)

    except requests.Timeout as e:
        # Use the URL that was actually requested; the previous code referenced
        # an undefined name here and raised NameError.
        print('Timeout occurred for requested page: ' + fifa_rankings_url)
        print(str(e))

    return rank_table

In [19]:
# test
#get_fifa_rankings('id12315')

In [22]:
# takes a couple of minutes to run - avoid running if possible and use the saved data in data/team/all_rankings.csv instead
def get_all_fifa_rankings():
    """Download the ranking table for every snapshot listed in ``rank_dates``.

    Reads the notebook-level ``rank_dates`` frame, calls ``get_fifa_rankings``
    once per ``rank_id`` (printing progress for each one) and stacks the
    results.

    Returns
    -------
    pandas.DataFrame
        All ranking tables concatenated with a fresh 0..n-1 index. An empty
        frame is returned when ``rank_dates`` has no rows.
    """
    total = len(rank_dates)
    tables = []

    for done, rank_id in enumerate(rank_dates['rank_id']):
        print('{} rows to go....'.format(total - done))
        print(rank_id)
        tables.append(get_fifa_rankings(rank_id))

    if not tables:
        # pd.concat refuses an empty list; keep the "empty frame" result.
        return pd.DataFrame()

    # Concatenate once (instead of growing a frame inside the loop) and
    # renumber the rows; the previous reset_index call discarded its result,
    # so the returned frame kept each page's own 0..k index.
    return pd.concat(tables, ignore_index=True)
    

In [23]:
# comment out to avoid inadvertently rerunning (assign the result so it can be saved in the next cell)
#all_rankings = get_all_fifa_rankings()

In [24]:
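# store data to make sure we have a hard copy
# (note: the load cell below reads 'data/team/all_rankings.csv', so move the file there after saving)
#all_rankings.to_csv('all_rankings.csv')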

In [25]:
# Load the previously saved rankings rather than scraping them again.
all_rankings = pd.read_csv(filepath_or_buffer='data/team/all_rankings.csv')

In [None]:
# Preview the rankings as loaded from disk.
all_rankings.head(n=5)

In [None]:
# Index by rank_id so the rankings can be joined to rank_dates on that key.
all_rankings = all_rankings.set_index(keys='rank_id', drop=True)

In [None]:
# Remove the 'Unnamed: 0' column (presumably the index that was written
# along with the CSV). The result is assigned back; previously it was
# discarded, so the column was never dropped. errors='ignore' keeps the cell
# re-runnable once the column is gone.
all_rankings = all_rankings.drop(columns='Unnamed: 0', errors='ignore')
all_rankings.head()

In [None]:
# Attach each ranking period to its team rows: rank_dates.rank_id is matched
# against the rank_id index of all_rankings.
fifa_rankings = rank_dates.join(other=all_rankings, on='rank_id', how='inner')

In [None]:
# Preview the joined frame.
fifa_rankings.head(n=5)

In [None]:
# Keep only the columns wanted in the final data set, in output order.
fifa_rankings_cols = ['rank_id', 'from_date', 'to_date', 'team', 'team_abbr', 'rank', 'points']
fifa_rankings = fifa_rankings.loc[:, fifa_rankings_cols]

In [None]:
# Use rank_id as the index of the final data set.
fifa_rankings = fifa_rankings.set_index(keys='rank_id', drop=True)

In [None]:
# Preview the final data set before saving it.
fifa_rankings.head(n=5)

In [None]:

# Write the final ranking data set (indexed by rank_id) to disk.
fifa_rankings.to_csv(path_or_buf='fifa_rankings.csv')