## Get the 2018 World Cup results and add them to the original results file provided by the instructors

In [15]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import pandas as pd
import pandasql as ps
import numpy as np
from datetime import datetime

## 1 - Get the 2018 World Cup results from FIFA.com

### 1.1 Group stage results

In [None]:
# Scrape the group-stage match results from the FIFA World Cup match list page
# and collect them into wc_results_df (one row per match).

# NOTE: a URL fragment ("#knockoutphase") is handled client-side only and is
# not part of the HTTP request, so this fetches the complete match list page.
wc_results_url = "https://www.fifa.com/worldcup/matches/#knockoutphase"


try:
    page_response = requests.get(wc_results_url, timeout=5)

    if page_response.status_code == 200:
        page_content = BeautifulSoup(page_response.content, 'lxml')
        # find() returns the FIRST matching container; presumably that is the
        # group-stage list on this page -- TODO confirm against the live markup.
        wc_results = page_content.find('div', attrs={'class': "fi-matchlist"})
        if wc_results is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError('Match list container not found; the FIFA page layout may have changed')

        dates = [s['data-utcdate'] for s in wc_results.select('.fi-mu__info__datetime')]

        # Score text has the form "<home>-<away>"; parse every score node once
        # and split it into its two halves (kept as strings, as scraped).
        score_pairs = [s.get_text().strip().split('-') for s in wc_results.select('.fi-s__scoreText')]
        home_scores = [pair[0] for pair in score_pairs]
        away_scores = [pair[1] for pair in score_pairs]

        # The first text line of each team node is used as the team name.
        away_teams = [s.get_text().strip().split('\n')[0] for s in wc_results.find_all('div', attrs={'class': 'fi-t fi-i--4 away'})]
        home_teams = [s.get_text().strip().split('\n')[0] for s in wc_results.find_all('div', attrs={'class': 'fi-t fi-i--4 home'})]
        venues = [v.get_text().strip() for v in wc_results.find_all('div', attrs={'class': 'fi__info__venue'})]

        # Column names mirror the international results file; tournament and
        # country are constants that pandas broadcasts to every row.
        wc_results_df = pd.DataFrame({
            "date": dates,
            "home_team": home_teams,
            "away_team": away_teams,
            "home_score": home_scores,
            "away_score": away_scores,
            "tournament": "FIFA World Cup",
            "city": venues,
            "country": "Russia",
        })

    else:
        # Non-200 response: report the status code; no frame is built.
        print(page_response.status_code)

except requests.Timeout as e:
    # Report the URL that was actually requested (the previous handler used an
    # undefined name and raised NameError instead of printing this message).
    print('Timeout occurred for requested page: ' + wc_results_url)
    print(str(e))

In [None]:
# Display the group-stage frame built by the scrape cell for a visual check.
wc_results_df

### 1.2 Knockout stage results

In [None]:
# Scrape the knockout-stage match results from the FIFA World Cup match list
# page and collect them into wc_results2_df (one row per match).

# NOTE: the "#knockoutphase" fragment is handled client-side only; the
# knockout section is picked out of the fetched page via its data-tab below.
wc_results_url = "https://www.fifa.com/worldcup/matches/#knockoutphase"


try:
    page_response = requests.get(wc_results_url, timeout=5)

    if page_response.status_code == 200:
        page_content = BeautifulSoup(page_response.content, 'lxml')

        # Restrict parsing to the container tagged as the knockout-phase tab.
        wc_results = page_content.find('div', attrs={'data-tab': 'knockoutphase'})
        if wc_results is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError('Knockout phase container not found; the FIFA page layout may have changed')

        dates = [s['data-utcdate'] for s in wc_results.select('.fi-mu__info__datetime')]

        # Score text has the form "<home>-<away>"; parse every score node once
        # and split it into its two halves (kept as strings, as scraped).
        score_pairs = [s.get_text().strip().split('-') for s in wc_results.select('.fi-s__scoreText')]
        home_scores = [pair[0] for pair in score_pairs]
        away_scores = [pair[1] for pair in score_pairs]

        # The first text line of each team node is used as the team name.
        away_teams = [s.get_text().strip().split('\n')[0] for s in wc_results.find_all('div', attrs={'class': 'fi-t fi-i--4 away'})]
        home_teams = [s.get_text().strip().split('\n')[0] for s in wc_results.find_all('div', attrs={'class': 'fi-t fi-i--4 home'})]
        venues = [v.get_text().strip() for v in wc_results.find_all('div', attrs={'class': 'fi__info__venue'})]

        # Column names mirror the international results file; tournament and
        # country are constants that pandas broadcasts to every row.
        wc_results2_df = pd.DataFrame({
            "date": dates,
            "home_team": home_teams,
            "away_team": away_teams,
            "home_score": home_scores,
            "away_score": away_scores,
            "tournament": "FIFA World Cup",
            "city": venues,
            "country": "Russia",
        })

    else:
        # Non-200 response: report the status code; no frame is built.
        print(page_response.status_code)

except requests.Timeout as e:
    # Report the URL that was actually requested (the previous handler used an
    # undefined name and raised NameError instead of printing this message).
    print('Timeout occurred for requested page: ' + wc_results_url)
    print(str(e))

In [None]:
# Display the knockout-stage frame built by the scrape cell for a visual check.
wc_results2_df

### 1.3 - Concatenate group and knockout stage

In [None]:
# Stack the knockout-stage rows below the group-stage rows. Each frame keeps
# its own index values here, so labels repeat until the index is reset.
wc_results_df = pd.concat(objs=[wc_results_df, wc_results2_df], axis=0)

In [None]:
# Display the combined group + knockout frame.
wc_results_df

### 1.4 - Add missing columns and clean up

In [None]:
# A match is flagged neutral whenever the home team differs from the host
# country recorded in the 'country' column.
wc_results_df['neutral'] = wc_results_df['home_team'].ne(wc_results_df['country'])

In [None]:
# Convert the scraped 'data-utcdate' strings to tz-naive datetimes truncated to
# midnight. A unit-less astype('datetime64') is rejected by pandas >= 2.0, so
# the strings are parsed explicitly: utc=True accepts values with or without a
# "Z"/offset suffix, and tz_localize(None) drops the zone to keep naive UTC.
wc_results_df['date'] = (
    pd.to_datetime(wc_results_df['date'], utc=True)
    .dt.tz_localize(None)
    .dt.normalize()
)

In [None]:
# Replace the repeated per-stage index labels left by the concat with a fresh
# 0..n-1 index; the old labels are discarded rather than kept as a column.
wc_results_df = wc_results_df.reset_index(drop=True)

In [None]:
# Display the cleaned frame after the index reset.
wc_results_df

In [None]:
# Print column dtypes and non-null counts to verify the clean-up steps.
wc_results_df.info()

In [None]:
# back up raw data (in case FIFA web site changes)
# Section 2 reloads the backup from 'data/team/wc_results_2018.csv', so write it
# there directly instead of into the working directory.
from pathlib import Path

backup_path = Path('data/team/wc_results_2018.csv')
# Create the target folder on first use so the write cannot fail on a missing directory.
backup_path.parent.mkdir(parents=True, exist_ok=True)
# The index is written as well; the loader removes it again as 'Unnamed: 0'.
wc_results_df.to_csv(backup_path)

## 2 - Combine with original international results file

In [23]:
# Load the historical international results supplied with the project.
intl_results = pd.read_csv('data/fifa/international_results.csv')
# Load the saved World Cup scrape and discard the 'Unnamed: 0' column
# (presumably the index that was written when the backup CSV was saved).
wc_results_df = pd.read_csv('data/team/wc_results_2018.csv').drop(columns='Unnamed: 0')
#wc_results_df.tail(64)

In [24]:
# Rename 'IR Iran' to 'Iran' in both team columns
# (presumably to match the naming used in the historical file -- confirm).
for team_column in ('home_team', 'away_team'):
    wc_results_df[team_column] = wc_results_df[team_column].replace({'IR Iran': 'Iran'})

# Inspect the historical frame's dtypes before the date conversion.
intl_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39638 entries, 0 to 39637
Data columns (total 9 columns):
date          39638 non-null object
home_team     39638 non-null object
away_team     39638 non-null object
home_score    39638 non-null int64
away_score    39638 non-null int64
tournament    39638 non-null object
city          39638 non-null object
country       39638 non-null object
neutral       39638 non-null bool
dtypes: bool(1), int64(2), object(6)
memory usage: 2.5+ MB


In [25]:
#wc_results_df.tail(64)

In [26]:
#wc_results_df.head()

In [27]:
# Parse the 'date' strings into datetime64[ns] so they can be compared against
# a datetime cutoff. A unit-less astype('datetime64') is rejected by
# pandas >= 2.0; pd.to_datetime is the supported way to parse string dates.
intl_results['date'] = pd.to_datetime(intl_results['date'])

In [28]:
# Re-check the dtypes: 'date' should now be reported as datetime64[ns].
intl_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39638 entries, 0 to 39637
Data columns (total 9 columns):
date          39638 non-null datetime64[ns]
home_team     39638 non-null object
away_team     39638 non-null object
home_score    39638 non-null int64
away_score    39638 non-null int64
tournament    39638 non-null object
city          39638 non-null object
country       39638 non-null object
neutral       39638 non-null bool
dtypes: bool(1), datetime64[ns](1), int64(2), object(5)
memory usage: 2.5+ MB


In [29]:

# Keep only rows dated before 2018-06-14 (presumably so matches that the World
# Cup scrape also covers are not duplicated when the frames are combined).
cutoff_date = datetime.strptime('2018-06-14', '%Y-%m-%d')
played_before_cutoff = intl_results['date'] < cutoff_date
# Also restrict the frame to its first nine columns.
intl_results = intl_results.loc[played_before_cutoff].iloc[:, :9]
intl_results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [30]:
# Show the last rows to confirm nothing dated 2018-06-14 or later remains.
intl_results.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
39601,2018-06-10,Austria,Brazil,0,3,Friendly,Vienna,Austria,False
39602,2018-06-11,Korea Republic,Senegal,0,2,Friendly,Grödig,Austria,True
39603,2018-06-11,Belgium,Costa Rica,4,1,Friendly,Brussels,Belgium,False
39604,2018-06-12,Japan,Paraguay,4,2,Friendly,Innsbruck,Austria,True
39605,2018-06-12,Poland,Lithuania,4,0,Friendly,Warsaw,Poland,False


In [31]:
# Append the World Cup rows to the historical results.
# wc_results_df comes back from CSV with 'date' as plain strings while
# intl_results holds datetimes; concatenating them as-is yields a mixed
# object column (Timestamps and strings). Parse 'date' on both frames first so
# the combined column has a single datetime dtype; assign() returns copies, so
# the input frames themselves are left untouched.
frames_with_parsed_dates = [
    frame.assign(date=pd.to_datetime(frame['date']))
    for frame in (intl_results, wc_results_df)
]
full_results = pd.concat(frames_with_parsed_dates)

In [32]:
# Show the first rows of the combined frame (oldest historical matches).
full_results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30 00:00:00,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08 00:00:00,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07 00:00:00,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06 00:00:00,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04 00:00:00,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [33]:
# Show the last rows of the combined frame; these should be the appended World Cup matches.
full_results.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
59,2018-07-07,Russia,Croatia,2,2,FIFA World Cup,Sochi,Russia,False
60,2018-07-10,France,Belgium,1,0,FIFA World Cup,St. Petersburg,Russia,True
61,2018-07-11,Croatia,England,2,1,FIFA World Cup,Moscow,Russia,True
62,2018-07-14,Belgium,England,2,0,FIFA World Cup,St. Petersburg,Russia,True
63,2018-07-15,France,Croatia,4,2,FIFA World Cup,Moscow,Russia,True


In [34]:
# Use the match date as the row index; the 'date' column itself is removed
# from the data columns once it becomes the index.
full_results = full_results.set_index('date', drop=True)

In [35]:
# Confirm that 'date' is now the index of the combined frame.
full_results.head()

Unnamed: 0_level_0,home_team,away_team,home_score,away_score,tournament,city,country,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1872-11-30 00:00:00,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1873-03-08 00:00:00,England,Scotland,4,2,Friendly,London,England,False
1874-03-07 00:00:00,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
1875-03-06 00:00:00,England,Scotland,2,2,Friendly,London,England,False
1876-03-04 00:00:00,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [36]:
# Persist the combined results. 'date' is the index at this point, and to_csv
# writes the index by default, so it ends up as the first column of the file.
full_results.to_csv('data/team/new_intl_results.csv')