# Merge international results with new data

In [1]:
import pandasql as ps
from datetime import datetime
from pandasql import sqldf
import numpy as np
import pandas as pd


## 1 - Merge results and rankings

In [28]:
fifa_rankings = pd.read_csv('data/team/fifa_rankings.csv')
fifa_rankings.head()

Unnamed: 0,rank_id,from_date,to_date,team,team_abbr,rank,points
0,id9276,2010-05-26,2010-08-10,Brazil,BRA,1,1611
1,id9276,2010-05-26,2010-08-10,Spain,ESP,2,1565
2,id9276,2010-05-26,2010-08-10,Portugal,POR,3,1249
3,id9276,2010-05-26,2010-08-10,Netherlands,NED,4,1231
4,id9276,2010-05-26,2010-08-10,Italy,ITA,5,1184


In [29]:
fifa_rankings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17365 entries, 0 to 17364
Data columns (total 7 columns):
rank_id      17365 non-null object
from_date    17365 non-null object
to_date      17155 non-null object
team         17365 non-null object
team_abbr    17365 non-null object
rank         17365 non-null int64
points       17365 non-null int64
dtypes: int64(2), object(5)
memory usage: 949.7+ KB


In [30]:
intl_results = pd.read_csv('data/team/new_intl_results.csv')
intl_results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [31]:
intl_results.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
33495,2018-07-07,Russia,Croatia,2,2,FIFA World Cup,Sochi,Russia,False
33496,2018-07-10,France,Belgium,1,0,FIFA World Cup,St. Petersburg,Russia,True
33497,2018-07-11,Croatia,England,2,1,FIFA World Cup,Moscow,Russia,True
33498,2018-07-14,Belgium,England,2,0,FIFA World Cup,St. Petersburg,Russia,True
33499,2018-07-15,France,Croatia,4,2,FIFA World Cup,Moscow,Russia,True


In [32]:
intl_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33500 entries, 0 to 33499
Data columns (total 9 columns):
date          33500 non-null object
home_team     33500 non-null object
away_team     33500 non-null object
home_score    33500 non-null int64
away_score    33500 non-null int64
tournament    33500 non-null object
city          33500 non-null object
country       33500 non-null object
neutral       33500 non-null bool
dtypes: bool(1), int64(2), object(6)
memory usage: 2.1+ MB


In [33]:
# convert date to date
intl_results['date'] = intl_results.date.astype('datetime64')

In [34]:
intl_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33500 entries, 0 to 33499
Data columns (total 9 columns):
date          33500 non-null datetime64[ns]
home_team     33500 non-null object
away_team     33500 non-null object
home_score    33500 non-null int64
away_score    33500 non-null int64
tournament    33500 non-null object
city          33500 non-null object
country       33500 non-null object
neutral       33500 non-null bool
dtypes: bool(1), datetime64[ns](1), int64(2), object(5)
memory usage: 2.1+ MB


In [35]:
# write SQL to join dataframes using "between statement"

pysqldf = lambda q: sqldf(q, globals())
sql = 'select res.*, rnk1.rank home_rank, rnk1.points home_rank_pts, rnk2.rank away_rank, rnk2.points away_rank_pts\
       from intl_results res \
       inner join fifa_rankings rnk1 \
           on res.home_team = rnk1.team \
           and res.date between rnk1.from_date and rnk1.to_date \
       inner join fifa_rankings rnk2 \
           on res.away_team = rnk2.team \
           and res.date between rnk2.from_date and rnk2.to_date  ; '

In [36]:
project_df = pysqldf(sql)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,home_rank_pts,away_rank,away_rank_pts
0,2010-05-26 00:00:00.000000,Hungary,Italy,6,1,Friendly,Budapest,Hungary,0,57,565,5,1184
1,2010-05-27 00:00:00.000000,Argentina,Chile,3,1,Friendly,Buenos Aires,Argentina,0,7,1076,18,888
2,2010-05-29 00:00:00.000000,Chile,Uruguay,0,3,Friendly,Buenos Aires,Argentina,1,18,888,16,899
3,2010-06-05 00:00:00.000000,Argentina,Chile,5,1,Friendly,Buenos Aires,Argentina,0,7,1076,18,888
4,2010-06-12 00:00:00.000000,Argentina,Uruguay,4,1,Friendly,Buenos Aires,Argentina,0,7,1076,16,899


In [38]:
project_df.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,home_rank_pts,away_rank,away_rank_pts
6652,2018-07-07 00:00:00.000000,Russia,Croatia,2,2,FIFA World Cup,Sochi,Russia,0,70,457,20,945
6653,2018-07-10 00:00:00.000000,France,Belgium,1,0,FIFA World Cup,St. Petersburg,Russia,1,7,1198,3,1298
6654,2018-07-11 00:00:00.000000,Croatia,England,2,1,FIFA World Cup,Moscow,Russia,1,20,945,12,1051
6655,2018-07-14 00:00:00.000000,Belgium,England,2,0,FIFA World Cup,St. Petersburg,Russia,1,3,1298,12,1051
6656,2018-07-15 00:00:00.000000,France,Croatia,4,2,FIFA World Cup,Moscow,Russia,1,7,1198,20,945


In [26]:
project_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6657 entries, 0 to 6656
Data columns (total 13 columns):
date             6657 non-null object
home_team        6657 non-null object
away_team        6657 non-null object
home_score       6657 non-null int64
away_score       6657 non-null int64
tournament       6657 non-null object
city             6657 non-null object
country          6657 non-null object
neutral          6657 non-null int64
home_rank        6657 non-null int64
home_rank_pts    6657 non-null int64
away_rank        6657 non-null int64
away_rank_pts    6657 non-null int64
dtypes: int64(7), object(6)
memory usage: 676.2+ KB


## 2 - Add more columns

In [39]:
# add diff columns
project_df['rank_diff'] = project_df['home_rank'] - project_df['away_rank']
project_df['rank_pts_diff'] = project_df['home_rank_pts'] - project_df['away_rank_pts']

In [40]:
# add outcome columns
score_diff = project_df['home_score'] - project_df['away_score']

In [41]:
def get_match_outcome(score_diff):
    if score_diff > 0:
        outcome = 'W'
    elif score_diff < 0:
        outcome = 'L'
    else:
        outcome = 'D'
    return outcome

In [42]:
def get_match_outcome_num(score_diff):
    if score_diff > 0:
        outcome = 0
    elif score_diff < 0:
        outcome = 2
    else:
        outcome = 1
    return outcome

In [43]:
project_df['outcome'] = score_diff.apply(get_match_outcome)
project_df['outcome_num'] = score_diff.apply(get_match_outcome_num)

In [44]:
# add other fields for testing
project_df['home_game'] =  1 * (project_df['home_team'] == project_df['country'])
project_df['away_game'] = 1 * (project_df['away_team'] == project_df['country'])

In [45]:
project_df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,home_rank_pts,away_rank,away_rank_pts,rank_diff,rank_pts_diff,outcome,outcome_num,home_game,away_game
0,2010-05-26 00:00:00.000000,Hungary,Italy,6,1,Friendly,Budapest,Hungary,0,57,565,5,1184,52,-619,W,0,1,0
1,2010-05-27 00:00:00.000000,Argentina,Chile,3,1,Friendly,Buenos Aires,Argentina,0,7,1076,18,888,-11,188,W,0,1,0
2,2010-05-29 00:00:00.000000,Chile,Uruguay,0,3,Friendly,Buenos Aires,Argentina,1,18,888,16,899,2,-11,L,2,0,0
3,2010-06-05 00:00:00.000000,Argentina,Chile,5,1,Friendly,Buenos Aires,Argentina,0,7,1076,18,888,-11,188,W,0,1,0
4,2010-06-12 00:00:00.000000,Argentina,Uruguay,4,1,Friendly,Buenos Aires,Argentina,0,7,1076,16,899,-9,177,W,0,1,0


In [46]:
project_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6657 entries, 0 to 6656
Data columns (total 19 columns):
date             6657 non-null object
home_team        6657 non-null object
away_team        6657 non-null object
home_score       6657 non-null int64
away_score       6657 non-null int64
tournament       6657 non-null object
city             6657 non-null object
country          6657 non-null object
neutral          6657 non-null int64
home_rank        6657 non-null int64
home_rank_pts    6657 non-null int64
away_rank        6657 non-null int64
away_rank_pts    6657 non-null int64
rank_diff        6657 non-null int64
rank_pts_diff    6657 non-null int64
outcome          6657 non-null object
outcome_num      6657 non-null int64
home_game        6657 non-null int64
away_game        6657 non-null int64
dtypes: int64(12), object(7)
memory usage: 988.2+ KB


In [47]:
project_df.set_index('date',inplace=True,drop=True)

In [48]:
project_df

Unnamed: 0_level_0,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,home_rank_pts,away_rank,away_rank_pts,rank_diff,rank_pts_diff,outcome,outcome_num,home_game,away_game
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-05-26 00:00:00.000000,Hungary,Italy,6,1,Friendly,Budapest,Hungary,0,57,565,5,1184,52,-619,W,0,1,0
2010-05-27 00:00:00.000000,Argentina,Chile,3,1,Friendly,Buenos Aires,Argentina,0,7,1076,18,888,-11,188,W,0,1,0
2010-05-29 00:00:00.000000,Chile,Uruguay,0,3,Friendly,Buenos Aires,Argentina,1,18,888,16,899,2,-11,L,2,0,0
2010-06-05 00:00:00.000000,Argentina,Chile,5,1,Friendly,Buenos Aires,Argentina,0,7,1076,18,888,-11,188,W,0,1,0
2010-06-12 00:00:00.000000,Argentina,Uruguay,4,1,Friendly,Buenos Aires,Argentina,0,7,1076,16,899,-9,177,W,0,1,0
2010-08-15 00:00:00.000000,Uruguay,Argentina,3,1,Copa Lipton,Montevideo,Uruguay,0,6,1152,5,1288,1,-136,W,0,1,0
2010-09-11 00:00:00.000000,Chile,Argentina,0,3,Friendly,Viña del Mar,Chile,0,10,988,5,1288,5,-300,L,2,1,0
2010-09-11 00:00:00.000000,Norway,Sweden,0,4,Friendly,Kristiania,Norway,0,22,878,35,747,-13,131,L,2,1,0
2010-10-16 00:00:00.000000,Germany,Netherlands,1,2,Friendly,Kleve,Germany,0,4,1464,2,1659,2,-195,L,2,1,0
2010-11-06 00:00:00.000000,Hungary,Austria,3,0,Friendly,Budapest,Hungary,0,62,534,60,536,2,-2,W,0,1,0


In [49]:
project_df.to_csv('project_data.csv')