In [1]:
# Import our dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
#load data
history = pd.read_csv('../Data/results.csv')

In [3]:
history.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [4]:
#add a winning_team column to show which team won
winner = []
for i in range (len(history['home_team'])):
    if history['home_score'][i] > history['away_score'][i]:
        winner.append(history['home_team'][i])
    elif history['home_score'][i] < history['away_score'][i]:
        winner.append(history['away_team'][i])
    else:
        winner.append('draw')
history['winning_team']=winner
history

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland
...,...,...,...,...,...,...,...,...,...,...
43747,2022-06-14,Moldova,Andorra,2,1,UEFA Nations League,Chișinău,Moldova,False,Moldova
43748,2022-06-14,Liechtenstein,Latvia,0,2,UEFA Nations League,Vaduz,Liechtenstein,False,Latvia
43749,2022-06-14,Chile,Ghana,0,0,Kirin Cup,Suita,Japan,True,draw
43750,2022-06-14,Japan,Tunisia,0,3,Kirin Cup,Suita,Japan,False,Tunisia


In [5]:
#add a goal difference column
history['goal_difference']= np.absolute(history['home_score']-history['away_score'])

history.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England,2
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland,1
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland,3


In [6]:
#limit the dataset to the relevant teams playing in the 2022 world cup
teams_2022 = ['Qatar', 'Netherlands', 'Senegal', 'Ecuador', 
            'England', 'United States', 'Wales', 'Iran', 
            'Argentina', 'Poland', 'Mexico', 'Saudi Arabia', 
            'France', 'Denmark', 'Tunisia', 'Australia', 
            'Germany', 'Spain', 'Japan', 'Costa Rica', 
            'Belgium', 'Croatia', 'Canada', 'Morocco', 
            'Brazil', 'Switzerland', 'Serbia', 'Cameroon', 
            'Portugal', 'Uruguay', 'Ghana', 'South Korea']


In [7]:
#only need data on relevant teams
history = history[(history['home_team'].isin(teams_2022)) | (history['away_team'].isin(teams_2022))]
history

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England,2
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland,1
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland,3
...,...,...,...,...,...,...,...,...,...,...,...
43739,2022-06-14,Netherlands,Wales,3,2,UEFA Nations League,Rotterdam,Netherlands,False,Netherlands,1
43740,2022-06-14,Poland,Belgium,0,1,UEFA Nations League,Warsaw,Poland,False,Belgium,1
43749,2022-06-14,Chile,Ghana,0,0,Kirin Cup,Suita,Japan,True,draw,0
43750,2022-06-14,Japan,Tunisia,0,3,Kirin Cup,Suita,Japan,False,Tunisia,3


In [8]:
#create a year column
year = []
for row in history['date']:
    year.append(int(row[:4]))
history['year'] = year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  history['year'] = year


In [9]:
#drop games before 1930
history = history[history.year >= 1930]
history.head(3000)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference,year
1266,1930-01-01,Spain,Czechoslovakia,1,0,Friendly,Barcelona,Spain,False,Spain,1,1930
1267,1930-01-12,Portugal,Czechoslovakia,1,0,Friendly,Lisbon,Portugal,False,Portugal,1,1930
1269,1930-02-01,Northern Ireland,Wales,7,0,British Championship,Belfast,Northern Ireland,False,Northern Ireland,7,1930
1270,1930-02-09,Italy,Switzerland,4,2,Friendly,Rome,Italy,False,Italy,2,1930
1273,1930-02-23,Portugal,France,2,0,Friendly,Porto,Portugal,False,Portugal,2,1930
...,...,...,...,...,...,...,...,...,...,...,...,...
7047,1968-12-01,Cameroon,Tunisia,4,2,Friendly,Douala,Cameroon,False,Cameroon,2,1968
7048,1968-12-01,Costa Rica,Jamaica,3,1,FIFA World Cup qualification,San José,Costa Rica,False,Costa Rica,2,1968
7050,1968-12-01,Ghana,Senegal,2,0,Friendly,Accra,Ghana,False,Ghana,2,1968
7053,1968-12-04,Chile,Argentina,2,1,Copa Carlos Dittborn,Santiago,Chile,False,Chile,1,1968


In [10]:
#history = history[history.'tournament' == "FIFA World Cup qualification"| (history.'away_team']== "FIFA World Cup")]
wc = ["FIFA World Cup qualification", "FIFA World Cup"]
wc_matches = history[(history['tournament'].isin(wc)) | (history['tournament'].isin(wc))]
wc_matches

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference,year
1314,1930-07-13,Belgium,United States,0,3,FIFA World Cup,Montevideo,Uruguay,True,United States,3,1930
1315,1930-07-13,France,Mexico,4,1,FIFA World Cup,Montevideo,Uruguay,True,France,3,1930
1316,1930-07-14,Brazil,Yugoslavia,1,2,FIFA World Cup,Montevideo,Uruguay,True,Yugoslavia,1,1930
1318,1930-07-15,Argentina,France,1,0,FIFA World Cup,Montevideo,Uruguay,True,Argentina,1,1930
1319,1930-07-16,Chile,Mexico,3,0,FIFA World Cup,Montevideo,Uruguay,True,Chile,3,1930
...,...,...,...,...,...,...,...,...,...,...,...,...
43441,2022-03-30,Costa Rica,United States,2,0,FIFA World Cup qualification,San José,Costa Rica,False,Costa Rica,2,2022
43442,2022-03-30,Panama,Canada,1,0,FIFA World Cup qualification,Panama City,Panama,False,Panama,1,2022
43538,2022-06-05,Wales,Ukraine,1,0,FIFA World Cup qualification,Cardiff,Wales,False,Wales,1,2022
43704,2022-06-13,Australia,Peru,0,0,FIFA World Cup qualification,Al Rayyan,Qatar,True,draw,0,2022


In [11]:
# saving to a csv
wc_matches.to_csv("wc_matches.csv")