In [1]:
# Import our dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
#load data
history = pd.read_csv('data/results.csv')

In [3]:
history.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [4]:
#add a winning_team column to show which team won
winner = []
for i in range (len(history['home_team'])):
    if history['home_score'][i] > history['away_score'][i]:
        winner.append(history['home_team'][i])
    elif history['home_score'][i] < history['away_score'][i]:
        winner.append(history['away_team'][i])
    else:
        winner.append('draw')
history['winning_team']=winner
history

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland
...,...,...,...,...,...,...,...,...,...,...
43747,2022-06-14,Moldova,Andorra,2,1,UEFA Nations League,Chișinău,Moldova,False,Moldova
43748,2022-06-14,Liechtenstein,Latvia,0,2,UEFA Nations League,Vaduz,Liechtenstein,False,Latvia
43749,2022-06-14,Chile,Ghana,0,0,Kirin Cup,Suita,Japan,True,draw
43750,2022-06-14,Japan,Tunisia,0,3,Kirin Cup,Suita,Japan,False,Tunisia


In [5]:
#add a goal difference column
history['goal_difference']= np.absolute(history['home_score']-history['away_score'])

history.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England,2
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland,1
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland,3


In [12]:
#limit the dataset to the relevant teams playing in the 2022 world cup
teams_2022 = ['Qatar', 'Netherlands', 'Senegal', 'Ecuador', 
            'England', 'United States', 'Wales', 'Iran', 
            'Argentina', 'Poland', 'Mexico', 'Saudi Arabia', 
            'France', 'Denmark', 'Tunisia', 'Australia', 
            'Germany', 'Spain', 'Japan', 'Costa Rica', 
            'Belgium', 'Croatia', 'Canada', 'Morocco', 
            'Brazil', 'Switzerland', 'Serbia', 'Cameroon', 
            'Portugal', 'Uruguay', 'Ghana', 'South Korea']


In [7]:
#only need data on relevant teams
history = history[(history['home_team'].isin(teams_2022)) | (history['away_team'].isin(teams_2022))]
history

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England,2
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland,1
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland,3
...,...,...,...,...,...,...,...,...,...,...,...
43738,2022-06-14,Germany,Italy,5,2,UEFA Nations League,Mönchengladbach,Germany,False,Germany,3
43739,2022-06-14,Netherlands,Wales,3,2,UEFA Nations League,Rotterdam,Netherlands,False,Netherlands,1
43740,2022-06-14,Poland,Belgium,0,1,UEFA Nations League,Warsaw,Poland,False,Belgium,1
43749,2022-06-14,Chile,Ghana,0,0,Kirin Cup,Suita,Japan,True,draw,0


In [8]:
#create a year column
year = []
for row in history['date']:
    year.append(int(row[:4]))
history['year'] = year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  history['year'] = year


In [9]:
#drop games before 1930
history = history[history.year >= 1930]
history.head(3000)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference,year
1266,1930-01-01,Spain,Czechoslovakia,1,0,Friendly,Barcelona,Spain,False,Spain,1,1930
1267,1930-01-12,Portugal,Czechoslovakia,1,0,Friendly,Lisbon,Portugal,False,Portugal,1,1930
1269,1930-02-01,Northern Ireland,Wales,7,0,British Championship,Belfast,Northern Ireland,False,Northern Ireland,7,1930
1270,1930-02-09,Italy,Switzerland,4,2,Friendly,Rome,Italy,False,Italy,2,1930
1273,1930-02-23,Portugal,France,2,0,Friendly,Porto,Portugal,False,Portugal,2,1930
...,...,...,...,...,...,...,...,...,...,...,...,...
7189,1969-05-27,Denmark,Republic of Ireland,2,0,FIFA World Cup qualification,Copenhagen,Denmark,False,Denmark,2,1969
7190,1969-05-28,Chile,Argentina,1,1,Friendly,Santiago,Chile,False,draw,0,1969
7191,1969-06-01,Mexico,England,0,0,Friendly,Mexico City,Mexico,False,draw,0,1969
7193,1969-06-03,Mexico,England,0,4,Friendly,Guadalajara,Mexico,False,England,4,1969


In [10]:
#history = history[history.'tournament' == "FIFA World Cup qualification"| (history.'away_team']== "FIFA World Cup")]
wc = ["FIFA World Cup qualification", "FIFA World Cup"]
history = history[(history['tournament'].isin(wc)) | (history['tournament'].isin(wc))]
history

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference,year
1314,1930-07-13,Belgium,United States,0,3,FIFA World Cup,Montevideo,Uruguay,True,United States,3,1930
1315,1930-07-13,France,Mexico,4,1,FIFA World Cup,Montevideo,Uruguay,True,France,3,1930
1316,1930-07-14,Brazil,Yugoslavia,1,2,FIFA World Cup,Montevideo,Uruguay,True,Yugoslavia,1,1930
1318,1930-07-15,Argentina,France,1,0,FIFA World Cup,Montevideo,Uruguay,True,Argentina,1,1930
1319,1930-07-16,Chile,Mexico,3,0,FIFA World Cup,Montevideo,Uruguay,True,Chile,3,1930
...,...,...,...,...,...,...,...,...,...,...,...,...
43441,2022-03-30,Costa Rica,United States,2,0,FIFA World Cup qualification,San José,Costa Rica,False,Costa Rica,2,2022
43442,2022-03-30,Panama,Canada,1,0,FIFA World Cup qualification,Panama City,Panama,False,Panama,1,2022
43538,2022-06-05,Wales,Ukraine,1,0,FIFA World Cup qualification,Cardiff,Wales,False,Wales,1,2022
43704,2022-06-13,Australia,Peru,0,0,FIFA World Cup qualification,Al Rayyan,Qatar,True,draw,0,2022


In [11]:
# #drop columns that do not affect match outcomes
# # #likely, the date, home_score, away_score, tournament, city, country, neutral, and match_year should not make a difference
# df_matches = df_matches.drop(['date', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral','year'], axis=1)
# df_matches

NameError: name 'df_matches' is not defined

In [None]:
#Building the model
#Target: The winning team column will show "2" if the home team has won, "1" if it was a tie, and "0" if the away team has won
#The model will be build to predict the "winning_team"

df_matches = df_matches.reset_index(drop=True)
df_matches.loc[df_matches.winning_team == df_matches.home_team,'winning_team']=2
df_matches.loc[df_matches.winning_team == 'draw', 'winning_team']=1
df_matches.loc[df_matches.winning_team == df_matches.away_team, 'winning_team']=0

df_matches

In [None]:
final = pd.get_dummies(df_matches, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Separate X and y sets
X = final.drop(['winning_team'], axis=1)
y = final["winning_team"]
y=y.astype('int')

# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

score = classifier.score(X_train, y_train)
score2 = classifier.score(X_test, y_test)
print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

In [None]:
#load data
rank = pd.read_csv('data/fifa_rankings.csv', encoding='latin-1')
rank

In [None]:
schedule = pd.read_csv('Data/schedule.csv')

# Create new columns with ranking position of each team
schedule.insert(1, 'first_position', schedule['Home Team'].map(rank.set_index('Country')['Rank']))
schedule.insert(2, 'second_position', schedule['Away Team'].map(rank.set_index('Country')['Rank']))

# We only need the group stage games, so we have to slice the dataset
schedule = schedule.iloc[:48, :]
schedule

In [None]:
# Loop to add teams to new prediction dataset based on the ranking position of each team
pred_set= []
for index, row in schedule.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
    else:
        pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set

pred_set.head()

In [None]:
# Get dummy variables and drop winning_team column
pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]

# Remove winning team column
pred_set = pred_set.drop(['winning_team'], axis=1)

pred_set.head()

In [None]:
#group matches
predictions = classifier.predict(pred_set)
for i in range(schedule.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
    print("")

In [None]:
# List of group stage qualifiers (round of 16)
matches = [('Qatar', 'Wales'),
            ('USA', 'Ecuador'),
            ('Saudi Arabia', 'Australia'),
            ('Tunisia', 'Poland'),
            ('Costa Rica', 'Morocco'),
            ('Canada', 'Japan'),
            ('Cameroon', 'Korea Republic'),
            ('Ghana', 'Serbia')]

In [None]:
def clean_and_predict(matches, rank, final, classifier):

    # Initialization of auxiliary list for data cleaning
    positions = []

    # Loop to retrieve each team's position according to FIFA ranking
    for match in matches:
        positions.append(rank.loc[rank['Country'] == match[0],'Rank'].iloc[0])
        positions.append(rank.loc[rank['Country'] == match[1],'Rank'].iloc[0])
    
    # Creating the DataFrame for prediction
    pred_set = []

    # Initializing iterators for while loop
    i = 0
    j = 0

    # 'i' will be the iterator for the 'positions' list, and 'j' for the list of matches (list of tuples)
    while i < len(positions):
        dict1 = {}

        # If position of first team is better, he will be the 'home' team, and vice-versa
        if positions[i] < positions[i + 1]:
            dict1.update({'home_team': matches[j][0], 'away_team': matches[j][1]})
        else:
            dict1.update({'home_team': matches[j][1], 'away_team': matches[j][0]})

        # Append updated dictionary to the list, that will later be converted into a DataFrame
        pred_set.append(dict1)
        i += 2
        j += 1

    # Convert list into DataFrame
    pred_set = pd.DataFrame(pred_set)
    backup_pred_set = pred_set

    # Get dummy variables and drop winning_team column
    pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

    # Add missing columns compared to the model's training dataset
    missing_cols2 = set(final.columns) - set(pred_set.columns)
    for c in missing_cols2:
        pred_set[c] = 0
    pred_set = pred_set[final.columns]

    # Remove winning team column
    pred_set = pred_set.drop(['winning_team'], axis=1)

    # Predict!
    predictions = classifier.predict(pred_set)
    for i in range(len(pred_set)):
        print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
        if predictions[i] == 2:
            print("Winner: " + backup_pred_set.iloc[i, 1])
        elif predictions[i] == 1:
            print("Draw")
        elif predictions[i] == 0:
            print("Winner: " + backup_pred_set.iloc[i, 0])
        print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ' , '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
        print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1])) 
        print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
        print("")

In [None]:
clean_and_predict(group, rank, final, classifier)

In [None]:
# List of matches
quarters = [('Portugal', 'France'),
            ('Spain', 'Argentina'),
            ('Brazil', 'England'),
            ('Germany', 'Belgium')]

In [None]:
clean_and_predict(quarters, rank, final, classifier)

In [None]:
# List of matches
semi = [('Portugal', 'Spain'),
        ('England', 'Belgium')]

In [None]:
clean_and_predict(semi, rank, final, classifier)

In [None]:
# Finals
finals = [('Portugal', 'Belgium')]

In [None]:
clean_and_predict(finals, rank, final, classifier)