In [22]:
# Import our dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [23]:
#load data
history = pd.read_csv('data/results.csv')

In [24]:
history.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [25]:
#add a winning_team column to show which team won
winner = []
for i in range (len(history['home_team'])):
    if history['home_score'][i] > history['away_score'][i]:
        winner.append(history['home_team'][i])
    elif history['home_score'][i] < history['away_score'][i]:
        winner.append(history['away_team'][i])
    else:
        winner.append('draw')
history['winning_team']=winner
history

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland
...,...,...,...,...,...,...,...,...,...,...
43747,2022-06-14,Moldova,Andorra,2,1,UEFA Nations League,Chișinău,Moldova,False,Moldova
43748,2022-06-14,Liechtenstein,Latvia,0,2,UEFA Nations League,Vaduz,Liechtenstein,False,Latvia
43749,2022-06-14,Chile,Ghana,0,0,Kirin Cup,Suita,Japan,True,draw
43750,2022-06-14,Japan,Tunisia,0,3,Kirin Cup,Suita,Japan,False,Tunisia


In [26]:
#add a goal difference column
history['goal_difference']= np.absolute(history['home_score']-history['away_score'])

history.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England,2
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland,1
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland,3


In [27]:
#limit the dataset to the relevant teams playing in the 2022 world cup
teams_2022 = ['Qatar', 'Netherlands', 'Senegal', 'Ecuador', 
            'England', 'USA', 'Wales', 'Iran', 
            'Argentina', 'Poland', 'Mexico', 'Saudi Arabia', 
            'France', 'Denmark', 'Tunisia', 'Australia', 
            'Germany', 'Spain', 'Japan', 'Costa Rica', 
            'Belgium', 'Croatia', 'Canada', 'Morocco', 
            'Brazil', 'Switzerland', 'Serbia', 'Cameroon', 
            'Portugal', 'Uruguay', 'Ghana', 'Korea Republic']


In [28]:
#only need data on relevant teams
history = history[(history['home_team'].isin(teams_2022)) | (history['away_team'].isin(teams_2022))]
history

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False,draw,0
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False,England,2
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False,Scotland,1
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False,draw,0
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False,Scotland,3
...,...,...,...,...,...,...,...,...,...,...,...
43738,2022-06-14,Germany,Italy,5,2,UEFA Nations League,Mönchengladbach,Germany,False,Germany,3
43739,2022-06-14,Netherlands,Wales,3,2,UEFA Nations League,Rotterdam,Netherlands,False,Netherlands,1
43740,2022-06-14,Poland,Belgium,0,1,UEFA Nations League,Warsaw,Poland,False,Belgium,1
43749,2022-06-14,Chile,Ghana,0,0,Kirin Cup,Suita,Japan,True,draw,0


In [29]:
#create a year column
year = []
for row in history['date']:
    year.append(int(row[:4]))
history['year'] = year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  history['year'] = year


In [30]:
#drop games before 1930
df_matches = history[history.year >= 1930]
df_matches

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_team,goal_difference,year
1266,1930-01-01,Spain,Czechoslovakia,1,0,Friendly,Barcelona,Spain,False,Spain,1,1930
1267,1930-01-12,Portugal,Czechoslovakia,1,0,Friendly,Lisbon,Portugal,False,Portugal,1,1930
1269,1930-02-01,Northern Ireland,Wales,7,0,British Championship,Belfast,Northern Ireland,False,Northern Ireland,7,1930
1270,1930-02-09,Italy,Switzerland,4,2,Friendly,Rome,Italy,False,Italy,2,1930
1273,1930-02-23,Portugal,France,2,0,Friendly,Porto,Portugal,False,Portugal,2,1930
...,...,...,...,...,...,...,...,...,...,...,...,...
43738,2022-06-14,Germany,Italy,5,2,UEFA Nations League,Mönchengladbach,Germany,False,Germany,3,2022
43739,2022-06-14,Netherlands,Wales,3,2,UEFA Nations League,Rotterdam,Netherlands,False,Netherlands,1,2022
43740,2022-06-14,Poland,Belgium,0,1,UEFA Nations League,Warsaw,Poland,False,Belgium,1,2022
43749,2022-06-14,Chile,Ghana,0,0,Kirin Cup,Suita,Japan,True,draw,0,2022


In [40]:
#drop columns that do not affect match outcomes
#likely, the date, home_score, away_score, tournament, city, country, neutral, and match_year should not make a difference
df_matches = df_matches.drop(['date', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral','year'], axis=1)
df_matches

KeyError: "['date' 'home_score' 'away_score' 'tournament' 'city' 'country' 'neutral'\n 'year'] not found in axis"

In [41]:
#Building the model
#Target: The winning team column will show "2" if the home team has won, "1" if it was a tie, and "0" if the away team has won
#The model will be build to predict the "winning_team"

df_matches = df_matches.reset_index(drop=True)
df_matches.loc[df_matches.winning_team == df_matches.home_team,'winning_team']=2
df_matches.loc[df_matches.winning_team == 'draw', 'winning_team']=1
df_matches.loc[df_matches.winning_team == df_matches.away_team, 'winning_team']=0

df_matches

Unnamed: 0,home_team,away_team,winning_team
0,Spain,Czechoslovakia,2
1,Portugal,Czechoslovakia,2
2,Northern Ireland,Wales,2
3,Italy,Switzerland,2
4,Portugal,France,2
...,...,...,...
16530,Germany,Italy,2
16531,Netherlands,Wales,2
16532,Poland,Belgium,0
16533,Chile,Ghana,1


In [42]:
final = pd.get_dummies(df_matches, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Separate X and y sets
X = final.drop(['winning_team'], axis=1)
y = final["winning_team"]
y=y.astype('int')

# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [45]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

score = classifier.score(X_train, y_train)
score2 = classifier.score(X_test, y_test)
print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.586
Test set accuracy:  0.572


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
#load data
rank = pd.read_csv('data/fifa_rankings.csv', encoding='latin-1')
rank

Unnamed: 0,Rank,Country,Points
0,1,Brazil,1837.56
1,2,Belgium,1821.92
2,3,Argentina,1770.65
3,4,France,1764.85
4,5,England,1737.46
...,...,...,...
206,207,Sri Lanka,825.25
207,208,US Virgin Islands,823.97
208,209,British Virgin Islands,809.32
209,210,Anguilla,790.74


In [60]:
schedule = pd.read_csv('Data/schedule.csv')

# Create new columns with ranking position of each team
schedule.insert(1, 'first_position', schedule['Home Team'].map(rank.set_index('Country')['Rank']))
schedule.insert(2, 'second_position', schedule['Away Team'].map(rank.set_index('Country')['Rank']))

# We only need the group stage games, so we have to slice the dataset
schedule = schedule.iloc[:48, :]
schedule

Unnamed: 0,Match Number,first_position,second_position,Round Number,Date,Location,Home Team,Away Team,Group,Result
0,2,18.0,8.0,1,21/11/2022 10:00,Al Thumama Stadium,Senegal,Netherlands,Group A,
1,3,5.0,23.0,1,21/11/2022 13:00,Khalifa International Stadium,England,Iran,Group B,
2,1,49.0,44.0,1,21/11/2022 16:00,Al Bayt Stadium,Qatar,Ecuador,Group A,
3,4,,19.0,1,21/11/2022 19:00,Ahmad Bin Ali Stadium,USA,Wales,Group B,
4,8,3.0,53.0,1,22/11/2022 10:00,Lusail Stadium,Argentina,Saudi Arabia,Group C,
5,6,10.0,30.0,1,22/11/2022 13:00,Education City Stadium,Denmark,Tunisia,Group D,
6,7,12.0,26.0,1,22/11/2022 16:00,Stadium 974,Mexico,Poland,Group C,
7,5,4.0,39.0,1,22/11/2022 19:00,Al Janoub Stadium,France,Australia,Group D,
8,12,22.0,15.0,1,23/11/2022 10:00,Al Bayt Stadium,Morocco,Croatia,Group F,
9,11,11.0,24.0,1,23/11/2022 13:00,Khalifa International Stadium,Germany,Japan,Group E,


In [64]:
# Loop to add teams to new prediction dataset based on the ranking position of each team
pred_set= []
for index, row in schedule.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
    else:
        pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set

pred_set.head()

Unnamed: 0,home_team,away_team,winning_team
0,Netherlands,Senegal,
1,England,Iran,
2,Ecuador,Qatar,
3,Wales,USA,
4,Argentina,Saudi Arabia,


In [66]:
# Get dummy variables and drop winning_team column
pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]

# Remove winning team column
pred_set = pred_set.drop(['winning_team'], axis=1)

pred_set.head()

  pred_set[c] = 0


Unnamed: 0,home_team_Afghanistan,home_team_Albania,home_team_Algeria,home_team_Andalusia,home_team_Andorra,home_team_Angola,home_team_Argentina,home_team_Armenia,home_team_Aruba,home_team_Australia,...,away_team_Vanuatu,away_team_Venezuela,away_team_Vietnam,away_team_Vietnam Republic,away_team_Wales,away_team_Yemen,away_team_Yemen DPR,away_team_Yugoslavia,away_team_Zambia,away_team_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
#group matches
predictions = classifier.predict(pred_set)
for i in range(schedule.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
    print("")

Senegal and Netherlands
Winner: Senegal
Probability of Senegal winning:  0.591
Probability of Draw:  0.276
Probability of Netherlands winning:  0.133

Iran and England
Winner: Iran
Probability of Iran winning:  0.593
Probability of Draw:  0.290
Probability of England winning:  0.118

Qatar and Ecuador
Winner: Qatar
Probability of Qatar winning:  0.615
Probability of Draw:  0.228
Probability of Ecuador winning:  0.156

USA and Wales
Winner: USA
Probability of USA winning:  0.657
Probability of Draw:  0.189
Probability of Wales winning:  0.154

Saudi Arabia and Argentina
Winner: Saudi Arabia
Probability of Saudi Arabia winning:  0.774
Probability of Draw:  0.166
Probability of Argentina winning:  0.060

Tunisia and Denmark
Winner: Tunisia
Probability of Tunisia winning:  0.552
Probability of Draw:  0.264
Probability of Denmark winning:  0.184

Poland and Mexico
Winner: Poland
Probability of Poland winning:  0.421
Probability of Draw:  0.299
Probability of Mexico winning:  0.280

Australi

In [85]:
# List of group stage qualifiers (round of 16)
matches = [('Qatar', 'Wales'),
            ('USA', 'Ecuador'),
            ('Saudi Arabia', 'Australia'),
            ('Tunisia', 'Poland'),
            ('Costa Rica', 'Morocco'),
            ('Canada', 'Japan'),
            ('Cameroon', 'Korea Republic'),
            ('Ghana', 'Serbia')]

In [86]:
def clean_and_predict(matches, rank, final, classifier):

    # Initialization of auxiliary list for data cleaning
    positions = []

    # Loop to retrieve each team's position according to FIFA ranking
    for match in matches:
        positions.append(rank.loc[rank['Country'] == match[0],'Rank'].iloc[0])
        positions.append(rank.loc[rank['Country'] == match[1],'Rank'].iloc[0])
    
    # Creating the DataFrame for prediction
    pred_set = []

    # Initializing iterators for while loop
    i = 0
    j = 0

    # 'i' will be the iterator for the 'positions' list, and 'j' for the list of matches (list of tuples)
    while i < len(positions):
        dict1 = {}

        # If position of first team is better, he will be the 'home' team, and vice-versa
        if positions[i] < positions[i + 1]:
            dict1.update({'home_team': matches[j][0], 'away_team': matches[j][1]})
        else:
            dict1.update({'home_team': matches[j][1], 'away_team': matches[j][0]})

        # Append updated dictionary to the list, that will later be converted into a DataFrame
        pred_set.append(dict1)
        i += 2
        j += 1

    # Convert list into DataFrame
    pred_set = pd.DataFrame(pred_set)
    backup_pred_set = pred_set

    # Get dummy variables and drop winning_team column
    pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

    # Add missing columns compared to the model's training dataset
    missing_cols2 = set(final.columns) - set(pred_set.columns)
    for c in missing_cols2:
        pred_set[c] = 0
    pred_set = pred_set[final.columns]

    # Remove winning team column
    pred_set = pred_set.drop(['winning_team'], axis=1)

    # Predict!
    predictions = classifier.predict(pred_set)
    for i in range(len(pred_set)):
        print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
        if predictions[i] == 2:
            print("Winner: " + backup_pred_set.iloc[i, 1])
        elif predictions[i] == 1:
            print("Draw")
        elif predictions[i] == 0:
            print("Winner: " + backup_pred_set.iloc[i, 0])
        print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ' , '%.3f'%(classifier.predict_proba(pred_set)[i][2]))
        print('Probability of Draw: ', '%.3f'%(classifier.predict_proba(pred_set)[i][1])) 
        print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(classifier.predict_proba(pred_set)[i][0]))
        print("")

In [87]:
clean_and_predict(group, rank, final, classifier)

IndexError: single positional indexer is out-of-bounds

In [88]:
# List of matches
quarters = [('Portugal', 'France'),
            ('Spain', 'Argentina'),
            ('Brazil', 'England'),
            ('Germany', 'Belgium')]

In [89]:
clean_and_predict(quarters, rank, final, classifier)

Portugal and France
Winner: Portugal
Probability of Portugal winning:  0.564
Probability of Draw:  0.203
Probability of France winning:  0.234

Spain and Argentina
Winner: Spain
Probability of Spain winning:  0.403
Probability of Draw:  0.383
Probability of Argentina winning:  0.214

England and Brazil
Winner: England
Probability of England winning:  0.533
Probability of Draw:  0.245
Probability of Brazil winning:  0.222

Germany and Belgium
Winner: Belgium
Probability of Germany winning:  0.272
Probability of Draw:  0.205
Probability of Belgium winning:  0.522



  pred_set[c] = 0


In [90]:
# List of matches
semi = [('Portugal', 'Spain'),
        ('England', 'Belgium')]

In [91]:
clean_and_predict(semi, rank, final, classifier)

Portugal and Spain
Winner: Portugal
Probability of Portugal winning:  0.627
Probability of Draw:  0.213
Probability of Spain winning:  0.160

England and Belgium
Winner: Belgium
Probability of England winning:  0.304
Probability of Draw:  0.238
Probability of Belgium winning:  0.458



  pred_set[c] = 0


In [92]:
# Finals
finals = [('Portugal', 'Belgium')]

In [94]:
clean_and_predict(finals, rank, final, classifier)

Portugal and Belgium
Winner: Portugal
Probability of Portugal winning:  0.466
Probability of Draw:  0.212
Probability of Belgium winning:  0.322



  pred_set[c] = 0
