## Mounting shared drive

In [None]:
# Colab helper module used to attach Google Drive to this runtime
from google.colab import drive

# Mount Drive under /content/drive so the project files are reachable as normal paths
drive.mount('/content/drive')
# Project folder on the shared drive; later cells build file names by appending to it,
# so the trailing slash is required
path = '/content/drive/Shared drives/597ML_Project/597ML_Project/'

Mounted at /content/drive


## Importing necessary libraries, modules, and data files:

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
import pickle
from tabulate import tabulate
import math
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn import svm
from sklearn.model_selection import train_test_split

In [None]:
# import csvs
results = pd.read_csv(path + 'results.csv')
fixtures = pd.read_csv(path + 'fifa-world-cup-2022-fixtures.csv')
ranking = pd.read_csv(path + 'new_rankings.csv', encoding = 'latin-1')

## Loading Saved Models

In [None]:
def saveModelsToDrive(filename, clf, drive = path + 'Models/'):
    """Pickle ``clf`` to the file ``drive + filename``.

    Parameters
    ----------
    filename : str
        File name to create (or overwrite) inside ``drive``.
    clf : object
        Any picklable object; in this notebook, a fitted classifier.
    drive : str
        Directory prefix. It is concatenated with ``filename`` as-is, so it
        must end with a path separator. Defaults to the ``Models/`` folder
        under the notebook-level ``path``.
    """
    # The context manager flushes and closes the file even if pickling fails,
    # instead of leaving the handle open until garbage collection.
    with open(drive + filename, 'wb') as model_file:
        pickle.dump(clf, model_file)

def loadModelsFromDrive(filename, drive = path + 'Models/'):
    """Load and return the pickled object stored at ``drive + filename``.

    Parameters
    ----------
    filename : str
        Name of the pickle file inside ``drive``.
    drive : str
        Directory prefix, concatenated with ``filename`` as-is (must end with
        a path separator). Defaults to the ``Models/`` folder under the
        notebook-level ``path``.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # NOTE(security): unpickling can execute arbitrary code - only load files
    # that were written by this project.
    # The context manager guarantees the file handle is closed after loading.
    with open(drive + filename, 'rb') as model_file:
        return pickle.load(model_file)

In [None]:
# Reload the previously pickled classifiers from the Models/ folder.
# NOTE: the training cells further down rebind `logistic` and `svm_clf` to freshly
# fitted models (and re-save them under these same file names).
filename = 'logistic_Monday_2.sav'
logistic = loadModelsFromDrive(filename)

filename2 = 'svm_Monday_2.sav'
svm_clf = loadModelsFromDrive(filename2)

# Exploratory data analysis

### Checking the number of columns and rows in each file

In [None]:
# Report the (rows, columns) shape of each of the three loaded DataFrames
print('The data file results has these many columns and rows represented as (rows, columns):')
print(results.shape)
print('\nThe data file fixtures has these many columns and rows represented as (rows, columns):')
print(fixtures.shape)
print('\nThe data file ranking has these many columns and rows represented as (rows, columns):')
print(ranking.shape)

The data file results has these many columns and rows represented as (rows, columns):
(39654, 9)

The data file fixtures has these many columns and rows represented as (rows, columns):
(48, 3)

The data file ranking has these many columns and rows represented as (rows, columns):
(75, 3)


### Looking at a few values within each of the data files

In [None]:
# Preview the first rows of the historical match results
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False


In [None]:
# Preview the first rows of the World Cup fixture list
fixtures.head()

Unnamed: 0,Home Team,Away Team,Group
0,Senegal,Netherlands,Group A
1,England,Iran,Group B
2,Qatar,Ecuador,Group A
3,USA,Wales,Group B
4,Argentina,Saudi Arabia,Group C


In [None]:
# Preview the first rows of the FIFA ranking table
ranking.head()

Unnamed: 0,Position,Team,Points
0,1,Belgium,1828
1,2,Brazil,1823
2,3,France,1786
3,4,Argentina,1767
4,5,England,1756


### As we can see, the results data file contains matches from 1872 onwards. Data that old will not help us make a better prediction; in fact, it will make our prediction worse. The dataset also contains teams that have not qualified for the 2022 World Cup, and their data should be omitted. Moreover, the results data file has the names of the teams playing and the score, but it does not have the name of the winning team or the goal difference, both of which we need. Therefore we have to change the dataset a little bit:

## Changing the results.csv dataset to include name of winning team and goal difference

In [None]:
# Label every match with the name of the winning team, or 'Draw' when the scores
# are level. The comparison is vectorised: this avoids a Python-level loop with
# scalar label lookups (`results[col][i]`), which is slow on tens of thousands of
# rows and only works while the frame still has its default 0..n-1 index.
home_won = results['home_score'] > results['away_score']
away_won = results['home_score'] < results['away_score']

# Home team where the home side won, else away team where the away side won,
# else (level scores, or scores that cannot be compared) 'Draw'
winner = results['home_team'].where(home_won, results['away_team'].where(away_won, 'Draw'))

# adding winning team column
results['winning_team'] = winner

# adding goal difference column (absolute margin, regardless of which side won)
results['goal_difference'] = (results['home_score'] - results['away_score']).abs()

## Changing the results.csv dataset to only show record of the teams qualified for this edition of the world cup and also to only show results from 2000 onwards

In [None]:
# The 32 teams qualified for the 2022 World Cup.
# NOTE(review): these are the spellings used in the fixtures file; confirm that
# results.csv spells every team the same way (e.g. 'Korea Republic', 'USA') -
# a name that does not occur in results.csv simply matches no rows.
worldcup_teams = ['Qatar', 'Germany', 'Denmark', 'Korea Republic',
                  'Brazil', 'France', 'Belgium', 'Croatia',
                  'Spain', 'Serbia', 'England', 'Switzerland',
                  'Netherlands', 'Argentina', 'Iran', 'Japan',
                  'Saudi Arabia', 'Ecuador', 'Uruguay', 'Canada',
                  'Ghana', 'Senegal', 'Portugal', 'Poland',
                  'Tunisia', 'Morocco', 'Cameroon', 'USA',
                  'Mexico', 'Costa Rica', 'Wales', 'Australia']

# Keep every match that involves at least one qualified team. A single boolean
# mask keeps each match exactly once; concatenating the "home" and "away"
# selections would list matches between two qualified teams twice.
home_qualified = results['home_team'].isin(worldcup_teams)
away_qualified = results['away_team'].isin(worldcup_teams)
df_teams = results[home_qualified | away_qualified].copy()

# The date column is a 'YYYY-MM-DD' string; its first four characters are the year
df_teams['match_year'] = df_teams['date'].str[:4].astype(int)

# Only matches from 2000 onwards are used for training
df_teams_2000 = df_teams[df_teams['match_year'] >= 2000]

# Drop the columns that are not model inputs. This must start from the
# year-filtered frame, otherwise the filter above is thrown away.
df_teams_2000 = df_teams_2000.drop(['date', 'home_score', 'away_score', 'tournament', 'city', 'country',
                                    'goal_difference', 'match_year'], axis = 1)

### Since it is not very important to visualise the data, we will not be making statistical plots. We already have a pretty good idea of what the data looks like, we can continue onwards to building our model

## Giving points to the teams. 3 points for winning, 1 point for drawing, and 0 points for losing a game

In [None]:
# Renumber the rows 0..n-1 after the earlier filtering
df_teams_2000 = df_teams_2000.reset_index(drop=True)

# Replace the winner's name by the points earned by the HOME side:
# 3 = home win, 1 = draw, 0 = away win. The rules are applied one after another,
# each test being evaluated on the column as left by the previous rule.
outcome_rules = (
    (lambda frame: frame.winning_team == frame.home_team, 3),
    (lambda frame: frame.winning_team == 'Draw', 1),
    (lambda frame: frame.winning_team == frame.away_team, 0),
)
for outcome_matches, outcome_points in outcome_rules:
    df_teams_2000.loc[outcome_matches(df_teams_2000), 'winning_team'] = outcome_points

## Splitting the data into testing and training data, as well as cross validating

In [None]:
# One-hot encode the two team columns; every other column is passed through as-is
final = pd.get_dummies(
    df_teams_2000,
    columns=['home_team', 'away_team'],
    prefix=['home_team', 'away_team'],
)

# Target: the encoded match outcome as integers; features: everything else
y = final["winning_team"].astype('int')
X = final.drop(columns=['winning_team'])

# Hold out 30% of the matches for testing (fixed seed keeps the split repeatable)
# TODO: look at a K-fold approach
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42,
)

In [None]:
from sklearn import svm

# Cross-validate a linear-kernel SVM on the training split, collecting
# macro-averaged precision and recall for every fold
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(
    kernel='linear',
    C=1,
    random_state=0,
)
scores = cross_validate(clf, X_train, y_train, scoring=scoring)

# Per-fold macro recall (displayed as the cell output)
scores['test_recall_macro']

array([0.45558379, 0.45635134, 0.44925555, 0.46130716, 0.45355348])

## SVM Model


In [None]:
from sklearn import svm

# Support-vector classifier with the library's default settings
# (svm.LinearSVC() would be the one-vs-the-rest alternative)
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

# Accuracy on the data the model was fitted on, then on the held-out split
score = svm_clf.score(X_train, y_train)
score2 = svm_clf.score(X_test, y_test)

for caption, accuracy in (('Training set accuracy ', score), ('Test set accuracy ', score2)):
    print(caption, '%.3f'%(accuracy))

# Persist the fitted model to the Models/ folder
saveModelsToDrive('svm_Monday_2.sav', svm_clf)

Training set accuracy  0.655
Test set accuracy  0.560


## Logistic Regression Model:

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression; max_iter raised to 1000 iterations
logistic = LogisticRegression(penalty='l2', max_iter = 1000)
logistic.fit(X_train, y_train)

# Accuracy on the data the model was fitted on, then on the held-out split
score = logistic.score(X_train, y_train)
score2 = logistic.score(X_test, y_test)

for caption, accuracy in (('Training set accuracy ', score), ('Test set accuracy ', score2)):
    print(caption, '%.3f'%(accuracy))

# Persist the fitted model to the Models/ folder
saveModelsToDrive('logistic_Monday_2.sav', logistic)

Training set accuracy  0.581
Test set accuracy  0.554


## Random Forest Model:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, random feature subsets).
# Fixing random_state makes the fitted model - and therefore the accuracies
# printed below - reproducible on Restart & Run All. 42 matches the seed
# already used for the train/test split.
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)

# Accuracy on the data the model was fitted on, then on the held-out split
score = forest_clf.score(X_train, y_train)
score2 = forest_clf.score(X_test, y_test)

print('Training set accuracy ', '%.3f'%(score))
print('Test set accuracy ', '%.3f'%(score2))

# Persist the fitted model to the Models/ folder
saveModelsToDrive("RandomForest.sav", forest_clf)

Training set accuracy  0.707
Test set accuracy  0.543


## We will make match predictions using the latest FIFA rankings and the planned group-stage fixtures. For every scheduled match, the team ranked higher in the latest FIFA rankings is treated as the home team.

In [None]:
# List for storing the group stage games
pred_set = []

# Create new columns with ranking position of each team
fixtures.insert(1, 'first_position', fixtures['Home Team'].map(ranking.set_index('Team')['Position']))
fixtures.insert(2, 'second_position', fixtures['Away Team'].map(ranking.set_index('Team')['Position']))

# We only need the group stage games, so we have to slice the dataset
fixtures = fixtures.iloc[:48, :]
################################################################################
# MAKING PREDICTION
################################################################################

# Loop to add teams to new prediction dataset based on the ranking position of each team
for index, row in fixtures.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
    else:
        pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})

pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set

pred_set.head()

# Get dummy variables and drop winning_team column
pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_set.columns)
for i in missing_cols:
    pred_set[i] = 0
pred_set = pred_set[final.columns]

# Remove winning team column
pred_set = pred_set.drop(['winning_team'], axis=1)

pred_set.head()
winners = []
draws = []
predictions = svm_clf.predict(pred_set)
for i in range(fixtures.shape[0]):
    print('Match Number: ', i+1)
    print(backup_pred_set.iloc[i, 0] + ' and ' + backup_pred_set.iloc[i, 1])
    if predictions[i] == 3:
        print('Winner: ' + backup_pred_set.iloc[i, 0])
        winners.append(backup_pred_set.iloc[i, 0])

    elif predictions[i] == 1:
        print('Draw')
        draws.append(backup_pred_set.iloc[i, 0])
        draws.append(backup_pred_set.iloc[i, 1])

    elif predictions[i] == 0:
        print('Winner: ' + backup_pred_set.iloc[i, 1])
        winners.append(backup_pred_set.iloc[i, 1])

    print('')



Match Number:  1
Netherlands and Senegal
Winner: Netherlands

Match Number:  2
England and Iran
Winner: England

Match Number:  3
Ecuador and Qatar
Winner: Ecuador

Match Number:  4
USA and Wales
Winner: USA

Match Number:  5
Argentina and Saudi Arabia
Winner: Argentina

Match Number:  6
Denmark and Tunisia
Winner: Denmark

Match Number:  7
Mexico and Poland
Winner: Mexico

Match Number:  8
France and Australia
Winner: France

Match Number:  9
Croatia and Morocco
Winner: Croatia

Match Number:  10
Germany and Japan
Winner: Germany

Match Number:  11
Spain and Costa Rica
Winner: Spain

Match Number:  12
Belgium and Canada
Winner: Belgium

Match Number:  13
Switzerland and Cameroon
Winner: Switzerland

Match Number:  14
Korea Republic and Uruguay
Winner: Uruguay

Match Number:  15
Portugal and Ghana
Winner: Portugal

Match Number:  16
Brazil and Serbia
Winner: Brazil

Match Number:  17
Wales and Iran
Winner: Iran

Match Number:  18
Senegal and Qatar
Winner: Senegal

Match Number:  19
Net

## Outputting results of the group stage in a table:

In [None]:
def get_table(groupArray, winning_teams=None, drawing_teams=None):
    """Build the standings table of one group, best team first.

    Parameters
    ----------
    groupArray : list of str
        Names of the teams in the group.
    winning_teams : iterable of str, optional
        One entry per match won (a team appears once for each win).
        Defaults to the notebook-level ``winners`` list.
    drawing_teams : iterable of str, optional
        One entry per team per drawn match. Defaults to the notebook-level
        ``draws`` list.

    Returns
    -------
    list of list
        Row 0 is the header ``['Team Name', 'Wins', 'Draws', 'Losses',
        'Points']``; the following rows are ``[team, wins, draws, losses,
        points]`` sorted by points in descending order. Teams level on points
        keep their relative order from ``groupArray``.
    """
    if winning_teams is None:
        winning_teams = winners
    if drawing_teams is None:
        drawing_teams = draws

    # Round-robin group: every team plays each of the other teams once
    matches_per_team = len(groupArray) - 1

    groupTable = []
    for team in groupArray:
        wins = sum(1 for name in winning_teams if name == team)
        drawn = sum(1 for name in drawing_teams if name == team)
        losses = matches_per_team - (wins + drawn)
        points = 3 * wins + drawn      # 3 per win, 1 per draw
        groupTable.append([team, wins, drawn, losses, points])

    # list.sort is stable, so ties keep the input order
    groupTable.sort(key=lambda standing: standing[4], reverse=True)

    # The header has exactly one label per value in the rows (five); a sixth
    # label would have no column to describe.
    groupTable.insert(0, ['Team Name', 'Wins', 'Draws', 'Losses', 'Points'])
    return groupTable

# Composition of the eight World Cup groups
groups = {
    'Group A': ['Qatar', 'Ecuador', 'Senegal', 'Netherlands'],
    'Group B': ['England', 'Iran', 'USA', 'Wales'],
    'Group C': ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland'],
    'Group D': ['France', 'Australia', 'Denmark', 'Tunisia'],
    'Group E': ['Spain', 'Germany', 'Japan', 'Costa Rica'],
    'Group F': ['Belgium', 'Canada', 'Morocco', 'Croatia'],
    'Group G': ['Brazil', 'Serbia', 'Switzerland', 'Cameroon'],
    'Group H': ['Portugal', 'Ghana', 'Uruguay', 'Korea Republic'],
}

# One standings table per group, in group order; each table is printed as it is built
newerArray = []
for group_name, group_teams in groups.items():
    group_standings = get_table(group_teams)
    newerArray.append(group_standings)
    print(group_name)
    print(tabulate(group_standings, headers='firstrow', tablefmt='fancy_grid'), '\n')

Group A
╒═════════════╤════════╤═════════╤══════════╤══════════╕
│ Team Name   │   Wins │   Draws │   Losses │   Points │
╞═════════════╪════════╪═════════╪══════════╪══════════╡
│ Netherlands │      3 │       0 │        0 │        9 │
├─────────────┼────────┼─────────┼──────────┼──────────┤
│ Senegal     │      2 │       0 │        1 │        6 │
├─────────────┼────────┼─────────┼──────────┼──────────┤
│ Ecuador     │      1 │       0 │        2 │        3 │
├─────────────┼────────┼─────────┼──────────┼──────────┤
│ Qatar       │      0 │       0 │        3 │        0 │
╘═════════════╧════════╧═════════╧══════════╧══════════╛ 

Group B
╒═════════════╤════════╤═════════╤══════════╤══════════╕
│ Team Name   │   Wins │   Draws │   Losses │   Points │
╞═════════════╪════════╪═════════╪══════════╪══════════╡
│ England     │      3 │       0 │        0 │        9 │
├─────────────┼────────┼─────────┼──────────┼──────────┤
│ Iran        │      1 │       1 │        1 │        4 │
├────────────

## Making a function to predict winners. This function will be used for all knockout stage matches:

In [None]:
def _break_tie(model, match_features, home_team, away_team):
    """Choose who advances from a knockout match that the model called a draw.

    If the model exposes class probabilities, the side with the larger win
    probability advances (class 3 = home win, class 0 = away win). Otherwise,
    or when the probabilities do not favour the away side, the home team -
    i.e. the better-ranked side - advances.
    """
    if hasattr(model, 'predict_proba') and hasattr(model, 'classes_'):
        probabilities = model.predict_proba(match_features)[0]
        by_class = dict(zip(model.classes_, probabilities))
        if by_class.get(0, 0.0) > by_class.get(3, 0.0):
            return away_team
    return home_team


def predict(matches, ranking, final, logistic):
    """Predict the winner of every knockout match and print the results.

    Parameters
    ----------
    matches : list of tuple
        One ``(team, team)`` pair per match.
    ranking : pandas.DataFrame
        Must contain the columns ``'Team'`` and ``'Position'``; every team in
        ``matches`` must be listed (an unlisted team raises ``IndexError``).
    final : pandas.DataFrame
        The frame the model was trained from. Only its column names are used,
        to lay the prediction features out exactly like the training features;
        it must include ``'winning_team'``.
    logistic : estimator
        Fitted classifier with a ``predict`` method returning 3 (home win),
        1 (draw) or 0 (away win).

    Returns
    -------
    list of str
        The advancing team of each match, in the order of ``matches``.
        A knockout match cannot end in a draw, so a predicted draw is resolved
        with a tie-break instead of being dropped; dropping it would shorten
        the list and misalign every later round of the bracket.
    """
    # Nothing to predict (also avoids encoding a frame that has no columns)
    if len(matches) == 0:
        return []

    def position_of(team):
        # Ranking position of `team`; IndexError if the team is not in `ranking`
        return ranking.loc[ranking['Team'] == team, 'Position'].iloc[0]

    # The better-ranked side (numerically smaller position) plays 'at home'
    pairings = []
    for match in matches:
        first_team, second_team = match[0], match[1]
        if position_of(first_team) < position_of(second_team):
            pairings.append({'home_team': first_team, 'away_team': second_team})
        else:
            pairings.append({'home_team': second_team, 'away_team': first_team})

    # One-hot encode the pairings the same way as the training data
    pred_set = pd.get_dummies(pd.DataFrame(pairings),
                              prefix = ['home_team', 'away_team'],
                              columns = ['home_team', 'away_team'])

    # Align with the training columns: unseen-here columns become 0, columns
    # unknown to the model are dropped, order follows `final`
    pred_set = pred_set.reindex(columns=final.columns, fill_value=0)

    # Remove winning team column (it is the target, not a feature)
    pred_set = pred_set.drop(['winning_team'], axis=1)

    # List of winning teams
    winningTeams = []

    # Prediction
    predictions = logistic.predict(pred_set)
    for i, pairing in enumerate(pairings):
        home_team = pairing['home_team']
        away_team = pairing['away_team']
        print(away_team + ' and ' + home_team)
        if predictions[i] == 3:
            print('Winner: ' + home_team, '\n')
            winningTeams.append(home_team)
        elif predictions[i] == 0:
            print('Winner: ' + away_team, '\n')
            winningTeams.append(away_team)
        elif predictions[i] == 1:
            advancing = _break_tie(logistic, pred_set.iloc[[i]], home_team, away_team)
            print('Winner (tie-break after predicted draw): ' + advancing, '\n')
            winningTeams.append(advancing)

    return winningTeams

## Making an array to represent which teams are playing against each other in the round of 16 and making predictions

In [None]:
# Making an array to represent
advancingTeams = []
for i in range(0,len(newerArray),2):
    x = (newerArray[i][1][0], newerArray[i+1][2][0])
    y = (newerArray[i][2][0], newerArray[i+1][1][0])
    advancingTeams.append(x)
    advancingTeams.append(y)

round_of_16 = advancingTeams
listTeamWins = predict(round_of_16, ranking, final, logistic)


Iran and Netherlands
Winner: Netherlands 

Senegal and England
Winner: England 

Denmark and Argentina
Winner: Argentina 

Mexico and France
Winner: France 

Spain and Belgium
Winner: Spain 

Croatia and Germany
Winner: Germany 

Uruguay and Brazil
Winner: Brazil 

Serbia and Portugal
Winner: Portugal 





## Making an array to represent which teams are playing against each other in the quarterfinals and making predictions

In [None]:
# This cell consumes the eight round-of-16 winners held in `listTeamWins` and then
# rebinds that name to the quarter-final winners, so it cannot be re-run on its own.
# Fail with a clear message instead of an opaque IndexError when that happens.
if len(listTeamWins) != 8:
    raise ValueError('Expected 8 round-of-16 winners in listTeamWins, got %d; '
                     're-run the round-of-16 cell first.' % len(listTeamWins))

# Within each half of four winners, winner 0 meets winner 2 and winner 1 meets winner 3.
# No `x` / `y` temporaries: `y` is the global holding the training labels.
advancingTeams = []
for i in range(0, 8, 4):
    advancingTeams.append((listTeamWins[i], listTeamWins[i+2]))
    advancingTeams.append((listTeamWins[i+1], listTeamWins[i+3]))

quarter_final = advancingTeams
listTeamWins = predict(quarter_final, ranking, final, logistic)


Netherlands and Argentina
Winner: Argentina 

England and France
Winner: England 

Spain and Brazil
Winner: Brazil 

Germany and Portugal
Winner: Portugal 





## Making an array to represent which teams are playing against each other in the semifinals and making predictions

In [None]:
# This cell consumes the four quarter-final winners held in `listTeamWins` and then
# rebinds that name to the semi-final winners, so it cannot be re-run on its own.
# Fail with a clear message instead of an opaque IndexError when that happens.
if len(listTeamWins) != 4:
    raise ValueError('Expected 4 quarter-final winners in listTeamWins, got %d; '
                     're-run the quarter-final cell first.' % len(listTeamWins))

# Winner 0 meets winner 2, winner 1 meets winner 3.
# No `x` / `y` temporaries: `y` is the global holding the training labels.
advancingTeams = [
    (listTeamWins[0], listTeamWins[2]),
    (listTeamWins[1], listTeamWins[3]),
]

semi_final = advancingTeams

listTeamWins = predict(semi_final, ranking, final, logistic)


Argentina and Brazil
Winner: Brazil 

Portugal and England
Winner: England 





## Making an array to represent which teams are playing against each other in the finals and making predictions

In [None]:
# The two semi-final winners contest the final
finalist_pair = (listTeamWins[0], listTeamWins[1])
finals = [finalist_pair]
advancingTeams = finals

# `winner` is a list holding the predicted champion
winner = predict(finals, ranking, final, logistic)


England and Brazil
Winner: Brazil 





## Visualizing the results of the knockout stage in a bracket:

In [None]:
def _match_containing(team, matches):
    """Return the first pairing in ``matches`` that ``team`` took part in."""
    for match in matches:
        if team in match:
            return match
    raise ValueError(team + ' does not appear in the previous round')


# Put the semi-final that produced the first finalist on the left of the bracket
if finals[0][0] not in semi_final[0] and finals[0][1] not in semi_final[0]:
    semi_final[0], semi_final[1] = semi_final[1], semi_final[0]

# Quarter-finals re-ordered so that each one sits above the semi-finalist it produced
quarter_final_2 = []
for team1, team2 in semi_final:
    quarter_final_2.append(_match_containing(team1, quarter_final))
    quarter_final_2.append(_match_containing(team2, quarter_final))

# Round-of-16 matches re-ordered the same way, relative to the quarter-finalists
round_of_16_2 = []
for team1, team2 in quarter_final_2:
    round_of_16_2.append(_match_containing(team1, round_of_16))
    round_of_16_2.append(_match_containing(team2, round_of_16))

# --- Round of 16: team names (spacing shrinks as the combined name length grows) ---
# The three cases are mutually exclusive (if / elif / else); with two separate
# `if` statements a long pairing would be printed twice.
for team1, team2 in round_of_16_2:
    if (len(team1)+len(team2)) > 16:
      print(team1+"  "+team2+"    ", end="")
    elif (len(team1)+len(team2)) > 14:
      print(team1+"   "+team2+"    ", end="")
    else:
      print(team1+"    "+team2+"     ", end="")
print("\n", end ="")
for team1, team2 in round_of_16_2:
    print("   |           |      ", end="")
print("\n", end ="")
for team1, team2 in round_of_16_2:
    print("   |_____ _____|      ", end="")
print("\n", end ="")
for team1, team2 in round_of_16_2:
    print("         |            ", end="")
print("\n", end ="")
# --- Quarter-finals ---
for team1, team2 in quarter_final_2:
    if (len(team1)+len(team2)) > 14:
      print("    "+team1+"             "+team2+"         ", end="")
    elif (len(team1)+len(team2)) > 12:
      print("      "+team1+"               "+team2+"          ", end="")
    else:
      print("      "+team1+"                "+team2+"          ", end="")

print("\n", end ="")
for team1, team2 in quarter_final_2:
    print("         |                     |            ", end="")
print("\n", end ="")
for team1, team2 in quarter_final_2:
    print("         |__________ __________|            ", end="")
print("\n", end ="")
for team1, team2 in quarter_final_2:
    print("                    |                       ", end="")
print("\n", end ="")
# --- Semi-finals ---
for team1, team2 in semi_final:
    print("                 "+team1+"                                    "+team2+"                    ", end="")
print("\n", end ="")
for team1, team2 in semi_final:
    print("                    |                                           |                       ", end="")
print("\n", end ="")
for team1, team2 in semi_final:
    print("                    |                                           |                       ", end="")
print("\n", end ="")
for team1, team2 in semi_final:
    print("                    |_____________________ _____________________|                       ", end="")
print("\n", end ="")
for team1, team2 in semi_final:
    print("                                          |                                             ", end="")
print("\n", end ="")
# --- Final and champion ---
print("                                        "+finals[0][0]+"                                                                                 "+finals[0][1], end="")
print("\n", end ="")
print("                                          |                                                                                       |", end="")
print("\n", end ="")
print("                                          |_____________________________________________ _________________________________________|", end="")
print("\n", end ="")
print("                                                                                        |                                          ", end="")
print("\n", end ="")
print("                                                                                     "+winner[0], end="")

Netherlands   Iran    Argentina   Denmark    Spain    Belgium     Brazil    Uruguay     Senegal    England     Mexico    France     Germany    Croatia     Serbia    Portugal     
   |           |         |           |         |           |         |           |         |           |         |           |         |           |         |           |      
   |_____ _____|         |_____ _____|         |_____ _____|         |_____ _____|         |_____ _____|         |_____ _____|         |_____ _____|         |_____ _____|      
         |                     |                     |                     |                     |                     |                     |                     |            
    Netherlands             Argentina               Spain                Brazil                England               France              Germany             Portugal         
         |                     |                     |                     |                     |                 

### There are many ways to check the accuracy of a classification model, such as the confusion matrix, the F1 score and ROC-AUC curves, as well as values that can be inferred from the confusion matrix: sensitivity, specificity, precision, and accuracy. However, I cannot use these for the tournament predictions above, since we do not know what the actual results are.