## Import Dependencies 

In [2]:
import ibis
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import datetime
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

ibis.options.interactive = True

## Loading in the data

In [61]:
matches = ibis.read('international_matches.csv')

## Understanding the dataset

We'll start by checking out the first few rows 

In [62]:
matches.head()

Next lets get a bit more detailed info about the data in each of the columns

In [63]:
matches.info()

[3m                                  Summary of                                  [0m
[3m    /Users/marlenemhangami/Desktop/visidata_ibis/international_matches_csv    [0m
[3m                                  23921 rows                                  [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
┃[1m [0m[1mName                         [0m[1m [0m┃[1m [0m[1mType                  [0m[1m [0m┃[1m [0m[1m# Nulls[0m[1m [0m┃[1m [0m[1m% Nulls[0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
│ date                          │ [1;35mDate[0m[1m([0m[33mnullable[0m=[3;92mTrue[0m[1m)[0m    │       0 │    0.00 │
│ home_team                     │ [1;35mString[0m[1m([0m[33mnullable[0m=[3;92mTrue[0m[1m)[0m  │       0 │    0.00 │
│ away_team                     │ [1;35mString[0m[1m([0m[33mnullable[0m=[3;92mTrue[0m[1m)[0m  │       0 │    0.00 │
│ home_team_continent       

Let's also get a list of all the columns that are contained in the dataset

In [64]:
matches.columns

['date',
 'home_team',
 'away_team',
 'home_team_continent',
 'away_team_continent',
 'home_team_fifa_rank',
 'away_team_fifa_rank',
 'home_team_total_fifa_points',
 'away_team_total_fifa_points',
 'home_team_score',
 'away_team_score',
 'tournament',
 'city',
 'country',
 'neutral_location',
 'shoot_out',
 'home_team_result',
 'home_team_goalkeeper_score',
 'away_team_goalkeeper_score',
 'home_team_mean_defense_score',
 'home_team_mean_offense_score',
 'home_team_mean_midfield_score',
 'away_team_mean_defense_score',
 'away_team_mean_offense_score',
 'away_team_mean_midfield_score']

In [65]:
matches = matches.execute()

## Preprocessing the data

We need to get our data into a format that the model can understand to make the predictions 

Let's start by cleaning up our data and change all the NAN values in some of the columns we want to use into 0's so that the model can use those columns as predictors too

In [66]:
matches[['home_team_goalkeeper_score',
 'away_team_goalkeeper_score',
 'home_team_mean_defense_score',
 'home_team_mean_offense_score',
 'home_team_mean_midfield_score',
 'away_team_mean_defense_score',
 'away_team_mean_offense_score',
 'away_team_mean_midfield_score']] = matches[['home_team_goalkeeper_score',
 'away_team_goalkeeper_score',
 'home_team_mean_defense_score',
 'home_team_mean_offense_score',
 'home_team_mean_midfield_score',
 'away_team_mean_defense_score',
 'away_team_mean_offense_score',
 'away_team_mean_midfield_score']].fillna(0)

In [67]:
matches

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,1993-08-08,Bolivia,Uruguay,South America,South America,59,22,0,0,3,...,No,Win,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1993-08-08,Brazil,Mexico,South America,North America,8,14,0,0,1,...,No,Draw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1993-08-08,Ecuador,Venezuela,South America,South America,35,94,0,0,5,...,No,Win,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1993-08-08,Guinea,Sierra Leone,Africa,Africa,65,86,0,0,1,...,No,Win,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1993-08-08,Paraguay,Argentina,South America,South America,67,5,0,0,1,...,No,Lose,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23916,2022-06-14,Moldova,Andorra,Europe,Europe,180,153,932,1040,2,...,No,Win,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23917,2022-06-14,Liechtenstein,Latvia,Europe,Europe,192,135,895,1105,0,...,No,Lose,0.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0
23918,2022-06-14,Chile,Ghana,South America,Africa,28,60,1526,1387,0,...,Yes,Lose,79.0,74.0,75.5,76.7,78.2,75.5,76.0,78.2
23919,2022-06-14,Japan,Tunisia,Asia,Africa,23,35,1553,1499,0,...,No,Lose,73.0,0.0,75.2,75.0,77.5,70.8,72.3,74.0


Next we want to transform the teams into numeric values that the model can understand. We also want to create a column that does the same for the days of a week in case a team plays particularly well on a specific day

In [68]:
matches['away_team_code'] = matches['away_team'].astype('category').cat.codes
matches['day_code'] = matches['date'].dt.dayofweek

In [69]:
matches

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score,away_team_code,day_code
0,1993-08-08,Bolivia,Uruguay,South America,South America,59,22,0,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,202,6
1,1993-08-08,Brazil,Mexico,South America,North America,8,14,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,124,6
2,1993-08-08,Ecuador,Venezuela,South America,South America,35,94,0,0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,205,6
3,1993-08-08,Guinea,Sierra Leone,Africa,Africa,65,86,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,166,6
4,1993-08-08,Paraguay,Argentina,South America,South America,67,5,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23916,2022-06-14,Moldova,Andorra,Europe,Europe,180,153,932,1040,2,...,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1
23917,2022-06-14,Liechtenstein,Latvia,Europe,Europe,192,135,895,1105,0,...,0.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,107,1
23918,2022-06-14,Chile,Ghana,South America,Africa,28,60,1526,1387,0,...,79.0,74.0,75.5,76.7,78.2,75.5,76.0,78.2,76,1
23919,2022-06-14,Japan,Tunisia,Asia,Africa,23,35,1553,1499,0,...,73.0,0.0,75.2,75.0,77.5,70.8,72.3,74.0,193,1


Next we want to also use the location, whether there were penalties or not to help make the prediction. We also want to be able to have a win represented as a 1 or a 0. This is what we are actaully trying to predict, so our model will use this to train and will also return a 0 or one too

In [70]:
matches.home_team_result

0         Win
1        Draw
2         Win
3         Win
4        Lose
         ... 
23916     Win
23917    Lose
23918    Lose
23919    Lose
23920     Win
Name: home_team_result, Length: 23921, dtype: object

In [71]:
matches['neutral_loc'] = (matches['neutral_location'] == True).astype('int')
matches['penalties'] = (matches['shoot_out'] == 'Yes').astype('int')
matches['target'] = (matches['home_team_result'] == 'Win').astype('int')

In [72]:
matches

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score,away_team_code,day_code,neutral_loc,penalties,target
0,1993-08-08,Bolivia,Uruguay,South America,South America,59,22,0,0,3,...,0.0,0.0,0.0,0.0,0.0,202,6,0,0,1
1,1993-08-08,Brazil,Mexico,South America,North America,8,14,0,0,1,...,0.0,0.0,0.0,0.0,0.0,124,6,0,0,0
2,1993-08-08,Ecuador,Venezuela,South America,South America,35,94,0,0,5,...,0.0,0.0,0.0,0.0,0.0,205,6,0,0,1
3,1993-08-08,Guinea,Sierra Leone,Africa,Africa,65,86,0,0,1,...,0.0,0.0,0.0,0.0,0.0,166,6,0,0,1
4,1993-08-08,Paraguay,Argentina,South America,South America,67,5,0,0,1,...,0.0,0.0,0.0,0.0,0.0,8,6,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23916,2022-06-14,Moldova,Andorra,Europe,Europe,180,153,932,1040,2,...,0.0,0.0,0.0,0.0,0.0,4,1,0,0,1
23917,2022-06-14,Liechtenstein,Latvia,Europe,Europe,192,135,895,1105,0,...,0.0,0.0,0.0,0.0,0.0,107,1,0,0,0
23918,2022-06-14,Chile,Ghana,South America,Africa,28,60,1526,1387,0,...,76.7,78.2,75.5,76.0,78.2,76,1,1,1,0
23919,2022-06-14,Japan,Tunisia,Asia,Africa,23,35,1553,1499,0,...,75.0,77.5,70.8,72.3,74.0,193,1,0,0,0


## Building the model

We're now ready to create our model, train it and get some predictions. 

`Random forest` is a type of ML model that can pick up non-linearities in the data for example for our away team code, doesn't necessarily have a linear relationship, so an away team could ne number 20 but that doesn't imply that the team is better or worse than those with a number higher or lower. They are just values for different teams. A RF model can pick that up whereas a linear model can't 

`n_estimators`, is the number of individual decision trees we want to train. A random forest is a series of decision trees but each decision tree has slightky different parameters. The higher this number is the longer it will take for the algorithm to run but potentially the more accurate it will be. 

`min_sample_split`, is the number of samples we want to have in a decision tree before splitting to a different node. The higher this is the less likely we are to overfit but the lower the accuracy could potentially be. 

`random_state`, just ensures that when you are using the same data you get the same result back. 

In [73]:
rf = RandomForestClassifier(n_estimators=50,min_samples_split=10,random_state=1)

We split the data into training and testing data based on date. All data after 2022 will be used to test, while that before will be used to train 

In [74]:
train = matches[matches['date'] < '2022-01-01']
test = matches[matches['date'] > '2022-01-01']

Next let's choose the predictors, that is which data the model will use to make the final prediction. You can choose whhatever data you want. I just used the one's that made the most sense to me. 

In [75]:
matches.columns

Index(['date', 'home_team', 'away_team', 'home_team_continent',
       'away_team_continent', 'home_team_fifa_rank', 'away_team_fifa_rank',
       'home_team_total_fifa_points', 'away_team_total_fifa_points',
       'home_team_score', 'away_team_score', 'tournament', 'city', 'country',
       'neutral_location', 'shoot_out', 'home_team_result',
       'home_team_goalkeeper_score', 'away_team_goalkeeper_score',
       'home_team_mean_defense_score', 'home_team_mean_offense_score',
       'home_team_mean_midfield_score', 'away_team_mean_defense_score',
       'away_team_mean_offense_score', 'away_team_mean_midfield_score',
       'away_team_code', 'day_code', 'neutral_loc', 'penalties', 'target'],
      dtype='object')

In [76]:
predictors = ['away_team_code', 'day_code', 'neutral_loc', 'penalties', 'home_team_goalkeeper_score', 'away_team_goalkeeper_score',
'home_team_mean_defense_score', 'home_team_mean_offense_score', 'home_team_mean_midfield_score', 'away_team_mean_defense_score',
'away_team_mean_offense_score', 'away_team_mean_midfield_score', 'home_team_fifa_rank', 'away_team_fifa_rank', 'home_team_total_fifa_points','away_team_total_fifa_points']

After that we'll go ahead and fit our model using the training data

In [79]:
rf.fit(train[predictors], train['target'])



Now, lets get our predictions from the test data

In [80]:
preds = rf.predict(test[predictors])



Preds is just an array of 0's and 1's, where 1 is a prediction the home team will win and 0 is a prediction it will lose. We can see this buy grabbing the first 10 elements from it below

In [81]:
preds[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1])

## Testing the models accuracy

Next, we'll check the accuracy of our predictions by comparing the actual target test wins and losses with the predictions. Let's first use the accuracy score to check how well our model performed.

In [82]:
acc = accuracy_score(test['target'], preds)

In [83]:
acc

0.7005253940455342

In [88]:
combined = pd.DataFrame(dict(actual=test['target'], predcition=preds))

In [89]:
pd.crosstab(index=combined['actual'], columns=combined['predcition'])

predcition,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,199,80
1,91,201


Now let's get the data for the current teams and try to predict what will happen! 

## Who will win??? Getting the prediction for the World Cup Champion

There are now 4 more games that will be played in the Fifa World Cup! 
Argentina will play against France for the championship, and Morroco and Croatia will fight for the 3rd spot. Lets write some code to get as recent as possible details about all of these teams and then make predictions.

Firtsly, I went to [Fifa's website](https://www.fifa.com/fifa-world-ranking/men?dateId=id13792) to get the most current rank and points of each team. Here they are below. 

France
- ranking = 4 
- points = 1759

Morocco
- ranking = 22
- points = 1563.5

Argentina
- ranking = 3
- points = 1773.88

Croatia
- ranking = 12
- points = 1645.64

## Isolating the teams left 

Let's get the latest home and away game for each of these finalist teams in our table and combine them into one dataframe 

In [90]:
france_home = matches[matches['home_team'] == 'France'].sort_values(['date'], ascending = False)[:1]
france_away = matches[matches['away_team'] == 'France'].sort_values(['date'], ascending = False)[:1]
morocco_home = matches[matches['home_team'] == 'Morocco'].sort_values(['date'], ascending = False)[:1]
morocco_away= matches[matches['away_team'] == 'Morocco'].sort_values(['date'], ascending = False)[:1]
argentina_home = matches[matches['home_team'] == 'Argentina'].sort_values(['date'], ascending = False)[:1]
argentina_away = matches[matches['away_team'] == 'Argentina'].sort_values(['date'], ascending = False)[:1]
croatia_home = matches[matches['home_team'] == 'Croatia'].sort_values(['date'], ascending = False)[:1]
croatia_away = matches[matches['away_team'] == 'Croatia'].sort_values(['date'], ascending = False)[:1]


In [91]:
semi_fin = france_home.append([france_away, morocco_home, morocco_away, argentina_home, argentina_away, croatia_home, croatia_away])

  semi_fin = france_home.append([france_away, morocco_home, morocco_away, argentina_home, argentina_away, croatia_home, croatia_away])


In [92]:
semi_fin

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score,away_team_code,day_code,neutral_loc,penalties,target
23885,2022-06-13,France,Croatia,Europe,Europe,3,16,1789,1621,0,...,88.3,86.8,77.8,76.7,84.2,49,0,0,0,0
23826,2022-06-10,Austria,France,Europe,Europe,34,3,1500,1789,1,...,77.0,80.8,84.2,88.3,86.8,71,4,0,0,0
23879,2022-06-13,Morocco,Liberia,Africa,Africa,24,149,1551,1050,2,...,81.7,76.2,0.0,64.0,0.0,110,0,0,0,1
23665,2022-06-01,USA,Morocco,North America,Africa,15,24,1633,1551,3,...,77.7,75.8,81.2,81.7,76.2,129,2,0,0,1
23741,2022-06-05,Argentina,Estonia,South America,Europe,4,110,1765,1169,5,...,89.0,84.0,0.0,0.0,0.0,65,6,1,0,1
23653,2022-06-01,Italy,Argentina,Europe,South America,6,4,1723,1765,0,...,85.3,84.5,82.2,89.0,84.0,8,2,1,0,0
23753,2022-06-06,Croatia,France,Europe,Europe,16,3,1621,1789,1,...,76.7,84.2,84.2,88.3,86.8,71,0,0,0,0
23885,2022-06-13,France,Croatia,Europe,Europe,3,16,1789,1621,0,...,88.3,86.8,77.8,76.7,84.2,49,0,0,0,0


## Updating the rank and points

Now lets update the fifa rank and score to what's currently on fifa's website. I'm doing this by hand because the df is fairly small

In [93]:
semi_fin['home_team_fifa_rank'] = [4,33,22,15,3,6, 16, 4]
semi_fin['home_team_total_fifa_points'] = [1759,1500,1563,1633,1765,1773,1621,1759]
semi_fin['away_team_fifa_rank'] = [16,4,149,22,110,3,4,16]
semi_fin['away_team_total_fifa_points'] = [1621,1759,1050,1563,1169,1765,1759,1621]

In [94]:
semi_fin

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score,away_team_code,day_code,neutral_loc,penalties,target
23885,2022-06-13,France,Croatia,Europe,Europe,4,16,1759,1621,0,...,88.3,86.8,77.8,76.7,84.2,49,0,0,0,0
23826,2022-06-10,Austria,France,Europe,Europe,33,4,1500,1759,1,...,77.0,80.8,84.2,88.3,86.8,71,4,0,0,0
23879,2022-06-13,Morocco,Liberia,Africa,Africa,22,149,1563,1050,2,...,81.7,76.2,0.0,64.0,0.0,110,0,0,0,1
23665,2022-06-01,USA,Morocco,North America,Africa,15,22,1633,1563,3,...,77.7,75.8,81.2,81.7,76.2,129,2,0,0,1
23741,2022-06-05,Argentina,Estonia,South America,Europe,3,110,1765,1169,5,...,89.0,84.0,0.0,0.0,0.0,65,6,1,0,1
23653,2022-06-01,Italy,Argentina,Europe,South America,6,3,1773,1765,0,...,85.3,84.5,82.2,89.0,84.0,8,2,1,0,0
23753,2022-06-06,Croatia,France,Europe,Europe,16,4,1621,1759,1,...,76.7,84.2,84.2,88.3,86.8,71,0,0,0,0
23885,2022-06-13,France,Croatia,Europe,Europe,4,16,1759,1621,0,...,88.3,86.8,77.8,76.7,84.2,49,0,0,0,0


## Create a DF that has the correct teams playing each other 

There are a couple of other things we want to update too so the data represents. lets first get all the columns with home_team info in them and away in them 

In [95]:
home_stats = semi_fin.filter(regex='home')
away_stats = semi_fin.filter(regex='away')

let's combine the data so that the relevant contries are playing against each other 

In [96]:
final_home = home_stats[(home_stats['home_team'] == 'Argentina') | (home_stats['home_team'] == 'France')]
final_away = away_stats[(away_stats['away_team'] == 'Argentina') | (away_stats['away_team'] == 'France')]

remove one row from each df because we only need the latest 2

In [97]:
final_home = final_home[:-1]
final_away = final_away[:-1]


insert the missing data into the final home dataframe, we need these columns for making predictions. 

In [98]:
dates= [pd.to_datetime('2022-12-18'), pd.to_datetime('2022-12-18')]

final_home.insert(loc=0,column='date', value=dates)
final_home['day_code'] = [6,6]
final_home['neutral_loc'] = [1,1]
final_home['penalties'] = [0,0]


We change the index so that we can join the tables together on the index and also so we can have the away and home teams being correct 

In [99]:
final_home.index =[0,1]
final_away.index =[1,0]
final_away

Unnamed: 0,away_team,away_team_continent,away_team_fifa_rank,away_team_total_fifa_points,away_team_score,away_team_goalkeeper_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score,away_team_code
1,France,Europe,4,1759,1,87.0,84.2,88.3,86.8,71
0,Argentina,South America,3,1765,3,84.0,82.2,89.0,84.0,8


join the two tables to make the finals dataframe

In [100]:
finals = pd.concat([final_home, final_away],axis=1)

## Argentina vs France

In [105]:
finals

Unnamed: 0,date,home_team,home_team_continent,home_team_fifa_rank,home_team_total_fifa_points,home_team_score,home_team_result,home_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,...,away_team,away_team_continent,away_team_fifa_rank,away_team_total_fifa_points,away_team_score,away_team_goalkeeper_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score,away_team_code
0,2022-12-18,France,Europe,4,1759,0,Lose,87.0,84.2,88.3,...,Argentina,South America,3,1765,3,84.0,82.2,89.0,84.0,8
1,2022-12-18,Argentina,South America,3,1765,5,Win,84.0,82.2,89.0,...,France,Europe,4,1759,1,87.0,84.2,88.3,86.8,71


In [106]:
finals[['home_team', 'away_team']]

Unnamed: 0,home_team,away_team
0,France,Argentina
1,Argentina,France


## Using our model to make the prediction

let's see who wins, remember our model will return a 1 if the home team wins and a 0 if it loses lets pass throu the finals df 

In [None]:
finals_preds = rf.predict(finals[predictors])

## Prediction!!!

The prediction is...

In [108]:
finals_preds

array([1, 0])

Next lets try this for the match for 3rd and 4th place 

# Morocco vs Croatia

In [109]:
final2_home = home_stats[(home_stats['home_team'] == 'Morocco') | (home_stats['home_team'] == 'Croatia')]
final2_away = away_stats[(away_stats['away_team'] == 'Morocco') | (away_stats['away_team'] == 'Croatia')]

In [None]:
final2_away = final2_away[:-1]
dates= [pd.to_datetime('2022-12-17'), pd.to_datetime('2022-12-17')]

final2_home.insert(loc=0,column='date', value=dates)
final2_home['day_code'] = [5,5]
final2_home['neutral_loc'] = [1,1]
final2_home['penalties'] = [0,0]

final2_home.index =[0,1]
final2_away.index =[0,1]
finals2 = pd.concat([final2_home, final2_away],axis=1)

In [111]:
finals2

Unnamed: 0,date,home_team,home_team_continent,home_team_fifa_rank,home_team_total_fifa_points,home_team_score,home_team_result,home_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,...,away_team,away_team_continent,away_team_fifa_rank,away_team_total_fifa_points,away_team_score,away_team_goalkeeper_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score,away_team_code
0,2022-12-17,Morocco,Africa,22,1563,2,Win,82.0,81.2,81.7,...,Croatia,Europe,16,1621,1,82.0,77.8,76.7,84.2,49
1,2022-12-17,Croatia,Europe,16,1621,1,Draw,82.0,77.8,76.7,...,Morocco,Africa,22,1563,0,82.0,81.2,81.7,76.2,129


In [112]:
finals2[['home_team', 'away_team']]

Unnamed: 0,home_team,away_team
0,Morocco,Croatia
1,Croatia,Morocco


In [113]:
finals2_preds = rf.predict(finals2[predictors])



## Prediction!!!

the prediction is...

In [114]:
finals2_preds

array([0, 1])