In [1]:
# Utils
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.preprocessing import MinMaxScaler
import warnings
from pathlib import Path

# 
import keras
import tensorflow as tf
from tensorflow import keras

##
warnings.filterwarnings("ignore")

In [2]:
# Configs
sample_data_path = Path('../data/01_samples')
data_path =  Path('../data/02_outputs')
model_path = Path('../models/rnn_v1')

In [3]:
# Funções
def create_teams_df(data):
        
    data_home = data[['year','home_team_name','home_n_participation','year_n']]
    data_away = data[['year','away_team_name','away_n_participation','year_n']]
    data_home.rename(columns={'home_team_name':'team', 'home_n_participation':'n_participation'}, inplace=True)
    data_away.rename(columns={'away_team_name':'team', 'away_n_participation':'n_participation'}, inplace=True)
    
    data_final = pd.concat([data_home, data_away], ignore_index=True)
    data_final.drop_duplicates(inplace=True)
    
    return data_final

## Tratamento de dados

# Função para padronizar os nomes entre as bases de partidas e ranking ECO
country_dict = {'North Ireland':'Northern Ireland',
                'Qatar':'United Arab Emirates',
                'Bosnia/Herzeg':'Bosnia and Herzegovina',
                'Czechia':'Czech Republic',
                'Ireland':'Republic of Ireland',
                'Trinidad/Tob': 'Trinidad and Tobago',
                'Serbia/Mont':'Serbia and Montenegro'}

def fix_country_name(data):
    
    if data['team'] in country_dict.keys():
        return country_dict[data['team']]
        
    else:
        return data['team']
    
def fix_missing_score(data, feature):
    
    base_col = '{f}_team_name'
    
    if data[base_col.format(f=feature)] == 'West Germany' and data['year'] == 1990:
        return 2082
    
    elif data[base_col.format(f=feature)] == 'Serbia and Montenegro' and data['year'] == 2006:
        return 1690
    
    else:
        return data['{f}_score'.format(f=feature)]

def fix_missing_rank(data, feature):
    
    base_col = '{f}_team_name'
    
    if data[base_col.format(f=feature)] == 'West Germany' and data['year'] == 1990:
        return 1
    
    elif data[base_col.format(f=feature)] == 'Serbia and Montenegro' and data['year'] == 2006:
        return 40
    
    else:
        return data['{f}_rank'.format(f=feature)]

In [4]:
# Carregando os dados e Preparando os dados
df = pd.read_csv(data_path / '01_matches_eco_score.csv')

df.drop(['match_date', 'best_rank_winner', 'team_score_diff',
         'draw', 'home_winner','draw', 'away_winner'],
         axis=1, inplace=True)

df['year_n'] = df['year']
df.dropna(inplace=True)
df.head()

Unnamed: 0,home_team_name,away_team_name,home_team_score,away_team_score,year,home_rank,home_score,away_rank,away_score,home_n_participation,away_n_participation,rank_diff,score_diff,year_n
0,France,Mexico,4,1,1930,35.0,1566.0,41.0,1497.0,0,0,-6.0,69.0,1930
1,United States,Belgium,3,0,1930,14.0,1762.0,31.0,1593.0,0,0,-17.0,169.0,1930
2,Yugoslavia,Brazil,2,1,1930,25.0,1658.0,9.0,1890.0,0,0,16.0,-232.0,1930
3,Romania,Peru,3,1,1930,37.0,1544.0,42.0,1496.0,0,0,-5.0,48.0,1930
4,Argentina,France,1,0,1930,1.0,2062.0,35.0,1566.0,0,0,-34.0,496.0,1930


In [5]:
df = df[['year','home_team_name','away_team_name','home_n_participation','away_n_participation','year_n']]

In [6]:
df = create_teams_df(df)

In [7]:
# adicionando os dados do ranking
df_elo_ranking = pd.read_csv(sample_data_path / 'scraped/elo_scores.csv')
df_elo_ranking['team'] = df_elo_ranking.apply(fix_country_name, axis=1)


df = df.merge(df_elo_ranking, on=['team', 'year'], how='left')

In [8]:
for col in ['year_n', 'n_participation','rank','score']:
        
    if col == 'year_n':
        df[col] = (df[col] - 1930) / (2022 - 1930)
            
    else:
        df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

In [9]:
df_2014 = df[df['year'] == 2014]
df_2018 = df[df['year'] == 2018]

In [10]:
## Simulação Copa 2014

grupo_a = ['Brazil', 'Mexico', 'Croatia','Cameroon']
grupo_e = ['France', 'Switzerland','Ecuador','Honduras']
grupo_b = ['Netherlands','Chile','Spain','Australia']
grupo_f = ['Argentina','Nigeria','Bosnia and Herzegovina','Iran']
grupo_c = ['Colombia','Greece','Ivory Coast','Japan']
grupo_g = ['Germany','United States','Portugal','Ghana']
grupo_d = ['Costa Rica','Uruguay','Italy','England']
grupo_h = ['Belgium','Algeria','Russia','South Korea']

In [143]:
def fase_grupos(grupo, data, model):
    
    pontos_time_1 = 0
    pontos_time_2 = 0
    pontos_time_3 = 0
    pontos_time_4 = 0
     
    for i in range(len(grupo)-1):
        for j in range(i+1,len(grupo)):
            dados_partida = list()

            time_1 = data[data['team'] == grupo[i]]
            time_1.reset_index(drop=True, inplace=True)
            time_2 = data[data['team'] == grupo[i]]
            time_2.reset_index(drop=True, inplace=True)
            
            dados_partida.append(time_1['year_n'][0])
            dados_partida.append(time_1['rank'][0])
            dados_partida.append(time_1['score'][0])
            dados_partida.append(time_2['rank'][0])
            dados_partida.append(time_2['score'][0])
            dados_partida.append(time_1['n_participation'][0])
            dados_partida.append(time_2['n_participation'][0])
            dados_partida.append((time_1['rank'][0] - time_2['rank'][0]))
            dados_partida.append((time_1['score'][0] - time_2['score'][0]))
            
            predictions = model.predict(np.array([dados_partida]))
            
            time_a = predictions[0]
            time_b = predictions[1]
            
            time_a = [np.round(x) for x in time_a]
            time_b = [np.round(x) for x in time_b]
            
            print(grupo[i], time_a[0][0], " X ", time_b[0][0], grupo[j])
            
            if i == 0:
                if j == 1:
                    if time_a[0] > time_b[0]:
                        pontos_time_1 += 3
                        
                    if time_a[0] == time_b[0]:
                        pontos_time_1 += 1
                        pontos_time_2 += 1
                        
                    if time_a[0][0] < time_b[0][0]:
                        pontos_time_2 += 3
                        
                if j == 2:
                    if time_a[0][0] > time_b[0][0]:
                        pontos_time_1 += 3
                        
                    if time_a[0][0] == time_b[0][0]:
                        pontos_time_1 += 1
                        pontos_time_3 += 1
                        
                    if time_a[0][0] < time_b[0][0]:
                        pontos_time_3 += 3
                        
                if j == 3:
                    if time_a[0][0] > time_b[0][0]:
                        pontos_time_1 += 3
                        
                    if time_a[0][0] == time_b[0][0]:
                        pontos_time_1 += 1
                        pontos_time_4 += 1
                        
                    if time_a[0][0] < time_b[0][0]:
                        pontos_time_3 += 3
                        
            if i == 1:
                        
                if j == 2:
                    if time_a[0][0] > time_b[0][0]:
                        pontos_time_2 += 3
                        
                    if time_a[0][0] == time_b[0][0]:
                        pontos_time_2 += 1
                        pontos_time_3 += 1
                        
                    if time_a[0][0] < time_b[0][0]:
                        pontos_time_3 += 3
                        
                if j == 3:
                    if time_a[0][0] > time_b[0][0]:
                        pontos_time_2 += 3
                        
                    if time_a[0][0] == time_b[0][0]:
                        pontos_time_2 += 1
                        pontos_time_4 += 1
                        
                    if time_a[0][0] < time_b[0][0]:
                        pontos_time_4 += 3
                        
            if i == 2:
                        
                if time_a[0][0] > time_b[0][0]:
                    pontos_time_3 += 3

                if time_a[0][0] == time_b[0][0]:
                    pontos_time_3 += 1
                    pontos_time_4 += 1

                if time_a[0][0] < time_b[0][0]:
                    pontos_time_4 += 3
                        
    return pontos_time_1, pontos_time_2, pontos_time_3, pontos_time_4

In [227]:
test_model = tf.keras.models.load_model(model_path / 'model_256n_32b_0.5lr.h5')

fase_grupos(grupo_h, df_2014, test_model)

Belgium 2.0  X  1.0 Algeria
Belgium 2.0  X  1.0 Russia
Belgium 2.0  X  1.0 South Korea
Algeria 2.0  X  1.0 Russia
Algeria 2.0  X  1.0 South Korea
Russia 2.0  X  2.0 South Korea


(9, 6, 1, 1)

In [226]:
best_model = ['model_256n_32b_0.5lr.h5']