Imports

In [1229]:

import pandas as pd
import numpy as np

import unidecode
import re
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm
from sklearn.metrics import r2_score

#football_data.to_csv('cleaned_football_data.csv', index=False)

Read data

In [1230]:
football_data = pd.read_csv('data/futebol.csv')


In [1231]:
football_data.shape

(1564, 7)

In [1232]:
football_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1564 entries, 0 to 1563
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Data          1564 non-null   object 
 1   Jogo          1564 non-null   object 
 2   Minutos       1473 non-null   object 
 3   Investimento  1564 non-null   int64  
 4   Odd           1564 non-null   float64
 5   Ganho         1564 non-null   float64
 6   Resultado     1564 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 85.7+ KB


Translate columms to english

In [1233]:
football_data.rename(columns={
    'Data': 'Date',
    'Jogo': 'Match',
    'Minutos': 'Bet Type',
    'Investimento': 'Bet Amount',
    'Odd': 'Odds',
    'Ganho': 'Profit',
    'Resultado': 'Win/Loss'
}, inplace=True)

In [1234]:
football_data.sample(10)

Unnamed: 0,Date,Match,Bet Type,Bet Amount,Odds,Profit,Win/Loss
812,2022-08-29,Corinthains x RB,00 a 10,10,1.57,5.7,Green
1499,2022-10-26,Univ. de Chile x U. Española,Gols,20,1.5,10.0,Red
1484,2022-10-25,Athletico x Palmeiras,Gols,10,1.5,5.0,Red
1456,2022-10-22,Valência x Maiorca,Gols,20,1.5,10.0,Red
674,2022-08-22,Albion x Plaza,00 a 10,10,1.57,5.7,Green
1407,2022-10-19,Elche x Real Madrid,60 a 70,10,1.57,5.7,Green
476,2022-08-09,Viktoria x Sheriff,,10,1.53,5.3,Green
1092,2022-09-20,Once Caldas x Envigado,20 a 30,10,1.66,6.6,Green
920,2022-09-07,Deagu x Seongnam,60 a 70,10,1.57,5.7,Green
1223,2022-10-06,America x SP,20 a 30,10,1.5,5.0,Green


Checking for dublicates

In [1235]:
football_data.duplicated().sum()

22

In [1236]:
#dropping duplicates
football_data.drop_duplicates(inplace=True)

Check for missing values

In [1237]:
football_data.isnull().sum()

Date           0
Match          0
Bet Type      88
Bet Amount     0
Odds           0
Profit         0
Win/Loss       0
dtype: int64

In [1238]:
football_data['Bet Type'].unique()

array(['60 ao 70', '10 ao 20', '20 ao 30', 'Gols', '00 ao 10',
       'Escanteio', 'Vencedor', '50 ao 60', '00 a 10', '30 ao 40', nan,
       'Cartões', '70 ao 80', 'Asiáticos', '00 a 10 ', 'Multipla',
       'aos 30', '20 aos 30', '10 aos 20', '60 aos 70', '20 a 30',
       '10 a 20', '60 a 70', '80 a 90', '70 a 80', '30 a 40', '50 a 60',
       'Múltipla', '20 a 30 ', '2 marcam', 'gols', 'Mult', 'Pedro',
       '60 a 70 ', 'Gols duplo'], dtype=object)

replaced null with zero

In [1239]:
football_data.fillna(0, inplace=True)

In [1240]:
football_data['Bet Type'].value_counts()

Bet Type
00 a 10       257
20 ao 30      193
60 ao 70      174
Gols          148
10 ao 20      147
20 a 30       125
10 a 20       103
60 a 70        92
0              88
00 ao 10       85
30 a 40        22
30 ao 40       17
70 a 80        15
50 a 60        11
50 ao 60       10
70 ao 80       10
Asiáticos       8
80 a 90         8
20 a 30         5
20 aos 30       4
Multipla        4
aos 30          2
2 marcam        2
60 aos 70       1
10 aos 20       1
00 a 10         1
Cartões         1
Vencedor        1
Múltipla        1
Escanteio       1
gols            1
Mult            1
Pedro           1
60 a 70         1
Gols duplo      1
Name: count, dtype: int64

Normalize the data lowercased, stripped, and accents removed


In [1241]:
print("Unique Bet Type:", football_data['Bet Type'].nunique())


Unique Bet Type: 35


In [1242]:
def normalize_bet_type(value):
    if pd.isna(value):
        return value
    return unidecode.unidecode(str(value).strip().lower())

football_data['Bet Type'] = football_data['Bet Type'].apply(normalize_bet_type)

Standardize the time ranges from "10 ao 20" or 10 a 20 to 10-20

In [1243]:
def standardize_time_ranges(value):
    if isinstance(value, str):
        return (
            value.replace(" a ", "-")
                 .replace(" ao ", "-")
                 .replace(" aos ", "-")
                 .strip()
        )
    return value
football_data['Bet Type'] = football_data['Bet Type'].apply(standardize_time_ranges)

In [1244]:
football_data['Bet Type'].value_counts()

Bet Type
00-10         343
20-30         327
60-70         268
10-20         251
gols          149
0              88
30-40          39
70-80          25
50-60          21
asiaticos       8
80-90           8
multipla        5
2 marcam        2
aos 30          2
pedro           1
mult            1
cartoes         1
vencedor        1
escanteio       1
gols duplo      1
Name: count, dtype: int64

In [1245]:
#remove damaged data
football_data = football_data[football_data['Bet Type'] != 'Pedro']

Group rare bet types into "Others"

In [1246]:
threshold = 10

value_counts = football_data['Bet Type'].value_counts()

rare_types = value_counts[value_counts < threshold].index

def group_rare_bet_types(bet_type):
    if bet_type in rare_types:
        return 'Other'
    return bet_type

football_data['Bet Type'] = football_data['Bet Type'].apply(group_rare_bet_types)

In [1247]:
football_data['Bet Type'].value_counts()

Bet Type
00-10    343
20-30    327
60-70    268
10-20    251
gols     149
0         88
30-40     39
Other     31
70-80     25
50-60     21
Name: count, dtype: int64

In [1248]:
print("Unique Bet Type:", football_data['Bet Type'].nunique())
#reduced from 35 to 10

Unique Bet Type: 10


In [1249]:
football_data.dtypes

Date           object
Match          object
Bet Type       object
Bet Amount      int64
Odds          float64
Profit        float64
Win/Loss       object
dtype: object

splits the column "Match" to be "Home Team" and "Away Team". 

Checked sample data with https://www.football-data.org/coverage that it matches. Home and Away team

In [1250]:
#remove known bad data
football_data = football_data[football_data['Match'] != 'Múltipla 4']

# Split the 'Match' column into 'Home Team' and 'Away Team'
football_data[['Home Team', 'Away Team']] = football_data['Match'].str.split(' x ', expand=True)



#drop match column
football_data = football_data.drop(columns=['Match'])

new_order = ['Date', 'Home Team', 'Away Team', 'Bet Type', 'Bet Amount', 'Odds', 'Profit', 'Win/Loss']

#Reorder the columns
football_data = football_data[new_order]

In [1251]:
football_data[football_data['Away Team'].isnull()]

Unnamed: 0,Date,Home Team,Away Team,Bet Type,Bet Amount,Odds,Profit,Win/Loss
364,2022-08-02,Colon X Indepediente,,10-20,10,1.53,5.3,Green
655,2022-08-21,Leeds X Chelsea,,20-30,10,1.5,5.0,Green
944,2022-09-08,SP X Atl Go,,10-20,10,1.44,4.4,Green


manuel fix for the damaged data

In [1252]:

football_data['Home Team'] = football_data['Home Team'].str.lower().str.strip()

manual_fix_map = {
    'colon x indepediente': ['Colon', 'Indepediente'],
    'leeds x chelsea': ['Leeds', 'Chelsea'],
    'sp x atl go': ['SP', 'Atl Go']
}

for bad_value, (home_fixed, away_fixed) in manual_fix_map.items():
    mask = (football_data['Home Team'] == bad_value) & (football_data['Away Team'].isnull())
    football_data.loc[mask, 'Home Team'] = home_fixed
    football_data.loc[mask, 'Away Team'] = away_fixed

In [1253]:
football_data[football_data['Away Team'].isnull()]

Unnamed: 0,Date,Home Team,Away Team,Bet Type,Bet Amount,Odds,Profit,Win/Loss


Standardize the team names

In [1254]:
football_data['Home Team'].unique()

array(['inglaterra', 'ayacucho', 'lara', 'guairena', 'cordoba',
       'belgrano', 'mar', 'cruzeiro', 'col', 'brusque', 'huachipate',
       'braga', 'colorado', 'dallas', 'patchuca', 'galaxy', 'morelia',
       'boca', 'libertad', 'inter', 'sucre', 'hailstorm', 'ready',
       'argentinos', 'colon', 'ceara', 'palmeiras', 'detroit', 'mineros',
       'river', 'melgar', 'union', 'lanus', 'stalbaek', 'asane',
       'patriotas', 'selangor', 'aksur', 'forest', 'chicago', 'vila',
       'celaya', 'rosário', 'dynamo', 'portim', 'brighton', 'nyc',
       'lorenzo', 'atlanta', 'goias', 'platense', 'petrolero',
       'millionarios', 'houston', 'gualaceo', 'unido', 'pepo', 'pif',
       'rosemberg', 'kalmar', 'jorge w.', 'tolima', 'velez', 'ilves',
       'defensa', 'bagre', 'varnamo', 'guabira', 'turan', 'din', 'talaea',
       'el zamalek', 'alianza', 'banfield', 'austin', 'cimarrones',
       'cortuba', 'dc united', 'breidablik', 'operario', 'resistencia',
       'huracan', 'estudiantes', '

In [1255]:
print("Unique home teams:", football_data['Home Team'].nunique())
print("Unique away teams:", football_data['Away Team'].nunique())


Unique home teams: 784
Unique away teams: 885


In [1256]:
football_data.isnull().sum()

Date          0
Home Team     0
Away Team     0
Bet Type      0
Bet Amount    0
Odds          0
Profit        0
Win/Loss      0
dtype: int64

In [1257]:
#normalize team names
football_data['Home Team'] = football_data['Home Team'].str.lower().str.strip()
football_data['Away Team'] = football_data['Away Team'].str.lower().str.strip()



In [1258]:
team_name_map = {
    # Fix accents and formatting
    'rosário': 'rosario',
    'atlético mg': 'atl mg',
    'atlético fênix': 'atl fênix',
    'atlético madrid': 'atletico madrid',
    'universitário': 'universitario',
    'ceará': 'ceara',
    'cuiabá': 'cuiaba',
    'san lorenzo res': 'san lorenzo',
    'banfield res': 'banfield',
    'talleres res': 'talleres',
    '  gremio': 'gremio',
    'croácia': 'croatia',
    'países baixos': 'netherlands',
    'santa fé': 'santa fe',
    'atl fenix': 'atl fênix',
    'atlético go': 'atl go',
    'indepediente': 'independiente',
    'gales': 'wales',
    'del valle': 'ind. del valle',
    
    # Consolidate naming variations
    'ny rb': 'ny red bulls',
    'nyc': 'ny city',
    'ny': 'ny city',
    'charllote': 'charlotte',
    'ny red bulls': 'ny red bulls',  # unify spelling
    
    # Translate or unify other known aliases
    'zurique': 'zurich',
    'guilhermo': 'guillermo',
    'vila': 'vila nova',
    'croacia': 'croatia',
    'dinamarca': 'denmark',
    'país de gales': 'wales',
    'paises baixos': 'netherlands',
    'espanha': 'spain',
    'alemanha': 'germany',
    'frança': 'france',
    'islandia': 'iceland',
    'nikobing': 'nykobing',
}

football_data['Home Team'] = football_data['Home Team'].replace(team_name_map)
football_data['Away Team'] = football_data['Away Team'].replace(team_name_map)


In [1259]:
#remove damaged data
bad_teams = ['A', 'B', 'FF', 'PK', 'Múltipla 4']
football_data = football_data[
    ~football_data['Home Team'].isin(bad_teams) &
    ~football_data['Away Team'].isin(bad_teams)
]

In [1260]:
print("Unique home teams:", football_data['Home Team'].nunique())
print("Unique away teams:", football_data['Away Team'].nunique())


Unique home teams: 770
Unique away teams: 792


In [1261]:
football_data.sample(10)

Unnamed: 0,Date,Home Team,Away Team,Bet Type,Bet Amount,Odds,Profit,Win/Loss
133,2022-07-16,arges,arad,10-20,10,1.66,6.6,Green
557,2022-08-15,bilbao,mallorca,50-60,10,1.53,5.3,Green
1330,2022-10-12,rangers,liverpool,00-10,40,1.5,20.0,Red
14,2022-07-04,colorado,austin,00-10,10,1.53,5.3,Green
767,2022-08-26,gremio,ituano,00-10,10,1.57,5.7,Green
645,2022-08-20,celta de vigo,real madrid,00-10,10,1.61,6.1,Green
156,2022-07-16,palmaflor,aurora,20-30,10,1.57,5.7,Green
410,2022-08-05,guarani,gremio,0,10,1.5,5.0,Green
1510,2022-10-27,cerro porteño,gen caballero,gols,10,1.5,5.0,Red
1471,2022-10-22,cerro largo,peñarol,gols,10,5.0,40.0,Green


Label encoding

In [1268]:
football_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1541 entries, 0 to 1563
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        1541 non-null   object 
 1   Home Team   1541 non-null   int32  
 2   Away Team   1541 non-null   int32  
 3   Bet Amount  1541 non-null   int64  
 4   Odds        1541 non-null   float64
 5   Profit      1541 non-null   float64
 6   Win/Loss    1541 non-null   int64  
 7   Bet_0       1541 non-null   bool   
 8   Bet_00-10   1541 non-null   bool   
 9   Bet_10-20   1541 non-null   bool   
 10  Bet_20-30   1541 non-null   bool   
 11  Bet_30-40   1541 non-null   bool   
 12  Bet_50-60   1541 non-null   bool   
 13  Bet_60-70   1541 non-null   bool   
 14  Bet_70-80   1541 non-null   bool   
 15  Bet_Other   1541 non-null   bool   
 16  Bet_gols    1541 non-null   bool   
dtypes: bool(10), float64(2), int32(2), int64(2), object(1)
memory usage: 99.3+ KB


convert Win/Loss to binary

In [1263]:
football_data['Win/Loss'] = football_data['Win/Loss'].map({'Green': 1, 'Red': 0})

change Bet type to One-hot encoding

In [1264]:
football_data = pd.get_dummies(football_data, columns=['Bet Type'], prefix='Bet')

In [1265]:
football_data.head()

Unnamed: 0,Date,Home Team,Away Team,Bet Amount,Odds,Profit,Win/Loss,Bet_0,Bet_00-10,Bet_10-20,Bet_20-30,Bet_30-40,Bet_50-60,Bet_60-70,Bet_70-80,Bet_Other,Bet_gols
0,2022-07-01,inglaterra,israel,10,1.57,5.7,1,False,False,False,False,False,False,True,False,False,False
1,2022-07-01,ayacucho,cantolao,10,1.5,5.0,1,False,False,True,False,False,False,False,False,False,False
2,2022-07-01,ayacucho,cantolao,10,1.5,5.0,1,False,False,False,True,False,False,False,False,False,False
3,2022-07-01,lara,"ucv ac. 0,5 gols",10,1.61,6.1,0,False,False,False,False,False,False,False,False,False,True
4,2022-07-01,guairena,asuncion,10,1.57,5.7,1,False,False,False,True,False,False,False,False,False,False


label encode home and away team and make sure a team in Home will have the same ID in Away

In [1266]:
all_teams = pd.concat([football_data['Home Team'], football_data['Away Team']]).unique()

# Fit one LabelEncoder on all teams
team_encoder = LabelEncoder()
team_encoder.fit(all_teams)

# Apply the same encoder to both columns
football_data['Home Team'] = team_encoder.transform(football_data['Home Team'])
football_data['Away Team'] = team_encoder.transform(football_data['Away Team'])

In [1267]:
football_data.to_csv('cleaned_football_data.csv', index=False)