Imports

In [561]:

import pandas as pd
import numpy as np

import unidecode
import re

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm
from sklearn.metrics import r2_score

#football_data.to_csv('cleaned_football_data.csv', index=False)

Read data

In [562]:
football_data = pd.read_csv('data/futebol.csv')


In [563]:
football_data.shape

(1564, 7)

In [564]:
football_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1564 entries, 0 to 1563
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Data          1564 non-null   object 
 1   Jogo          1564 non-null   object 
 2   Minutos       1473 non-null   object 
 3   Investimento  1564 non-null   int64  
 4   Odd           1564 non-null   float64
 5   Ganho         1564 non-null   float64
 6   Resultado     1564 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 85.7+ KB


Translate columms to english

In [565]:
football_data.rename(columns={
    'Data': 'Date',
    'Jogo': 'Match',
    'Minutos': 'Bet Type',
    'Investimento': 'Bet Amount',
    'Odd': 'Odds',
    'Ganho': 'Profit',
    'Resultado': 'Win/Loss'
}, inplace=True)

In [566]:
football_data.sample(10)

Unnamed: 0,Date,Match,Bet Type,Bet Amount,Odds,Profit,Win/Loss
144,2022-07-16,Orense x Macara,20 ao 30,10,1.61,6.1,Green
7,2022-07-01,Belgrano x Almagro,60 ao 70,5,1.53,2.65,Green
797,2022-08-27,Flu x Palmeiras,Gols,10,1.5,5.0,Red
475,2022-08-09,Suburbs x Gold Coast,,10,1.61,6.1,Green
1290,2022-10-11,Guarani x CRB,00 a 10,10,1.53,5.3,Green
1303,2022-10-11,Copenhagen x City,20 a 30,10,1.57,5.7,Green
828,2022-08-31,Al-Arab Doha x Al Duhail,20 aos 30,10,1.62,6.2,Green
636,2022-08-20,Gijón x Andorra,20 ao 30,10,1.72,7.2,Green
104,2022-07-13,Belgrano x Estudiantes,10 ao 20,10,1.57,5.7,Green
57,2022-07-09,Dynamo x Borussia,10 ao 20,10,1.53,5.3,Green


Checking for dublicates

In [567]:
football_data.duplicated().sum()

22

In [568]:
#dropping duplicates
football_data.drop_duplicates(inplace=True)

Check for missing values

In [569]:
football_data.isnull().sum()

Date           0
Match          0
Bet Type      88
Bet Amount     0
Odds           0
Profit         0
Win/Loss       0
dtype: int64

In [570]:
football_data['Bet Type'].unique()

array(['60 ao 70', '10 ao 20', '20 ao 30', 'Gols', '00 ao 10',
       'Escanteio', 'Vencedor', '50 ao 60', '00 a 10', '30 ao 40', nan,
       'Cartões', '70 ao 80', 'Asiáticos', '00 a 10 ', 'Multipla',
       'aos 30', '20 aos 30', '10 aos 20', '60 aos 70', '20 a 30',
       '10 a 20', '60 a 70', '80 a 90', '70 a 80', '30 a 40', '50 a 60',
       'Múltipla', '20 a 30 ', '2 marcam', 'gols', 'Mult', 'Pedro',
       '60 a 70 ', 'Gols duplo'], dtype=object)

replaced null with zero

In [571]:
football_data.fillna(0, inplace=True)

In [572]:
football_data['Bet Type'].value_counts()

Bet Type
00 a 10       257
20 ao 30      193
60 ao 70      174
Gols          148
10 ao 20      147
20 a 30       125
10 a 20       103
60 a 70        92
0              88
00 ao 10       85
30 a 40        22
30 ao 40       17
70 a 80        15
50 a 60        11
50 ao 60       10
70 ao 80       10
Asiáticos       8
80 a 90         8
20 a 30         5
20 aos 30       4
Multipla        4
aos 30          2
2 marcam        2
60 aos 70       1
10 aos 20       1
00 a 10         1
Cartões         1
Vencedor        1
Múltipla        1
Escanteio       1
gols            1
Mult            1
Pedro           1
60 a 70         1
Gols duplo      1
Name: count, dtype: int64

Normalize the data lowercased, stripped, and accents removed


In [573]:
print("Unique Bet Type:", football_data['Bet Type'].nunique())


Unique Bet Type: 35


In [574]:
def normalize_bet_type(value):
    if pd.isna(value):
        return value
    return unidecode.unidecode(str(value).strip().lower())

football_data['Bet Type'] = football_data['Bet Type'].apply(normalize_bet_type)

Standardize the time ranges from "10 ao 20" or 10 a 20 to 10-20

In [575]:
def standardize_time_ranges(value):
    if isinstance(value, str):
        return (
            value.replace(" a ", "-")
                 .replace(" ao ", "-")
                 .replace(" aos ", "-")
                 .strip()
        )
    return value
football_data['Bet Type'] = football_data['Bet Type'].apply(standardize_time_ranges)

In [576]:
football_data['Bet Type'].value_counts()

Bet Type
00-10         343
20-30         327
60-70         268
10-20         251
gols          149
0              88
30-40          39
70-80          25
50-60          21
asiaticos       8
80-90           8
multipla        5
2 marcam        2
aos 30          2
pedro           1
mult            1
cartoes         1
vencedor        1
escanteio       1
gols duplo      1
Name: count, dtype: int64

In [577]:
#remove damaged data
football_data = football_data[football_data['Bet Type'] != 'Pedro']

Group rare bet types into "Others"

In [578]:
threshold = 10

value_counts = football_data['Bet Type'].value_counts()

rare_types = value_counts[value_counts < threshold].index

def group_rare_bet_types(bet_type):
    if bet_type in rare_types:
        return 'Other'
    return bet_type

football_data['Bet Type'] = football_data['Bet Type'].apply(group_rare_bet_types)

In [579]:
football_data['Bet Type'].value_counts()

Bet Type
00-10    343
20-30    327
60-70    268
10-20    251
gols     149
0         88
30-40     39
Other     31
70-80     25
50-60     21
Name: count, dtype: int64

In [580]:
print("Unique Bet Type:", football_data['Bet Type'].nunique())
#reduced from 35 to 10

Unique Bet Type: 10


In [581]:
football_data.dtypes

Date           object
Match          object
Bet Type       object
Bet Amount      int64
Odds          float64
Profit        float64
Win/Loss       object
dtype: object

splits the column "Match" to be "Home Team" and "Away Team". 

Checked sample data with https://www.football-data.org/coverage that it matches. Home and Away team

In [582]:
# Preview rows where 'x' might be missing
football_data[~football_data['Match'].str.contains('x', case=False, na=False)]


Unnamed: 0,Date,Match,Bet Type,Bet Amount,Odds,Profit,Win/Loss
1216,2022-10-06,Múltipla 4,Other,10,2.75,17.5,Green


In [583]:
#remove damaged data
#football_data = football_data[football_data['Match'] != 'Múltipla 4']

#football_data[['Home Team', 'Away Team']] = football_data['Match'].str.split(r'\s*x\s*', expand=True)

# Drop match column
#football_data = football_data.drop(columns=['Match'])

# Reorder the columns
#new_order = ['Date', 'Home Team', 'Away Team', 'Bet Type', 'Bet Amount', 'Odds', 'Profit', 'Win/Loss']
##football_data = football_data[new_order]


In [584]:
football_data[['Home Team', 'Away Team']] = football_data['Match'].str.split(' x ', expand=True)
#football_data[['Home Team', 'Away Team']] = football_data['Match'].str.split(' X ', expand=True)


#drop match column
football_data = football_data.drop(columns=['Match'])

new_order = ['Date', 'Home Team', 'Away Team', 'Bet Type', 'Bet Amount', 'Odds', 'Profit', 'Win/Loss']

#Reorder the columns
football_data = football_data[new_order]

Standardize the team names

In [585]:
football_data[['Home Team', 'Away Team']].isnull().sum()


Home Team    0
Away Team    4
dtype: int64

In [586]:
football_data['Home Team'].unique()

array(['Inglaterra', 'Ayacucho', 'Lara', 'Guairena', 'Cordoba',
       'Belgrano', 'Mar', 'Cruzeiro', 'Col', 'Brusque', 'Huachipate',
       'Braga', 'Colorado', 'Dallas', 'Patchuca', 'Galaxy', 'Morelia',
       'Boca', 'Libertad', 'Inter', 'Sucre', 'Hailstorm', 'Ready',
       'Argentinos', 'Colon', 'Ceara', 'Palmeiras', 'Detroit', 'Mineros',
       'River', 'Melgar', 'Union', 'Lanus', 'Stalbaek', 'Asane',
       'Patriotas', 'Selangor', 'Aksur', 'Forest', 'Chicago', 'Vila',
       'Celaya', 'Rosário', 'Dynamo', 'Portim', 'Brighton', 'NYC',
       'Lorenzo', 'Atlanta', 'Goias', 'Platense', 'Petrolero',
       'Millionarios', 'Houston', 'Gualaceo', 'Unido', 'Pepo', 'PIF',
       'Rosemberg', 'Kalmar', 'Jorge W.', 'Tolima', 'Velez', 'Ilves',
       'Defensa', 'Bagre', 'Varnamo', 'Guabira', 'Turan', 'Din', 'Talaea',
       'El Zamalek', 'Alianza', 'Banfield', 'Austin', 'Cimarrones',
       'Cortuba', 'DC United', 'Breidablik', 'Operario', 'Resistencia',
       'Huracan', 'Estudiantes', '

In [587]:
print("Unique home teams:", football_data['Home Team'].nunique())
print("Unique away teams:", football_data['Away Team'].nunique())


Unique home teams: 790
Unique away teams: 885


In [588]:
football_data.isnull().sum()

Date          0
Home Team     0
Away Team     4
Bet Type      0
Bet Amount    0
Odds          0
Profit        0
Win/Loss      0
dtype: int64

In [589]:
#normalize team names
football_data['Home Team'] = football_data['Home Team'].str.lower().str.strip()
football_data['Away Team'] = football_data['Away Team'].str.lower().str.strip()



In [590]:
team_name_map = {
    # Fix accents and formatting
    'rosário': 'rosario',
    'atlético mg': 'atl mg',
    'atlético fênix': 'atl fênix',
    'atlético madrid': 'atletico madrid',
    'universitário': 'universitario',
    'ceará': 'ceara',
    'cuiabá': 'cuiaba',
    'san lorenzo res': 'san lorenzo',
    'banfield res': 'banfield',
    'talleres res': 'talleres',
    
    # Consolidate naming variations
    'ny rb': 'ny red bulls',
    'nyc': 'ny city',
    'ny': 'ny city',
    'charllote': 'charlotte',
    'ny red bulls': 'ny red bulls',  # unify spelling
    
    # Remove garbage/mistakes
    ##'a': None,
    '  gremio': 'gremio',
    ##'múltipla 4': 'multiple',

    # Translate or unify other known aliases
    'zurique': 'zurich',
    'guilhermo': 'guillermo',
    'vila': 'vila nova',
    'croacia': 'croatia',
    'dinamarca': 'denmark',
    'país de gales': 'wales',
    'paises baixos': 'netherlands',
    'espanha': 'spain',
    'alemanha': 'germany',
    'frança': 'france',
    'islandia': 'iceland',
    'nikobing': 'nykobing',
}
# Apply the mapping to the 'Home Team' and 'Away Team' columns
football_data['Home Team'] = football_data['Home Team'].replace(team_name_map)
football_data['Away Team'] = football_data['Away Team'].replace(team_name_map)


In [591]:
#remove damaged data
bad_teams = ['A', 'B', 'FF', 'PK', 'Múltipla 4']
football_data = football_data[
    ~football_data['Home Team'].isin(bad_teams) &
    ~football_data['Away Team'].isin(bad_teams)
]

In [592]:
print("Unique home teams:", football_data['Home Team'].nunique())
print("Unique away teams:", football_data['Away Team'].nunique())


Unique home teams: 775
Unique away teams: 795


In [593]:
football_data.isnull().sum()

Date          0
Home Team     0
Away Team     4
Bet Type      0
Bet Amount    0
Odds          0
Profit        0
Win/Loss      0
dtype: int64

In [594]:
football_data[football_data['Away Team'].isnull()]

Unnamed: 0,Date,Home Team,Away Team,Bet Type,Bet Amount,Odds,Profit,Win/Loss
364,2022-08-02,colon x indepediente,,10-20,10,1.53,5.3,Green
655,2022-08-21,leeds x chelsea,,20-30,10,1.5,5.0,Green
944,2022-09-08,sp x atl go,,10-20,10,1.44,4.4,Green
1216,2022-10-06,múltipla 4,,Other,10,2.75,17.5,Green
