In [None]:
import pandas as pd
import csv

### Import raw data

In [2]:
raw_csv_path = "../data/raw.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,placement,name,points,total_games,wins,draws,loses,goals_scored,goals_conceded,goals_difference,season
0,1,Cruzeiro,100,46,31,7,8,102,47,55,2003
1,2,Santos,87,46,25,12,9,93,60,33,2003
2,3,São Paulo,78,46,22,12,12,81,67,14,2003
3,4,São Caetano,74,46,19,14,13,53,37,16,2003
4,5,Coritiba,73,46,21,10,15,67,58,9,2003


### Check DataFrame shape and infos

The number of rows will depend directly on the range of seasons that the data has. Furthermore, it's important to say that 2003 and 2004 seasons had **24 clubs**, 2005 had **22 clubs** and, since 2006, Campeonato Brasileiro has been played with **20 clubs**.

In [3]:
df.shape

(430, 11)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430 entries, 0 to 429
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   placement         430 non-null    int64 
 1   name              430 non-null    object
 2   points            430 non-null    int64 
 3   total_games       430 non-null    int64 
 4   wins              430 non-null    int64 
 5   draws             430 non-null    int64 
 6   loses             430 non-null    int64 
 7   goals_scored      430 non-null    int64 
 8   goals_conceded    430 non-null    int64 
 9   goals_difference  430 non-null    int64 
 10  season            430 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 37.1+ KB


### Check if total number of matches, goal difference and points stats are correct

In [9]:
df[df['total_games'] != df.wins+df.draws+df.loses]

Unnamed: 0,placement,name,points,total_games,wins,draws,loses,goals_scored,goals_conceded,goals_difference,season


In [10]:
df[df['goals_difference'] != df.goals_scored-df.goals_conceded]

Unnamed: 0,placement,name,points,total_games,wins,draws,loses,goals_scored,goals_conceded,goals_difference,season
19,20,Grêmio,50,46,13,11,22,54,68,-12,2003
35,12,Coritiba,62,46,15,17,14,53,48,-5,2004
41,18,São Caetano,53,46,23,8,15,65,49,15,2004


In [11]:
df['goals_difference'] = df.goals_scored-df.goals_conceded

In [12]:
df[df['goals_difference'] != df.goals_scored-df.goals_conceded]

Unnamed: 0,placement,name,points,total_games,wins,draws,loses,goals_scored,goals_conceded,goals_difference,season


In [15]:
df[df['points'] != df['wins'] * 3 + df['draws']]

Unnamed: 0,placement,name,points,total_games,wins,draws,loses,goals_scored,goals_conceded,goals_difference,season
3,4,São Caetano,74,46,19,14,13,53,37,16,2003
5,6,Internacional,72,46,20,10,16,59,57,2,2003
14,15,Corinthians,59,46,15,12,19,61,63,-2,2003
17,18,Juventude,53,46,12,14,20,55,70,-15,2003
18,19,Fluminense,52,46,13,11,22,52,77,-25,2003
20,21,Ponte Preta,50,46,11,18,17,63,73,-10,2003
21,22,Paysandu,49,46,15,12,19,74,77,-3,2003
41,18,São Caetano,53,46,23,8,15,65,49,16,2004
169,20,Grêmio Prudente,28,38,7,10,21,39,64,-25,2010
268,19,Santa Cruz,28,38,8,7,23,45,69,-24,2016


> **Obs**: The DataFrame above is correct. All of these clubs lost or gained points for off-field reasons

### Look for clubs that have changed their name, but are the same (if exists)

In [None]:
df.name.unique()

array(['Cruzeiro', 'Santos', 'São Paulo', 'São Caetano', 'Coritiba',
       'Internacional', 'Atlético Mineiro', 'Flamengo', 'Goiás', 'Paraná',
       'Figueirense', 'Atlético Paranaense', 'Guarani', 'Criciúma',
       'Corinthians', 'Vitória', 'Vasco da Gama', 'Juventude',
       'Fluminense', 'Grêmio', 'Ponte Preta', 'Paysandu', 'Fortaleza',
       'Bahia', 'Palmeiras', 'Botafogo', 'Brasiliense', 'Santa Cruz',
       'Sport', 'Náutico', 'América de Natal', 'Portuguesa', 'Ipatinga',
       'Avaí', 'Grêmio Barueri', 'Athletico Paranaense', 'Santo André',
       'Ceará', 'Atlético Goianiense', 'Grêmio Prudente',
       'América Mineiro', 'Chapecoense', 'Joinville', 'CSA',
       'Red Bull Bragantino', 'Cuiabá'], dtype=object)

##### There are two clubs that need to be refactored:
- **Atlético Paranaense** (currently is called **Athletico Paranaense**)
- **Grêmio Prudente** (In short: The club was called Grêmio Barueri, changed the name to Grêmio Prudente, but after a year changed it back to **Grêmio Barueri**)

In [22]:
df.loc[df['name'] == 'Atlético Paranaense', 'name'] = 'Athletico Paranaense'

In [23]:
df.loc[df['name'] == 'Atlético Paranaense']

Unnamed: 0,placement,name,points,total_games,wins,draws,loses,goals_scored,goals_conceded,goals_difference,season


In [24]:
df.loc[df['name'] == 'Grêmio Prudente', 'name'] = 'Grêmio Barueri'

In [25]:
df.loc[df['name'] == 'Grêmio Prudente']

Unnamed: 0,placement,name,points,total_games,wins,draws,loses,goals_scored,goals_conceded,goals_difference,season


In [26]:
df.name.unique()

array(['Cruzeiro', 'Santos', 'São Paulo', 'São Caetano', 'Coritiba',
       'Internacional', 'Atlético Mineiro', 'Flamengo', 'Goiás', 'Paraná',
       'Figueirense', 'Athletico Paranaense', 'Guarani', 'Criciúma',
       'Corinthians', 'Vitória', 'Vasco da Gama', 'Juventude',
       'Fluminense', 'Grêmio', 'Ponte Preta', 'Paysandu', 'Fortaleza',
       'Bahia', 'Palmeiras', 'Botafogo', 'Brasiliense', 'Santa Cruz',
       'Sport', 'Náutico', 'América de Natal', 'Portuguesa', 'Ipatinga',
       'Avaí', 'Grêmio Barueri', 'Santo André', 'Ceará',
       'Atlético Goianiense', 'América Mineiro', 'Chapecoense',
       'Joinville', 'CSA', 'Red Bull Bragantino', 'Cuiabá'], dtype=object)

### Calculate 'Winning Percentage' of each row and add it to the DataFrame

In [31]:
winning_percentage_series = ((df['points'] / (df['total_games'] * 3)) * 100).round(2)
df['winning_percentage'] = winning_percentage_series

In [32]:
df.head()

Unnamed: 0,placement,name,points,total_games,wins,draws,loses,goals_scored,goals_conceded,goals_difference,season,winning_percentage
0,1,Cruzeiro,100,46,31,7,8,102,47,55,2003,72.46
1,2,Santos,87,46,25,12,9,93,60,33,2003,63.04
2,3,São Paulo,78,46,22,12,12,81,67,14,2003,56.52
3,4,São Caetano,74,46,19,14,13,53,37,16,2003,53.62
4,5,Coritiba,73,46,21,10,15,67,58,9,2003,52.9


### Look for null or duplicated rows

In [46]:
df[df.isna().any(axis=1)].sum()

placement               0
name                    0
points                  0
total_games             0
wins                    0
draws                   0
loses                   0
goals_scored            0
goals_conceded          0
goals_difference        0
season                  0
winning_percentage    0.0
dtype: object

In [47]:
df[df.duplicated()].sum()

placement               0
name                    0
points                  0
total_games             0
wins                    0
draws                   0
loses                   0
goals_scored            0
goals_conceded          0
goals_difference        0
season                  0
winning_percentage    0.0
dtype: object

### Save transformed data

In [48]:
path_to_save = "../data/transformed.csv"
df.to_csv(path_to_save, index=False)