<a href="https://colab.research.google.com/github/miguelcasadinho/colab/blob/main/soccer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

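## Data cleaning

Load `games.csv`, keep the matches of competition `ES1`, derive a `winner` column from the goals scored, and save the cleaned table to `dataset.csv`.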

In [1]:
# Import the libraries
import numpy as np
import pandas as pd

In [10]:

# Import the dataset; the raw frame keeps its own name and `dataset` is the
# cleaned result built from it
games_raw = pd.read_csv('games.csv')

# Keep only specific columns ('winner' is derived below and appended last)
columns_to_keep = ['season','round','date','home_club_name','away_club_name','home_club_position','away_club_position','home_club_goals','away_club_goals']

# Filter by league and select the columns in one .loc call.
# .copy() makes `dataset` an independent frame, so the column assignments
# below do not write into a slice of `games_raw` (SettingWithCopyWarning).
dataset = games_raw.loc[games_raw['competition_id'].isin(['ES1']), columns_to_keep].copy()

# Create a new column 'winner': home club name, away club name, or 'draw'.
# Vectorised comparisons replace the row-wise apply(); rows where neither side
# has strictly more goals (which includes missing goal values, since
# comparisons with NaN are False) are labelled 'draw', and the result is
# well defined for an empty frame too.
home_win = dataset['home_club_goals'] > dataset['away_club_goals']
away_win = dataset['home_club_goals'] < dataset['away_club_goals']
dataset['winner'] = dataset['home_club_name'].where(
    home_win,
    dataset['away_club_name'].where(away_win, 'draw'),
)

# Clean the `round` column: keep only the text before the first '.'
# (the values stay strings)
dataset['round'] = dataset['round'].str.split('.', n=1).str[0]

# Ensure 'date' is in datetime format
dataset['date'] = pd.to_datetime(dataset['date'])

# Sort by the 'date' column and renumber the rows from 0
dataset = dataset.sort_values(by='date').reset_index(drop=True)

print(dataset.shape)

(4709, 10)


In [6]:
# Handling rows with NaN values
# Flag every row that has at least one missing value in any column, then
# pull those rows out so they can be inspected.
has_missing = dataset.isna().any(axis='columns')
dataset_rows_with_nan = dataset.loc[has_missing]
print("Rows with NaN in dataset:", dataset_rows_with_nan)

# Optional step, left disabled: drop the rows flagged above
#dataset.dropna(inplace=True)

#print(dataset.shape)

Rows with NaN in dataset: Empty DataFrame
Columns: [season, round, date, home_club_name, away_club_name, home_club_position, away_club_position, home_club_goals, away_club_goals, winner]
Index: []


In [12]:
# Part 1 - Data Preprocessing
# Positional split: every column except the last becomes the feature matrix,
# the last column becomes the target vector.
# NOTE(review): the cleaning cell appends 'winner' last, so y is the winner
# label; X still holds home_club_goals / away_club_goals, from which 'winner'
# is derived — confirm those are removed before fitting a model.
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X = feature_frame.values
y = target_column.values
#print(X)
#print(y)

[[2012 '1' Timestamp('2012-08-18 00:00:00') ... 14.0 2 1]
 [2012 '1' Timestamp('2012-08-18 00:00:00') ... 13.0 2 1]
 [2012 '1' Timestamp('2012-08-18 00:00:00') ... 8.0 0 1]
 ...
 [2024 '15' Timestamp('2024-12-02 00:00:00') ... 7.0 1 1]
 [2024 '19' Timestamp('2024-12-03 00:00:00') ... 1.0 1 5]
 [2024 '19' Timestamp('2024-12-04 00:00:00') ... 2.0 2 1]]


In [11]:
# Save to csv: write the cleaned frame to 'dataset.csv' in the working directory.
# NOTE(review): with the to_csv defaults the row index is written as an extra
# unnamed first column — pass index=False if readers of the file should not
# see it.
dataset.to_csv(path_or_buf='dataset.csv')