<a href="https://colab.research.google.com/github/miguelcasadinho/colab/blob/main/soccer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data cleaning

In [1]:
# Import the libraries
import numpy as np
import pandas as pd

In [None]:

# Load the raw match data
dataset = pd.read_csv('games.csv')

# Filter by league: keep only rows whose competition_id is 'ES1'
dataset = dataset[dataset['competition_id'].isin(['ES1'])]

# Keep only specific columns.
# `.copy()` makes the subset an independent frame, so the column assignments
# below modify this frame only and do not trigger SettingWithCopyWarning.
columns_to_keep = ['season','round','date','home_club_name','away_club_name','home_club_position','away_club_position','home_club_goals','away_club_goals']
dataset = dataset[columns_to_keep].copy()

# Create a new column 'winner': the home club's name when it scored more,
# the away club's name when it scored more, otherwise the literal 'draw'.
# Vectorised with np.select instead of a row-wise apply: same labels, but it
# also works on an empty selection (apply(axis=1) returns an empty DataFrame
# there, which cannot be assigned to a single column).
# Note: rows with a missing goal count compare False both ways and therefore
# fall through to 'draw', exactly as the row-wise version did.
home_goals = dataset['home_club_goals']
away_goals = dataset['away_club_goals']
dataset['winner'] = np.select(
    [home_goals > away_goals, home_goals < away_goals],
    [dataset['home_club_name'], dataset['away_club_name']],
    default='draw',
)

# Clean the `round` column: keep only the text before the first '.'
# (the result is still a string, not a number)
dataset['round'] = dataset['round'].str.split('.').str[0]

# Ensure 'date' is in datetime format
dataset['date'] = pd.to_datetime(dataset['date'])

# Sort by the 'date' column and renumber the rows from 0.
# A stable sort keeps matches played on the same date in their incoming
# order, so the row order is reproducible from run to run.
dataset = dataset.sort_values(by='date', kind='mergesort').reset_index(drop=True)

print(dataset.shape)

In [None]:
# Missing-value report
# Build a per-row flag that is True when at least one column is missing,
# then select exactly those rows for inspection.
dataset_rows_with_nan = dataset.loc[dataset.isna().any(axis='columns')]
print("Rows with NaN in dataset:", dataset_rows_with_nan)

# The affected rows are only reported here; dropping them is left disabled.
#dataset.dropna(inplace=True)

#print(dataset.shape)

In [None]:
# Part 1 - Data Preprocessing
# Features: every column except the last one. Target: the last column.
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
# NOTE(review): X still contains 'home_club_goals' and 'away_club_goals', and
# the 'winner' target is computed directly from those two columns. Confirm
# whether they should be removed from the features before training a model.
#print(X)
#print(y)

# One Hot Encoding the "home_club_name", "away_club_name" columns
from sklearn.compose import ColumnTransformer # type: ignore
from sklearn.preprocessing import OneHotEncoder # type: ignore

# Look the column positions up by name instead of hard-coding them. The
# previous hard-coded [4] selected only 'away_club_name', so
# 'home_club_name' was passed through as raw strings.
feature_names = dataset.columns[:-1]
club_name_columns = [
    feature_names.get_loc(name)
    for name in ('home_club_name', 'away_club_name')
]

# sparse_threshold=0 forces a dense result. The passthrough columns include
# non-numeric values (e.g. the 'round' strings and the dates), which cannot be
# stacked into a sparse matrix, and np.array() around a sparse matrix would
# yield a 0-d object array rather than a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), club_name_columns)],
    remainder='passthrough',
    sparse_threshold=0,
)
# The one-hot columns come first in the output, followed by the passthrough
# columns in their original order.
X = np.array(ct.fit_transform(X))
# print(X)

In [11]:
# Persist the cleaned frame as 'dataset.csv' in the working directory.
# The row index is written too (pandas' default), as an unnamed first column.
dataset.to_csv(path_or_buf='dataset.csv')