<a href="https://colab.research.google.com/github/miguelcasadinho/colab/blob/main/soccer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data cleaning

In [45]:
# Import the libraries
import numpy as np
import pandas as pd

In [46]:

# Import the dataset
dataset = pd.read_csv('games.csv')

# Filter by league
dataset = dataset[dataset['competition_id'].isin(['ES1'])]

# Ensure 'date' is in datetime format
dataset['date'] = pd.to_datetime(dataset['date'])

# Feature Engineering for Machine Learning
dataset['year'] = dataset['date'].dt.year
dataset['month'] = dataset['date'].dt.month
dataset['day'] = dataset['date'].dt.day
dataset['day_of_week'] = dataset['date'].dt.dayofweek  # Monday=0, Sunday=6

# Keep only specific columns
columns_to_keep = ['season','round','year','month','day','day_of_week','date','home_club_name','away_club_name','home_club_position','away_club_position','home_club_goals','away_club_goals']
dataset = dataset[columns_to_keep]

# Create a new column 'winner'
dataset['winner'] = dataset.apply(
    lambda row: row['home_club_name'] if row['home_club_goals'] > row['away_club_goals']
    else row['away_club_name'] if row['home_club_goals'] < row['away_club_goals']
    else 'draw', axis=1
)

# Clean the `round` column
dataset['round'] = dataset['round'].str.split('.').str[0]

# Sort by the 'date' column
dataset = dataset.sort_values(by='date').reset_index(drop=True)

# Drop the date column
dataset.drop('date', axis=1, inplace=True)

print(dataset.shape)

(4709, 13)


In [49]:
# Handling rows with NaN values
# Identify rows with NaN values
#dataset_rows_with_nan = dataset[dataset.isnull().any(axis=1)]
#print("Rows with NaN in dataset:", dataset_rows_with_nan)

# Deleting rows with NaN values
#dataset.dropna(inplace=True)

#print(dataset.shape)

In [72]:
# Part 1 - Data Preprocessing
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
print(X)
#print(y)

# Encode season, round and target variable winner
from sklearn.preprocessing import LabelEncoder
# Define columns to encode
columns_to_encode = [0, 1]  # Season and Round

# Initialize LabelEncoder
le = LabelEncoder()

# Encode each column
for col in columns_to_encode:
    X[:, col] = le.fit_transform(X[:, col])

# Encode the target variable 'winner' (y)
y = le.fit_transform(y)
#print(X)
#print(y)

# One Hot Encoding the "home_club_name", "away_club_name" columns
from sklearn.compose import ColumnTransformer # type: ignore
from sklearn.preprocessing import OneHotEncoder # type: ignore
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [7, 8])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)
'''
#Feature scaling
from sklearn.preprocessing import MinMaxScaler
# Define the columns to scale
columns_to_scale = [2]
# Initialize the scaler
sc = MinMaxScaler(feature_range=(0, 1))
# Apply scaling
X[:,columns_to_scale] = sc.fit_transform(X[:,columns_to_scale])
print(X)
'''

[[2012 '1' 2012 ... 14.0 2 1]
 [2012 '1' 2012 ... 13.0 2 1]
 [2012 '1' 2012 ... 8.0 0 1]
 ...
 [2024 '15' 2024 ... 7.0 1 1]
 [2024 '19' 2024 ... 1.0 1 5]
 [2024 '19' 2024 ... 2.0 2 1]]


ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [11]:
# Save to csv
dataset.to_csv('dataset.csv')