<a href="https://colab.research.google.com/github/miguelcasadinho/colab/blob/main/soccer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data cleaning

In [24]:
# Import the libraries
import numpy as np
import pandas as pd

In [39]:

# Import the dataset
dataset = pd.read_csv('games.csv')

# Filter by league
dataset = dataset[dataset['competition_id'].isin(['ES1'])]

# Ensure 'date' is in datetime format
dataset['date'] = pd.to_datetime(dataset['date'])

# Feature Engineering for Machine Learning
dataset['year'] = dataset['date'].dt.year
dataset['month'] = dataset['date'].dt.month
dataset['day'] = dataset['date'].dt.day
dataset['day_of_week'] = dataset['date'].dt.dayofweek  # Monday=0, Sunday=6

# Keep only specific columns
columns_to_keep = ['season','round','year','month','day','day_of_week','date','home_club_name','away_club_name','home_club_position','away_club_position','home_club_goals','away_club_goals']
dataset = dataset[columns_to_keep]

# Create a new column 'winner'
dataset['winner'] = dataset.apply(
    lambda row: row['home_club_name'] if row['home_club_goals'] > row['away_club_goals']
    else row['away_club_name'] if row['home_club_goals'] < row['away_club_goals']
    else 'draw', axis=1
)

# Clean the `round` column
dataset['round'] = dataset['round'].str.split('.').str[0]

# Sort by the 'date' column
dataset = dataset.sort_values(by='date').reset_index(drop=True)

# Drop the date column
dataset.drop('date', axis=1, inplace=True)

print(dataset.shape)

(4709, 13)


In [27]:
# Handling rows with NaN values
# Identify rows with NaN values
dataset_rows_with_nan = dataset[dataset.isnull().any(axis=1)]
print("Rows with NaN in dataset:", dataset_rows_with_nan)

# Deleting rows with NaN values
#dataset.dropna(inplace=True)

#print(dataset.shape)

Rows with NaN in dataset: Empty DataFrame
Columns: [season, round, year, month, home_club_name, away_club_name, home_club_position, away_club_position, home_club_goals, away_club_goals, winner]
Index: []


In [34]:
# Part 1 - Data Preprocessing
#Feature scaling
sc = MinMaxScaler(feature_range = (0, 1))



X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
#print(X)
#print(y)

# One Hot Encoding the "home_club_name", "away_club_name" columns
from sklearn.compose import ColumnTransformer # type: ignore
from sklearn.preprocessing import OneHotEncoder # type: ignore
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [4, 5])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)


  (0, 26)	1.0
  (0, 42)	1.0
  (0, 64)	2012.0
  (0, 65)	1.0
  (0, 66)	2012.0
  (0, 67)	8.0
  (0, 68)	5.0
  (0, 69)	14.0
  (0, 70)	2.0
  (0, 71)	1.0
  (1, 18)	1.0
  (1, 55)	1.0
  (1, 64)	2012.0
  (1, 65)	1.0
  (1, 66)	2012.0
  (1, 67)	8.0
  (1, 68)	4.0
  (1, 69)	13.0
  (1, 70)	2.0
  (1, 71)	1.0
  (2, 17)	1.0
  (2, 46)	1.0
  (2, 64)	2012.0
  (2, 65)	1.0
  (2, 66)	2012.0
  :	:
  (4706, 67)	12.0
  (4706, 68)	11.0
  (4706, 69)	7.0
  (4706, 70)	1.0
  (4706, 71)	1.0
  (4707, 18)	1.0
  (4707, 41)	1.0
  (4707, 64)	2024.0
  (4707, 65)	19.0
  (4707, 66)	2024.0
  (4707, 67)	12.0
  (4707, 68)	6.0
  (4707, 69)	1.0
  (4707, 70)	1.0
  (4707, 71)	5.0
  (4708, 0)	1.0
  (4708, 51)	1.0
  (4708, 64)	2024.0
  (4708, 65)	19.0
  (4708, 66)	2024.0
  (4708, 67)	12.0
  (4708, 68)	4.0
  (4708, 69)	2.0
  (4708, 70)	2.0
  (4708, 71)	1.0


In [11]:
# Save to csv
dataset.to_csv('dataset.csv')