<a href="https://colab.research.google.com/github/miguelcasadinho/colab/blob/main/soccer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data cleaning

In [79]:
# Import the libraries
import numpy as np
import pandas as pd

In [80]:

# Import the dataset
dataset = pd.read_csv('games.csv')

# Filter by league
dataset = dataset[dataset['competition_id'].isin(['ES1'])]

# Ensure 'date' is in datetime format
dataset['date'] = pd.to_datetime(dataset['date'])

# Feature Engineering for Machine Learning
dataset['year'] = dataset['date'].dt.year
dataset['month'] = dataset['date'].dt.month
dataset['day'] = dataset['date'].dt.day
dataset['day_of_week'] = dataset['date'].dt.dayofweek  # Monday=0, Sunday=6

# Keep only specific columns
columns_to_keep = ['season','round','year','month','day','day_of_week','date','home_club_name','away_club_name','home_club_position','away_club_position','home_club_goals','away_club_goals']
dataset = dataset[columns_to_keep]

# Create a new column 'winner'
dataset['winner'] = dataset.apply(
    lambda row: row['home_club_name'] if row['home_club_goals'] > row['away_club_goals']
    else row['away_club_name'] if row['home_club_goals'] < row['away_club_goals']
    else 'draw', axis=1
)

# Clean the `round` column
dataset['round'] = dataset['round'].str.split('.').str[0]

# Sort by the 'date' column
dataset = dataset.sort_values(by='date').reset_index(drop=True)

# Drop the date column
dataset.drop('date', axis=1, inplace=True)

print(dataset[:5])
print(dataset.shape)

   season round  year  month  day  day_of_week  \
0    2012     1  2012      8   18            5   
1    2012     1  2012      8   18            5   
2    2012     1  2012      8   18            5   
3    2012     1  2012      8   19            6   
4    2012     1  2012      8   19            6   

                        home_club_name  \
0           Sevilla Fútbol Club S.A.D.   
1  Real Club Deportivo Mallorca S.A.D.   
2     Real Club Celta de Vigo S. A. D.   
3                Futbol Club Barcelona   
4                           Levante UD   

                                     away_club_name  home_club_position  \
0           Getafe Club de Fútbol S.A.D. Team Dubai                 5.0   
1  Reial Club Deportiu Espanyol de Barcelona S.A.D.                 4.0   
2                                         Málaga CF                16.0   
3                    Real Sociedad de Fútbol S.A.D.                 1.0   
4                    Club Atlético de Madrid S.A.D.                12.0

In [49]:
# Handling rows with NaN values
# Identify rows with NaN values
#dataset_rows_with_nan = dataset[dataset.isnull().any(axis=1)]
#print("Rows with NaN in dataset:", dataset_rows_with_nan)

# Deleting rows with NaN values
#dataset.dropna(inplace=True)

#print(dataset.shape)

In [83]:
# Part 1 - Data Preprocessing
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
print(X)
#print(y)

#Feature scaling
from sklearn.preprocessing import MinMaxScaler
# Scale 'year','month','day','day_of_week','home_club_position','away_club_position','home_club_goals','away_club_goals'
columns_to_scale = [2,3,4,5,8,9,10,11]
# Initialize the scaler
sc = MinMaxScaler(feature_range=(0, 1))
# Apply scaling
X[:,columns_to_scale] = sc.fit_transform(X[:,columns_to_scale])
#print(X)

# Encode season, round, and target variable winner
from sklearn.preprocessing import LabelEncoder
# Define columns to encode
columns_to_encode = [0, 1,]  # Season, Round

# Initialize LabelEncoder
le = LabelEncoder()

# Encode each column
for col in columns_to_encode:
    X[:, col] = le.fit_transform(X[:, col])

# Encode the target variable 'winner' (y)
y = le.fit_transform(y)
#print(X)
#print(y)

# One Hot Encoding the "home_club_name", "away_club_name" columns
from sklearn.compose import ColumnTransformer # type: ignore
from sklearn.preprocessing import OneHotEncoder # type: ignore
# Indices for 'home_club_name' and 'away_club_name'
# These indices need to be updated since we've already label encoded them
one_hot_columns = [7, 8]

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), one_hot_columns)],
    remainder='passthrough'
)

X = np.array(ct.fit_transform(X))




[[2012 '1' 2012 ... 14.0 2 1]
 [2012 '1' 2012 ... 13.0 2 1]
 [2012 '1' 2012 ... 8.0 0 1]
 ...
 [2024 '15' 2024 ... 7.0 1 1]
 [2024 '19' 2024 ... 1.0 1 5]
 [2024 '19' 2024 ... 2.0 2 1]]


ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [11]:
# Save to csv
dataset.to_csv('dataset.csv')