In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Load the dataset
file_path = '/content/matches.csv'
data = pd.read_csv(file_path)

In [None]:
# Step 1: Data Cleaning
# Drop irrelevant or unused columns
columns_to_drop = ['Unnamed: 0', 'notes', 'match report']
data = data.drop(columns=columns_to_drop)

In [None]:
data

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,formation,referee,sh,sot,dist,fk,pk,pkatt,season,team
0,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,4-3-3,Anthony Taylor,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
1,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,4-3-3,Graham Scott,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
2,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,4-3-3,Martin Atkinson,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
3,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,4-3-3,Paul Tierney,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
4,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,4-3-3,Jonathan Moss,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,3-4-1-2,Andre Marriner,8.0,1.0,17.4,0.0,0.0,0.0,2021,Sheffield United
1385,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,3-5-2,Simon Hooper,7.0,0.0,11.4,1.0,0.0,0.0,2021,Sheffield United
1386,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,3-4-3,Jonathan Moss,10.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United
1387,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,3-4-3,Robert Jones,11.0,1.0,16.0,1.0,0.0,0.0,2021,Sheffield United


In [None]:
# Handle missing values
data['attendance'] = data['attendance'].fillna(data['attendance'].median())  # Fill attendance with median
data = data.dropna()  # Drop rows with missing values in critical columns

In [None]:
data

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,formation,referee,sh,sot,dist,fk,pk,pkatt,season,team
0,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,4-3-3,Anthony Taylor,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
1,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,4-3-3,Graham Scott,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
2,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,4-3-3,Martin Atkinson,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
3,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,4-3-3,Paul Tierney,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
4,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,4-3-3,Jonathan Moss,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,3-4-1-2,Andre Marriner,8.0,1.0,17.4,0.0,0.0,0.0,2021,Sheffield United
1385,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,3-5-2,Simon Hooper,7.0,0.0,11.4,1.0,0.0,0.0,2021,Sheffield United
1386,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,3-4-3,Jonathan Moss,10.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United
1387,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,3-4-3,Robert Jones,11.0,1.0,16.0,1.0,0.0,0.0,2021,Sheffield United


In [None]:
# Convert 'date' to datetime
data.loc[:, 'date'] = pd.to_datetime(data['date'])

In [None]:
# Step 2: Feature Engineering
# Map 'result' to numeric (Win=1, Draw=0, Loss=-1)
result_mapping = {'W': 1, 'D': 0, 'L': -1}
data.loc[:, 'result_numeric'] = data['result'].map(result_mapping)

In [None]:
# Create a 'goal_difference' feature
data.loc[:, 'goal_difference'] = data['gf'] - data['ga']

In [None]:
# Encode categorical variables
!pip install --upgrade scikit-learn
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['venue', 'opponent', 'formation', 'team']
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(data[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))
data = pd.concat([data, encoded_df], axis=1)
data = data.drop(columns=categorical_features)



In [None]:
# Step 3: Define Features and Target
features = data.drop(columns=['result', 'result_numeric', 'date', 'time', 'comp', 'round', 'day', 'captain', 'referee'])
target = data['result_numeric']

In [None]:
features = features[target.notna()]

In [None]:
target = target[target.notna()]

In [None]:
# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['gf', 'ga', 'xg', 'xga', 'poss', 'attendance', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt', 'goal_difference']
features[numerical_features] = scaler.fit_transform(features[numerical_features])


In [None]:
# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [None]:
# Step 5: Model Training
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

In [None]:
# Step 6: Model Evaluation
y_pred = model.predict(X_test)

In [None]:
# Print metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

        -1.0       1.00      1.00      1.00       120
         0.0       1.00      1.00      1.00        55
         1.0       1.00      1.00      1.00       103

    accuracy                           1.00       278
   macro avg       1.00      1.00      1.00       278
weighted avg       1.00      1.00      1.00       278


Confusion Matrix:
 [[120   0   0]
 [  0  55   0]
 [  0   0 103]]
