In [33]:
import pandas as pd
import warnings
import os
import concurrent.futures

warnings.filterwarnings('ignore')
df = pd.read_csv('data_live_engineer_filtered.csv', encoding='utf-8')


In [34]:
df[(df['homeTeam']== 'France') & (df['awayTeam']== 'Kazakhstan')].sort_values(by='goal_home').head(45)

Unnamed: 0,minute,homeTeam,awayTeam,goal_home,goal_away,shots_home,shots_away,blockedShots_home,blockedShots_away,league,...,passRisk_home,passRisk_away,TotalCards_home,TotalCards_away,05ht_home,15ht_home,25ht_home,05ht_away,15ht_away,25ht_away


In [35]:
df['goal_home'].unique()

array([0.])

In [36]:
import unicodedata

# Define a function to normalize text using unicodedata
def normalize_text_unicode(text):
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('utf-8')
    return text.lower()

# Apply the function to the relevant columns
df['league'] = df['league'].apply(normalize_text_unicode)
df['homeTeam'] = df['homeTeam'].apply(normalize_text_unicode)
df['awayTeam'] = df['awayTeam'].apply(normalize_text_unicode)
df.head()

Unnamed: 0,minute,homeTeam,awayTeam,goal_home,goal_away,shots_home,shots_away,blockedShots_home,blockedShots_away,league,...,passRisk_home,passRisk_away,TotalCards_home,TotalCards_away,05ht_home,15ht_home,25ht_home,05ht_away,15ht_away,25ht_away
0,15.53,club guarani,cd san jose,0.0,0.0,7.0,0.0,1.0,0.0,south america - copa libertadores,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16.32,club guarani,cd san jose,0.0,0.0,7.0,0.0,1.0,0.0,south america - copa libertadores,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,22.6,club universitario de deportes,carabobo fc,0.0,0.0,6.0,1.0,1.0,0.0,south america - copa libertadores,...,0.044435,0.018179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,23.24,club universitario de deportes,carabobo fc,0.0,0.0,6.0,1.0,1.0,0.0,south america - copa libertadores,...,0.044435,0.018179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11.6,barcelona sc,ca progreso,0.0,0.0,1.0,1.0,0.0,0.0,south america - copa libertadores,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
df['05ht_home'].unique()

array([0.])

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Encode categorical variables
label_encoders = {}
for column in ['homeTeam', 'awayTeam', 'league']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Split the df into features and target variable
X = df.drop(['result', 'match_id'], axis=1)
y = df['result']

# Split the df into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [39]:
X.columns

Index(['minute', 'homeTeam', 'awayTeam', 'goal_home', 'goal_away',
       'shots_home', 'shots_away', 'blockedShots_home', 'blockedShots_away',
       'league', 'corners_home', 'corners_away', 'redcards_home',
       'redcards_away', 'shotsOffgoal_home', 'shotsOffgoal_away',
       'shotsOngoal_home', 'shotsOngoal_away', 'yellowcards_home',
       'yellowcards_away', 'fouls_home', 'fouls_away', 'offsides_home',
       'offsides_away', 'tackles_home', 'tackles_away', 'possessiontime_away',
       'possessiontime_home', 'f_attack_home', 'f_defensive_away',
       'f_defensive_home', 'f_attack_away', 'win_rate_home', 'loss_rate_home',
       'draw_rate_home', 'win_rate_away', 'loss_rate_away', 'draw_rate_away',
       'shotAccuracy_home', 'shotAccuracy_away', 'attackPressureOverTime_home',
       'attackPressureOverTime_away', 'aggrressionOverTime_home',
       'aggresssionOverTime_away', 'defensiveEfficacy_home',
       'defensiveEfficacy_away', 'taklesOverTime_home', 'taklesOverTime_awa

In [40]:
print(report)

              precision    recall  f1-score   support

           0       0.88      0.80      0.84      2696
           1       0.81      0.89      0.85      2681

    accuracy                           0.85      5377
   macro avg       0.85      0.85      0.85      5377
weighted avg       0.85      0.85      0.85      5377



In [41]:
import joblib

# Save the label encoders, scaler, and model
joblib.dump(label_encoders, '../models/label_encoders.pkl')
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(model, '../models/random_forest_model.pkl')

['../models/random_forest_model.pkl']