In [1]:
import pandas as pd

import numpy as np

with open('result_new.txt', 'r') as f:
    data = f.readlines()

# create matches array
matches = []

# store match details
team1 = None
team2 = None
score1 = None
score2 = None
date = None

# for each line in the file we look at the data stored there
for line in data:
    line = line.strip()
    if line == "FT": # after a match or at the beginning

        if team1 and team2 and score1 is not None and score2 is not None:
            # Add the match details to the list
            matches.append((team1, score1, score2, team2, date))

        # Reset the match details
        team1 = None
        team2 = None
        score1 = None
        score2 = None

    elif "-" in line and any(char.isdigit() for char in line):
        try:
            score = line.split(" - ")
            score1 = int(score[0].strip())
            score2 = int(score[1].strip())
        except ValueError:
            # Skip lines with invalid scores
            score1 = None
            score2 = None

    elif line and line[0].isalpha() and not any(char.isdigit() for char in line):
        if team1 is None:
            team1 = line
        else:
            team2 = line
    elif line and any(char.isdigit() for char in line):
        date = line

matches_df = pd.DataFrame(matches, columns=["Team1", "Score1", "Score2", "Team2", "Date"])

matches_df["Date"] = pd.to_datetime(matches_df["Date"], format="%A %d/%m/%Y", errors='coerce')

#matches_df.head()


In [2]:
import os
ranking_files = os.listdir('rankings')

rankings= []

for ranking_file in ranking_files:
    # Extract the date from the filename
    date_str = ranking_file.split(".")[0]
    date = pd.to_datetime(date_str, format="%d%m%Y")

    # Read the ranking file
    ranking_df = pd.read_csv(os.path.join('/content/rankings', ranking_file))
    ranking_df["Date"] = date

    # Append the ranking data to the list
    rankings.append(ranking_df)

# Concatenate all ranking data into a single DataFrame
rankings_df = pd.concat(rankings, ignore_index=True)

rankings_df.head()


Unnamed: 0,Team,Ranking1,Ranking2,Ranking3,Ranking4,Date
0,Germany,863.86,376.34,143.6,218.11,2018-02-15
1,Brazil,720.93,428.18,175.41,159.21,2018-02-15
2,Portugal,744.16,308.92,190.76,114.05,2018-02-15
3,Argentina,510.42,451.62,198.8,187.52,2018-02-15
4,Belgium,626.0,300.0,229.48,169.81,2018-02-15


In [3]:
# get most appropiate date from a match day  - more relevant results regarding ranking

def closest_date(ranking_dates, match_date):
    # Filter the ranking dates before the match date
    previous_dates = ranking_dates[ranking_dates <= match_date]
    if len(previous_dates) > 0:
        return previous_dates.max()
    return None

ranking_dates = pd.to_datetime(rankings_df["Date"].unique())



In [4]:
#  merge the match data with the closest previous ranking data for each team

features = []

for index,row in matches_df.iterrows():
    team1 = row["Team1"]
    team2 = row["Team2"]
    date = row["Date"]

    # closest ranking date before the match date
    closest_ranking_date = closest_date(ranking_dates, date)

    if closest_ranking_date is not None:
        # get the closest ranking for each team
        team1_ranking = rankings_df[(rankings_df["Team"] == team1) & (rankings_df["Date"] == closest_ranking_date)]
        team2_ranking = rankings_df[(rankings_df["Team"] == team2) & (rankings_df["Date"] == closest_ranking_date)]

        if not team1_ranking.empty and not team2_ranking.empty:

            # team ranking is the sum of ranking1+ranking2+ranking3+ranking4 (verified with fifa rankings)
            team1_ranking = team1_ranking.iloc[0]['Ranking1'] + team1_ranking.iloc[0]['Ranking2'] + team1_ranking.iloc[0]['Ranking3'] + team1_ranking.iloc[0]['Ranking4']
            team2_ranking = team2_ranking.iloc[0]['Ranking1'] + team2_ranking.iloc[0]['Ranking2'] + team2_ranking.iloc[0]['Ranking3'] + team2_ranking.iloc[0]['Ranking4']

            # team1_avg_ranking = team1_ranking.iloc[0]['Ranking4']
            # team2_avg_ranking = team2_ranking.iloc[0]['Ranking4']

            # Calculate ranking difference
            ranking_diff = team1_ranking - team2_ranking

            # Get the scores
            score1 = row["Score1"]
            score2 = row["Score2"]

            # Append features
            features.append((team1, team2, score1, score2, team1_ranking, team2_ranking, ranking_diff, date))

# Create DataFrame from features
features_df = pd.DataFrame(features, columns=["Team1", "Team2", "First team score", "Second team score", "Team1_Points", "Team2_Points", "Ranking difference", "Match Date"])

# Display the features DataFrame
features_df.head()

Unnamed: 0,Team1,Team2,First team score,Second team score,Team1_Points,Team2_Points,Ranking difference,Match Date
0,Brazil,Japan,3,0,872.04,764.61,107.43,2013-06-16
1,Mexico,Italy,1,2,927.88,1096.97,-169.09,2013-06-17
2,Spain,Uruguay,2,1,1614.36,913.11,701.25,2013-06-17
3,Tahiti,Nigeria,1,6,214.97,785.25,-570.28,2013-06-19
4,Brazil,Mexico,2,0,872.04,927.88,-55.84,2013-06-20


In [5]:
# import numpy as np
# from sklearn.model_selection import train_test_split
# from keras.models import Sequential
# from keras.layers import LSTM, Dense, Dropout, BatchNormalization
# from keras.callbacks import ReduceLROnPlateau, EarlyStopping
# from keras.optimizers import Adam
# from sklearn.metrics import mean_squared_error

# features_df = features_df.sort_values(by='Match Date')

# # Add home/away performance - team 1 is the host (the left side team)
# features_df['Home_Team'] = features_df['Team1']
# features_df['Away_Team'] = features_df['Team2']

# # Calculate recent performance metrics - average of the last 5 scores for each team, excluding the current game
# features_df['Team1_Recent_Form'] = features_df.groupby('Team1')['First team score'].transform(lambda x: x.shift(1).rolling(7).mean()).fillna(0)
# features_df['Team2_Recent_Form'] = features_df.groupby('Team2')['Second team score'].transform(lambda x: x.shift(1).rolling(7).mean()).fillna(0)
# features_df['Diff_Recent_Form'] = features_df['Team1_Recent_Form'] - features_df['Team2_Recent_Form']

# # Calculate recent performance metrics - sum of the last 5 results for each team, excluding current game (+1 W, 0 D, -1 L)
# features_df['Team1_Result'] = np.where(features_df['First team score'] > features_df['Second team score'], 1,
#                                       np.where(features_df['First team score'] == features_df['Second team score'], 0, -1))
# features_df['Team2_Result'] = np.where(features_df['Second team score'] > features_df['First team score'], 1,
#                                       np.where(features_df['Second team score'] == features_df['First team score'], 0, -1))

# features_df['Team1_Recent_Result'] = features_df.groupby('Team1')['Team1_Result'].transform(lambda x: x.shift(1).rolling(7).sum()).fillna(0)
# features_df['Team2_Recent_Result'] = features_df.groupby('Team2')['Team2_Result'].transform(lambda x: x.shift(1).rolling(7).sum()).fillna(0)

# # Calculate home/away performance
# home_performance = features_df.groupby('Home_Team')['First team score'].mean()
# away_performance = features_df.groupby('Away_Team')['Second team score'].mean()

# features_df = features_df.merge(home_performance, left_on='Team1', right_index=True, suffixes=('', '_Home_Avg'))
# features_df = features_df.merge(away_performance, left_on='Team2', right_index=True, suffixes=('', '_Away_Avg'))

# # Define features and targets
# features = ['Team1_Recent_Form', 'Team2_Recent_Form', 'Diff_Recent_Form', 'Team1_Points', 'Team2_Points', 'Ranking difference', 'Team1_Recent_Result', 'Team2_Recent_Result']
# target1 = 'First team score'
# target2 = 'Second team score'

# features_df['MatchID'] = features_df.index

# # Separate features and targets
# X = features_df[features + ['MatchID']]
# y1 = features_df[target1]
# y2 = features_df[target2]

# # Split the data into train and test sets based on MatchID
# train_ids, test_ids = train_test_split(features_df['MatchID'].unique(), test_size=0.25, random_state=42)

# # Create masks for training and testing sets
# train_mask = features_df['MatchID'].isin(train_ids)
# test_mask = features_df['MatchID'].isin(test_ids)

# # Function to create sequences
# def create_sequences(data, target1, target2, sequence_length):
#     sequences = []
#     targets1 = []
#     targets2 = []
#     for i in range(len(data) - sequence_length):
#         sequences.append(data.iloc[i:i+sequence_length].values)
#         targets1.append(target1.iloc[i+sequence_length])
#         targets2.append(target2.iloc[i+sequence_length])
#     return np.array(sequences), np.array(targets1), np.array(targets2)

# sequence_length = 3
# # Generate sequences for training and testing sets
# X_train_sequences, y1_train_sequences, y2_train_sequences = create_sequences(features_df[train_mask][features], y1[train_mask], y2[train_mask], sequence_length)
# X_test_sequences, y1_test_sequences, y2_test_sequences = create_sequences(features_df[test_mask][features], y1[test_mask], y2[test_mask], sequence_length)

# # Reshape the data to be 3D [samples, timesteps, features]
# n_features = X_train_sequences.shape[2]
# X_train_sequences = X_train_sequences.reshape((X_train_sequences.shape[0], sequence_length, n_features))
# X_test_sequences = X_test_sequences.reshape((X_test_sequences.shape[0], sequence_length, n_features))

# # Define the model
# model = Sequential()
# model.add(LSTM(50, activation='relu', input_shape=(sequence_length, n_features), return_sequences=True))
# model.add(Dropout(0.25))
# model.add(BatchNormalization())
# model.add(LSTM(50, activation='relu'))
# model.add(Dropout(0.25))
# model.add(BatchNormalization())

# model.add(Dense(2))  # Output layer with 2 units, one for each team's score
# model.compile(optimizer=Adam(learning_rate=0.0001), loss='mse')

# # Callbacks for learning rate adjustment and early stopping
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0005)
# early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# # Train the model
# history = model.fit(X_train_sequences, np.column_stack((y1_train_sequences, y2_train_sequences)), epochs=50, validation_data=(X_test_sequences, np.column_stack((y1_test_sequences, y2_test_sequences))), callbacks=[reduce_lr, early_stopping])

# # Evaluate the model
# mse = model.evaluate(X_test_sequences, np.column_stack((y1_test_sequences, y2_test_sequences)))
# print("Mean Squared Error on Test Data:", mse)


In [6]:

features_df = features_df.sort_values(by='Match Date')


# Add home/away performance - team 1 is the host (the left side team)
features_df['Home_Team'] = features_df['Team1']
features_df['Away_Team'] = features_df['Team2']

# Calculate recent performance metrics - average of the last 5 scores for each team, excluding the current game
features_df['Team1_Recent_Form'] = features_df.groupby('Team1')['First team score'].transform(lambda x: x.shift(1).rolling(7).mean()).fillna(0)
features_df['Team2_Recent_Form'] = features_df.groupby('Team2')['Second team score'].transform(lambda x: x.shift(1).rolling(7).mean()).fillna(0)
features_df['Diff_Recent_Form'] = features_df['Team1_Recent_Form'] - features_df['Team2_Recent_Form']

# Calculate recent performance metrics - sum of the last 5 results for each team, excluding current game (+1 W, 0 D, -1 L)
features_df['Team1_Result'] = np.where(features_df['First team score'] > features_df['Second team score'], 1,
                                      np.where(features_df['First team score'] == features_df['Second team score'], 0, -1))
features_df['Team2_Result'] = np.where(features_df['Second team score'] > features_df['First team score'], 1,
                                      np.where(features_df['Second team score'] == features_df['First team score'], 0, -1))

features_df['Team1_Recent_Result'] = features_df.groupby('Team1')['Team1_Result'].transform(lambda x: x.shift(1).rolling(7).sum()).fillna(0)
features_df['Team2_Recent_Result'] = features_df.groupby('Team2')['Team2_Result'].transform(lambda x: x.shift(1).rolling(7).sum()).fillna(0)


# Calculate home/away performance
home_performance = features_df.groupby('Home_Team')['First team score'].mean()
away_performance = features_df.groupby('Away_Team')['Second team score'].mean()

features_df = features_df.merge(home_performance, left_on='Team1', right_index=True, suffixes=('', '_Home_Avg'))
features_df = features_df.merge(away_performance, left_on='Team2', right_index=True, suffixes=('', '_Away_Avg'))

features_df.head()

Unnamed: 0,Team1,Team2,First team score,Second team score,Team1_Points,Team2_Points,Ranking difference,Match Date,Home_Team,Away_Team,Team1_Recent_Form,Team2_Recent_Form,Diff_Recent_Form,Team1_Result,Team2_Result,Team1_Recent_Result,Team2_Recent_Result,First team score_Home_Avg,Second team score_Away_Avg
47,Costa Rica,Mexico,2,1,615.58,945.4,-329.82,2013-06-05,Costa Rica,Mexico,0.0,0.0,0.0,1,-1,0.0,0.0,1.6,1.5
1083,Costa Rica,Mexico,1,1,917.62,1045.74,-128.12,2017-09-06,Costa Rica,Mexico,1.571429,1.285714,0.285714,0,0,3.0,1.0,1.6,1.5
29,Panama,Mexico,0,0,648.91,927.88,-278.97,2013-06-08,Panama,Mexico,0.0,0.0,0.0,0,0,0.0,0.0,1.588235,1.5
279,Panama,Mexico,2,1,601.39,879.55,-278.16,2013-07-29,Panama,Mexico,0.0,0.0,0.0,1,-1,0.0,0.0,1.588235,1.5
680,Panama,Mexico,0,0,551.23,1000.96,-449.73,2016-11-16,Panama,Mexico,1.428571,2.142857,-0.714286,0,0,1.0,4.0,1.588235,1.5


In [7]:
# support vector regression

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

teams_information = features_df[['Team1_Recent_Form', 'Team2_Recent_Form', 'Team1_Recent_Result', 'Team2_Recent_Result', 'Diff_Recent_Form', 'First team score_Home_Avg', 'Second team score_Away_Avg', 'Team1_Points', 'Team2_Points']]
first_team_score = features_df["First team score"]
second_team_score = features_df["Second team score"]


# Convert Series to NumPy array and reshape
teams_information = teams_information.values
first_team_score = first_team_score.values.reshape(-1, 1).ravel()
second_team_score = second_team_score.values.reshape(-1, 1).ravel()

regressor1 = SVR(kernel  = 'rbf', C=10)

x_train1, x_test1, y_train1, y_test1 = train_test_split(teams_information, first_team_score, test_size=0.2, random_state=42)

x_train2, x_test2, y_train2, y_test2 = train_test_split(teams_information, second_team_score, test_size=0.2, random_state=42)

regressor1.fit(x_train1, y_train1)

y_pred1 = regressor1.predict(x_test1)

regressor2 = SVR(kernel  = 'rbf', C=10)
regressor2.fit(x_train2, y_train2)
y_pred2 = regressor2.predict(x_test2)

# find accuracy
ms1 = mean_squared_error(y_test1, y_pred1)
ms2 = mean_squared_error(y_test2, y_pred2)

print("Mean squared error for First Team:", ms1)
print("Mean squared error for Second Team:", ms2)

#accuracy for the two models

accuracy1 = regressor1.score(x_test1, y_test1)
accuracy2 = regressor2.score(x_test2, y_test2)

print("Accuracy for First Team:", accuracy1)
print("Accuracy for Second Team:", accuracy2)


Mean squared error for First Team: 2.3377330358446584
Mean squared error for Second Team: 1.7123129740283003
Accuracy for First Team: 0.22549895731465086
Accuracy for Second Team: 0.23518640016699133


In [8]:
features_df.head()

Unnamed: 0,Team1,Team2,First team score,Second team score,Team1_Points,Team2_Points,Ranking difference,Match Date,Home_Team,Away_Team,Team1_Recent_Form,Team2_Recent_Form,Diff_Recent_Form,Team1_Result,Team2_Result,Team1_Recent_Result,Team2_Recent_Result,First team score_Home_Avg,Second team score_Away_Avg
47,Costa Rica,Mexico,2,1,615.58,945.4,-329.82,2013-06-05,Costa Rica,Mexico,0.0,0.0,0.0,1,-1,0.0,0.0,1.6,1.5
1083,Costa Rica,Mexico,1,1,917.62,1045.74,-128.12,2017-09-06,Costa Rica,Mexico,1.571429,1.285714,0.285714,0,0,3.0,1.0,1.6,1.5
29,Panama,Mexico,0,0,648.91,927.88,-278.97,2013-06-08,Panama,Mexico,0.0,0.0,0.0,0,0,0.0,0.0,1.588235,1.5
279,Panama,Mexico,2,1,601.39,879.55,-278.16,2013-07-29,Panama,Mexico,0.0,0.0,0.0,1,-1,0.0,0.0,1.588235,1.5
680,Panama,Mexico,0,0,551.23,1000.96,-449.73,2016-11-16,Panama,Mexico,1.428571,2.142857,-0.714286,0,0,1.0,4.0,1.588235,1.5


In [9]:
# use rolling averages on random forest regressor

from sklearn.ensemble import RandomForestRegressor

# define parameters for grid searching - chose the ones with best results
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }



# Split the data into features (X) and target (y)
features = ['Team1_Recent_Form', 'Team2_Recent_Form', 'Team1_Recent_Result', 'Team2_Recent_Result', 'Diff_Recent_Form', 'First team score_Home_Avg', 'Second team score_Away_Avg', 'Team1_Points', 'Team2_Points', 'Ranking difference']
target1 = 'First team score'
target2 = 'Second team score'

# Separate features and targets
X = features_df[features]
y1 = features_df[target1]
y2 = features_df[target2]

# from sklearn.preprocessing import MinMaxScaler
# sc_x = MinMaxScaler()
# sc_y = MinMaxScaler()

# X = sc_x.fit_transform(X)
# y1 = sc_y.fit_transform(y1.values.reshape(-1, 1)).ravel()
# y2 = sc_y.fit_transform(y2.values.reshape(-1, 1)).ravel()


# Split the data into train and test sets for the first team's score
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1, test_size=0.2, random_state=42)

# Split the data into train and test sets for the second team's score
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42)

model1 = RandomForestRegressor(n_estimators=400, random_state=30)
model1.fit(X_train1, y_train1)
y_pred1 = model1.predict(X_test1)
mse1 = mean_squared_error(y_test1, y_pred1)
print("Mean Squared Error for First Team:", mse1)

# accuracy for first team
accuracy1 = model1.score(X_test1, y_test1)
print("Accuracy for First Team:", accuracy1)

# Train Random Forest for the second team's score
model2 = RandomForestRegressor(n_estimators=120, random_state=40)
model2.fit(X_train2, y_train2)
y_pred2 = model2.predict(X_test2)
mse2 = mean_squared_error(y_test2, y_pred2)
print("Mean Squared Error for Second Team:", mse2)

# accuracy for second team
accuracy2 = model2.score(X_test2, y_test2)
print("Accuracy for Second Team:", accuracy2)




Mean Squared Error for First Team: 2.343657706766917
Accuracy for First Team: 0.2235360882716514
Mean Squared Error for Second Team: 1.4967551691729324
Accuracy for Second Team: 0.3314664279446762


In [14]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline

# Assuming features_df is your DataFrame
features = ['Team1_Recent_Form',
            'Team2_Recent_Form', 'Team1_Recent_Result',
            'Team2_Recent_Result', 'Diff_Recent_Form',
            'First team score_Home_Avg', 'Second team score_Away_Avg',
            'Team1_Points', 'Team2_Points', 'Ranking difference']
target1 = 'First team score'
target2 = 'Second team score'

# Add a unique identifier for each match (assuming Date column is available)
features_df['MatchID'] = features_df.index

# Separate features and targets
X = features_df[features + ['MatchID']]
y1 = features_df[target1]
y2 = features_df[target2]

# Split the data into train and test sets based on MatchID
train_ids, test_ids = train_test_split(X['MatchID'].unique(), test_size=0.1, random_state=42)

# Create the train and test sets
train_mask = X['MatchID'].isin(train_ids)
test_mask = X['MatchID'].isin(test_ids)

X_train1, X_test1 = X[train_mask].drop(columns=['MatchID']), X[test_mask].drop(columns=['MatchID'])
y_train1, y_test1 = y1[train_mask], y1[test_mask]

X_train2, X_test2 = X[train_mask].drop(columns=['MatchID']), X[test_mask].drop(columns=['MatchID'])
y_train2, y_test2 = y2[train_mask], y2[test_mask]

# Check that no MatchID is in both training and testing sets
assert len(set(train_ids) & set(test_ids)) == 0, "Data leakage detected! Some MatchIDs are in both train and test sets."


# Create a pipeline with PolynomialFeatures and Ridge
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('scaler', StandardScaler()),
    ('regressor', Ridge())
])

# Define parameter grid for GridSearchCV specific to Ridge regression
param_grid = {
    'poly__degree': [1, 2, 3],
    'regressor__alpha': [0.01, 0.1, 1.0, 10, 100],
    'regressor__fit_intercept': [True, False],
    'regressor__copy_X': [True, False],
}

# Use RepeatedKFold for more robust cross-validation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

grid_search1 = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=1, scoring='r2')
grid_search2 = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=1, scoring='r2')

# Fit the model on the first team's score data
grid_search1.fit(X_train1, y_train1)
y_pred1 = grid_search1.predict(X_test1)

# Fit the model on the second team's score data
grid_search2.fit(X_train2, y_train2)
y_pred2 = grid_search2.predict(X_test2)

# Convert the predictions to the closest integer, for negative score, put 0
y_pred1 = np.round(y_pred1).astype(int)
y_pred2 = np.round(y_pred2).astype(int)


y_pred1 = np.maximum(y_pred1, 0)
y_pred2 = np.maximum(y_pred2, 0)

# Calculate mean squared error for both models
mse1 = mean_squared_error(y_test1, y_pred1)
mse2 = mean_squared_error(y_test2, y_pred2)

# Calculate accuracy for both models
accuracy1 = r2_score(y_test1, y_pred1)
accuracy2 = r2_score(y_test2, y_pred2)

# Print the results
print("Mean squared error for First Team:", mse1)
print("Mean squared error for Second Team:", mse2)
print("Accuracy for First Team:", accuracy1)
print("Accuracy for Second Team:", accuracy2)

# Print the best parameters
print("Best parameters for First Team model:", grid_search1.best_params_)
print("Best parameters for Second Team model:", grid_search2.best_params_)

Mean squared error for First Team: 2.2706766917293235
Mean squared error for Second Team: 1.4887218045112782
Accuracy for First Team: 0.320372250423012
Accuracy for Second Team: 0.40763901385639745
Best parameters for First Team model: {'poly__degree': 3, 'regressor__alpha': 100, 'regressor__copy_X': True, 'regressor__fit_intercept': True}
Best parameters for Second Team model: {'poly__degree': 2, 'regressor__alpha': 100, 'regressor__copy_X': True, 'regressor__fit_intercept': True}


In [15]:
# Calculate the differences between the actual and predicted scores
diff1 = y_test1 - y_pred1
diff2 = y_test2 - y_pred2

# Calculate the percentages for positive and negative differences
def calculate_percentages(diffs, threshold):
    within_threshold = np.abs(diffs) == threshold
    positive = diffs > 0
    negative = diffs < 0
    total = len(diffs)
    positive_percentage = np.sum(within_threshold & positive) / total * 100
    negative_percentage = np.sum(within_threshold & negative) / total * 100
    return positive_percentage, negative_percentage

thresholds = [0, 1, 2, 3, 4, 5]

print("\nFirst Team Score Prediction Accuracy:")
for t in thresholds:
    pos_percentage, neg_percentage = calculate_percentages(diff1, t)
    if t == 0:
        print(f"Exact predictions: {accuracy1 * 100:.2f}%")
    else:
        print(f"±{t} goals: Positive: {pos_percentage:.2f}%, Negative: {neg_percentage:.2f}%")

print("\nSecond Team Score Prediction Accuracy:")
for t in thresholds:
    pos_percentage, neg_percentage = calculate_percentages(diff2, t)
    if t == 0:
        print(f"Exact predictions: {accuracy2 * 100:.2f}%")
    else:
        print(f"±{t} goals: Positive: {pos_percentage:.2f}%, Negative: {neg_percentage:.2f}%")


First Team Score Prediction Accuracy:
Exact predictions: 32.04%
±1 goals: Positive: 18.05%, Negative: 28.57%
±2 goals: Positive: 8.27%, Negative: 10.53%
±3 goals: Positive: 3.01%, Negative: 2.26%
±4 goals: Positive: 0.75%, Negative: 0.00%
±5 goals: Positive: 0.75%, Negative: 0.00%

Second Team Score Prediction Accuracy:
Exact predictions: 40.76%
±1 goals: Positive: 21.80%, Negative: 25.56%
±2 goals: Positive: 6.02%, Negative: 4.51%
±3 goals: Positive: 3.76%, Negative: 1.50%
±4 goals: Positive: 0.75%, Negative: 0.00%
±5 goals: Positive: 0.00%, Negative: 0.00%


In [16]:
with open('predict.txt', 'r') as f:
    data = f.readlines()

data = data[::-1]

# Parse the predictions into a list of tuples
prediction_pairs = [tuple(line.strip().split(',')) for line in data]

# Prepare the new DataFrame
predict_df = pd.DataFrame(columns=[
    'Team1_Recent_Form',
    'Team2_Recent_Form', 'Team1_Recent_Result',
    'Team2_Recent_Result', 'Diff_Recent_Form',
    'First team score_Home_Avg', 'Second team score_Away_Avg',
    'Team1_Points', 'Team2_Points', 'Ranking difference'
])

# Set the match date to 2018-01-01
match_date = pd.to_datetime('2018-12-31')

for team1, team2 in prediction_pairs:
    # Get the last game data for team1 and team2
    team1_last_game = features_df[features_df['Team1'] == team1].sort_values(by='Match Date', ascending=False).iloc[0]
    team2_last_game = features_df[features_df['Team2'] == team2].sort_values(by='Match Date', ascending=False).iloc[0]

    # Extract the required fields from the last game data
    team1_points = team1_last_game['Team1_Points']
    team2_points = team2_last_game['Team2_Points']
    ranking_diff = team1_points - team2_points

    team1_recent_form = team1_last_game['Team1_Recent_Form']
    team2_recent_form = team2_last_game['Team2_Recent_Form']
    diff_recent_form = team1_recent_form - team2_recent_form

    team1_result = team1_last_game['Team1_Result']
    team2_result = team2_last_game['Team2_Result']

    team1_recent_result = team1_last_game['Team1_Recent_Result']
    team2_recent_result = team2_last_game['Team2_Recent_Result']

    team1_home_avg = team1_last_game['First team score_Home_Avg']
    team2_away_avg = team2_last_game['Second team score_Away_Avg']

    # Append the new row to the predict_df DataFrame
    predict_df.loc[-1] = [
        team1_recent_form,
        team2_recent_form,
        team1_recent_result,
        team2_recent_result,
        diff_recent_form,
        team1_home_avg,
        team2_away_avg,
        team1_points,
        team2_points,
        ranking_diff,
    ]

    predict_df.index = predict_df.index + 1
    predict_df = predict_df.sort_index()

# Display the resulting DataFrame
predict_df.head(20)

Unnamed: 0,Team1_Recent_Form,Team2_Recent_Form,Team1_Recent_Result,Team2_Recent_Result,Diff_Recent_Form,First team score_Home_Avg,Second team score_Away_Avg,Team1_Points,Team2_Points,Ranking difference
0,0.0,2.714286,0.0,1.0,-2.714286,1.666667,2.545455,560.85,600.62,-39.77
1,1.571429,1.857143,5.0,-1.0,-0.285714,1.625,1.473684,818.44,1043.45,-225.01
2,2.285714,1.714286,6.0,4.0,0.571429,2.375,1.333333,635.04,893.02,-257.98
3,3.142857,2.285714,4.0,2.0,0.857143,2.2,2.0,1386.1,1183.7,202.4
4,2.714286,1.142857,6.0,-1.0,1.571429,2.6,1.5625,1207.88,714.13,493.75
5,2.714286,1.142857,0.0,-1.0,1.571429,2.2,1.6,1325.28,931.49,393.79
6,2.142857,1.428571,1.0,1.0,0.714286,1.733333,1.444444,1102.6,848.23,254.37
7,1.285714,1.285714,3.0,3.0,0.0,1.444444,1.545455,1012.81,720.91,291.9
8,1.285714,0.0,2.0,0.0,1.285714,1.6,2.0,935.87,789.28,146.59
9,3.714286,1.428571,6.0,2.0,2.285714,3.4375,1.5,1606.05,1085.37,520.68


In [18]:
# Fit the model on the first team's score data
y_pred_test1 = grid_search1.predict(predict_df)

# Fit the model on the second team's score data
y_pred_test2 = grid_search2.predict(predict_df)

# Convert the predictions to the closest integer
y_pred_test1 = np.round(y_pred_test1).astype(int)
y_pred_test2 = np.round(y_pred_test2).astype(int)

y_pred_test1 = np.maximum(y_pred_test1, 0)
y_pred_test2 = np.maximum(y_pred_test2, 0)

with open('predict.txt', 'r') as f:
    final_teams = f.readlines()

with open('predictions.txt', 'w') as w:
    w.write('Team1,Team2,Score1,Score2')
    for i in range(len(final_teams)):
        team1, team2 = final_teams[i].strip().split(',')
        score1 = y_pred_test1[i]
        score2 = y_pred_test2[i]
        w.write(f'\n{team1},{team2},{score1},{score2}')

print('Written predictions successfuly')

Written predictions successfuly
