In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics, preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, f1_score, accuracy_score


In [11]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=rand_state)
    return X_train, X_test, y_train, y_test

In [24]:
categorys_to_predict = ['TEAM_A_POINTS','TEAM_B_POINTS']
irrelevant_categories = ['DATE','GAME_ID','TEAM_A','TEAM_B','TEAM_A_ID','TEAM_B_ID']

file_name = './games-united.csv'
df = pd.read_csv(file_name).drop(columns=irrelevant_categories)

X = df.drop(columns=categorys_to_predict)
y = df[categorys_to_predict]

X_train, X_test, y_train, y_test = split_to_train_and_test(X, y, 0.3, 41)


In [30]:
X_train,y_train,X_test,y_test

(      TEAM_A_SCORE  TEAM_B_SCORE
 61       77.225285     68.154076
 2333     73.420374     79.331900
 567      64.015418     66.012107
 293      75.414780     71.085154
 2178     73.806966     75.414780
 ...            ...           ...
 243      79.271891     80.303052
 321      74.117156     73.043957
 1104     69.965595     77.637377
 931      75.414780     80.303052
 1984     73.600570     66.692896
 
 [1666 rows x 2 columns],
       TEAM_A_POINTS  TEAM_B_POINTS
 61              102             99
 2333            120            103
 567             123            111
 293              94            103
 2178            102            106
 ...             ...            ...
 243             102            103
 321             101            108
 1104            127            114
 931              99             94
 1984             96            106
 
 [1666 rows x 2 columns],
       TEAM_A_SCORE  TEAM_B_SCORE
 1897     80.303052     68.154076
 982      82.805261     67.977461
 1

In [15]:
# Predicting Using the trained linear regression model:

trained_lin_reg_model = LinearRegression().fit(X_train, y_train)


# Arranging predicted data sides to actual data:
predicted_results = trained_lin_reg_model.predict(X_test)

predicted_A = []
predicted_B = []
for result in predicted_results:
    predicted_A.append(int(result[0]))
    predicted_B.append(int(result[1]))

df_result_eval = pd.DataFrame(columns=['ACTUAL_A','PREDICTED_A','ACTUAL_B','PREDICTED_B'])
df_result_eval['ACTUAL_A'] = y_test['TEAM_A_POINTS']
df_result_eval['ACTUAL_B'] = y_test['TEAM_B_POINTS']
df_result_eval['PREDICTED_A'] = predicted_A
df_result_eval['PREDICTED_B'] = predicted_B

# Displaying results:
df_result_eval


Unnamed: 0,ACTUAL_A,PREDICTED_A,ACTUAL_B,PREDICTED_B
1897,111,111,94,111
982,119,111,112,111
1605,115,110,129,112
1095,101,110,109,113
2166,75,111,92,113
...,...,...,...,...
626,117,111,97,112
951,126,111,106,112
1121,121,111,106,112
2360,98,111,123,113


In [16]:
# Calculating the mean squared error of the prediction:

#metrics.mean_squared_error(y_test, predicted_results,squared=False)

print(f"mean squared error for Team A points: {metrics.mean_squared_error(df_result_eval['ACTUAL_A'], df_result_eval['PREDICTED_A'],squared=False)}")
print(f"mean squared error for Team B points: {metrics.mean_squared_error(df_result_eval['ACTUAL_B'], df_result_eval['PREDICTED_B'],squared=False)}")

mean squared error for Team A points: 12.505460991976616
mean squared error for Team B points: 12.419714156126458


In [17]:
# Calculating actual and predicted winners to 'df_result_eval':

original_winners = []
predicted_winners = []
original_diffs = []
predicted_diffs = []

for row in df_result_eval.iterrows():
    if row[1]['ACTUAL_A'] > row[1]['ACTUAL_B']:
        original_winner= 'A'
    elif row[1]['ACTUAL_A'] < row[1]['ACTUAL_B']:
        original_winner= 'B'
    else:
        original_winner = 'DRAW'
 
    if row[1]['PREDICTED_A'] > row[1]['PREDICTED_B']:
        predicted_winner= 'A'
    elif row[1]['PREDICTED_A'] < row[1]['PREDICTED_B']:
        predicted_winner= 'B'
    else:
        predicted_winner = 'DRAW'
    original_winners.append(original_winner)
    predicted_winners.append(predicted_winner)
    original_diffs.append(abs((row[1]['ACTUAL_A'])-(row[1]['ACTUAL_B'])))
    predicted_diffs.append(abs((row[1]['PREDICTED_A'])-(row[1]['PREDICTED_B'])))

df_result_eval['ORIGINAL_WINNER'] = original_winners
df_result_eval['PREDICTED_WINNER'] = predicted_winners
df_result_eval['ORIGINAL_DIFF'] = original_diffs
df_result_eval['PREDICTED_DIFF'] = predicted_diffs

df_result_eval


Unnamed: 0,ACTUAL_A,PREDICTED_A,ACTUAL_B,PREDICTED_B,ORIGINAL_WINNER,PREDICTED_WINNER,ORIGINAL_DIFF,PREDICTED_DIFF
1897,111,111,94,111,A,DRAW,17,0
982,119,111,112,111,A,DRAW,7,0
1605,115,110,129,112,B,B,14,2
1095,101,110,109,113,B,B,8,3
2166,75,111,92,113,B,B,17,2
...,...,...,...,...,...,...,...,...
626,117,111,97,112,A,B,20,1
951,126,111,106,112,A,B,20,1
1121,121,111,106,112,A,B,15,1
2360,98,111,123,113,B,B,25,2


In [18]:
def calculate_winner_prediction_evaluation(df_result_eval):
    count = 0
    for row in df_result_eval.iterrows():
        if row[1]['ORIGINAL_WINNER'] == row[1]['PREDICTED_WINNER']:
            count = count+1
    return count/df_result_eval.shape[0]

In [19]:
winner_prediction_evaluation = calculate_winner_prediction_evaluation(df_result_eval)
print("winner_prediction_evaluation: {0:.0%}".format(winner_prediction_evaluation))

winner_prediction_evaluation: 49%


In [20]:
diff_prediction_evaluation = df_result_eval['PREDICTED_DIFF'].mean()/df_result_eval['ORIGINAL_DIFF'].mean()
print("diff_prediction_evaluation: {0:.0%}".format(diff_prediction_evaluation))

diff_prediction_evaluation: 14%
