In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
base_path = './data'

path_player_away_train = f'{base_path}/Train_Data/train_away_player_statistics_df.csv'
path_player_home_train = f'{base_path}/Train_Data/train_home_player_statistics_df.csv'
path_team_away_train = f'{base_path}/Train_Data/train_away_team_statistics_df.csv'
path_team_home_train = f'{base_path}/Train_Data/train_home_team_statistics_df.csv'
path_target = f'{base_path}/Y_train_1rknArQ.csv'
path_target_sup = f'{base_path}/benchmark_and_extras/Y_train_supp.csv'

In [3]:
train_target = pd.read_csv(path_target, sep=',')  
train_team = pd.read_csv('train_team.csv', sep = ',' ) 

X = train_team.drop(columns=['ID', 'HOME_LEAGUE', 'HOME_TEAM_NAME', 'AWAY_LEAGUE', 'AWAY_TEAM_NAME'])
y = train_target[['HOME_WINS',	'DRAW',	'AWAY_WINS']].values

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def cross_entropy_loss(y_true, y_pred):
    n_samples = y_true.shape[0]
    loss = -np.sum(y_true * np.log(y_pred + 1e-15)) / n_samples
    return loss

def gradient_descent(X, y, lr=0.01, n_iter=1000):
    n_samples, n_features = X.shape
    n_classes = y.shape[1]
    
    weights = np.zeros((n_features, n_classes))
    bias = np.zeros((1, n_classes))
    
    for i in range(n_iter):
        linear_model = np.dot(X, weights) + bias
        y_pred = softmax(linear_model)
        
        loss = cross_entropy_loss(y, y_pred)
        
        dw = np.dot(X.T, (y_pred - y)) / n_samples
        db = np.sum(y_pred - y, axis=0, keepdims=True) / n_samples
        
        weights -= lr * dw
        bias -= lr * db
        
        if i % 100 == 0:
            print(f"Iteration {i}: Loss = {loss}")
    
    return weights, bias

def predict(X, weights, bias):
    linear_model = np.dot(X, weights) + bias
    probabilities = softmax(linear_model)
    return np.argmax(probabilities, axis=1) 


In [7]:
weights, bias = gradient_descent(X_train, y_train, lr=0.1, n_iter=1000)

Iteration 0: Loss = nan
Iteration 100: Loss = nan
Iteration 200: Loss = nan
Iteration 300: Loss = nan
Iteration 400: Loss = nan
Iteration 500: Loss = nan
Iteration 600: Loss = nan
Iteration 700: Loss = nan
Iteration 800: Loss = nan
Iteration 900: Loss = nan


In [8]:
y_pred = predict(X_test, weights, bias)
y_test = np.argmax(y_test, axis=1)

In [9]:
accuracy_score(y_test, y_pred)

0.440065014221861

In [10]:
path_team_away_test = f'{base_path}/Test_Data/test_away_team_statistics_df.csv'
path_team_home_test = f'{base_path}/Test_Data/test_home_team_statistics_df.csv'

test_team_home = pd.read_csv(path_team_home_test, sep=',')
test_team_away = pd.read_csv(path_team_away_test, sep=',')

test_team_home.columns = 'HOME_' + test_team_home.columns
test_team_away.columns = 'AWAY_' + test_team_away.columns

test_team =  pd.concat([test_team_home, test_team_away.iloc[:, 1:]],join='inner',axis=1)
test_team = test_team.rename(columns={'HOME_ID':'ID'})

In [11]:
predictions = predict(test_team.drop(columns=['ID']), weights, bias)

In [12]:
p = []
for pred in predictions:
    _p = np.zeros(3, dtype=np.int32)
    _p[pred - 1] = 1 
    p.append(_p)

In [13]:
pred_sub = pd.DataFrame(data=p, columns=['HOME_WINS', 'DRAW', 'AWAY_WINS'])

In [14]:
pred_sub = pd.concat([test_team['ID'], pred_sub],join='inner',axis=1)

In [15]:
pred_sub.to_csv('./submission_Reglog.csv', index=False)