In [1]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
from data_processing import process_data

In [3]:
def evaluate_prediction(y_test, y_predicted):
    acc = accuracy_score(y_test, y_predicted)
    f1 = f1_score(y_test, y_predicted, average='macro')
    prec = precision_score(y_test, y_predicted, average='macro', zero_division=0)
    rec = precision_score(y_test, y_predicted, average='macro')
    confMat = confusion_matrix(y_test, y_predicted)
    
    return {"accuracy": acc, "f1": f1, "precision": prec, "recall": rec, "confusion_matrix": confMat}

In [4]:
data = process_data()
data.shape



(19348, 612)

In [5]:
y = data['match_outcome']
X = data.drop(columns=['match_outcome'], axis=1)

In [6]:
X_train, X_test, y_train, y_test= train_test_split(X, y, train_size = 0.8, random_state=40)

In [7]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
svd = TruncatedSVD(n_components = 400)

In [9]:
svd.fit(X_train)

In [10]:
X_train_PCA = svd.transform(X_train)
X_test_PCA = svd.transform(X_test)

## Logistic Regression

### l1 regularization

In [11]:
model = LogisticRegression(solver='liblinear', penalty='l1', max_iter=500)
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0, 100.0],
    'class_weight': [None, 'balanced'],
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_PCA, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_predicted = best_model.predict(X_test_PCA)
evaluation = evaluate_prediction(y_test, y_predicted)
print(evaluation)

Best Parameters: {'C': 0.1, 'class_weight': None}
{'accuracy': 0.5028423772609819, 'f1': 0.39095747117429464, 'precision': 0.43272255956819716, 'recall': 0.43272255956819716, 'confusion_matrix': array([[ 456,   57,  560],
       [ 246,   56,  688],
       [ 306,   67, 1434]])}


### l2 regularization

In [12]:
model = LogisticRegression(solver='lbfgs', penalty='l2', max_iter=500)
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0, 100.0],
    'class_weight': [None, 'balanced'],
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_PCA, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_predicted = best_model.predict(X_test_PCA)
evaluation = evaluate_prediction(y_test, y_predicted)
print(evaluation)

Best Parameters: {'C': 0.1, 'class_weight': None}
{'accuracy': 0.4979328165374677, 'f1': 0.3973993406466607, 'precision': 0.4233805011239215, 'recall': 0.4233805011239215, 'confusion_matrix': array([[ 462,   88,  523],
       [ 251,   74,  665],
       [ 315,  101, 1391]])}


### elasticnet regularization

In [13]:
model = LogisticRegression(solver='saga', penalty='elasticnet', max_iter=5000)
param_grid = {
    'C': [0.1, 1.0, 10.0],
    'l1_ratio':[0, 0.5, 1]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_PCA, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_predicted = best_model.predict(X_test_PCA)
evaluation = evaluate_prediction(y_test, y_predicted)
print(evaluation)



Best Parameters: {'C': 0.1, 'l1_ratio': 1}
{'accuracy': 0.5031007751937985, 'f1': 0.3986280585832096, 'precision': 0.43726290270785473, 'recall': 0.43726290270785473, 'confusion_matrix': array([[ 460,   67,  546],
       [ 244,   71,  675],
       [ 307,   84, 1416]])}


## k-Nearest Neighbors

In [14]:
model = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'], # handling imbalansed data in this algorithm
    'metric': ['euclidean', 'manhattan']
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_PCA, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_predicted = best_model.predict(X_test_PCA)
evaluation = evaluate_prediction(y_test, y_predicted)
print(evaluation)

Best Parameters: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
{'accuracy': 0.4682170542635659, 'f1': 0.40608913752563097, 'precision': 0.4108507517544589, 'recall': 0.4108507517544589, 'confusion_matrix': array([[ 434,  199,  440],
       [ 284,  166,  540],
       [ 364,  231, 1212]])}


## Support Vector Machine

In [11]:
model = SVC(kernel='linear')
param_grid = {
    'C': [0.1, 1.0]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_PCA, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_predicted = best_model.predict(X_test_PCA)
evaluation = evaluate_prediction(y_test, y_predicted)
print(evaluation)

Best Parameters: {'C': 1.0}
{'accuracy': 0.5077519379844961, 'f1': 0.3690068313726808, 'precision': 0.4106410832806297, 'recall': 0.4106410832806297, 'confusion_matrix': array([[ 431,   22,  620],
       [ 240,   15,  735],
       [ 263,   25, 1519]])}


## Decision Tree

In [13]:
model = DecisionTreeClassifier()
param_grid = {
    'max_depth': [5, 10, 15, 20],    # Maximum depth of the tree
    'class_weight': [None, 'balanced']
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_PCA, y_train)
print("Best Parameters:, 100.0", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_predicted = best_model.predict(X_test_PCA)
evaluation = evaluate_prediction(y_test, y_predicted)
print(evaluation)

Best Parameters:, 100.0 {'class_weight': None, 'max_depth': 5}
{'accuracy': 0.4669250645994832, 'f1': 0.2771774960899296, 'precision': 0.34228811933696673, 'recall': 0.34228811933696673, 'confusion_matrix': array([[ 148,    1,  924],
       [ 136,    1,  853],
       [ 146,    3, 1658]])}


## Random Forest

In [14]:
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15, 20],
    'class_weight': [None, 'balanced'],
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_PCA, y_train)
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_predicted = best_model.predict(X_test_PCA)
evaluation = evaluate_prediction(y_test, y_predicted)
print(evaluation)

Best Parameters: {'class_weight': None, 'max_depth': 10, 'n_estimators': 150}
{'accuracy': 0.49844961240310076, 'f1': 0.31670493701250885, 'precision': 0.4382388473761414, 'recall': 0.4382388473761414, 'confusion_matrix': array([[ 200,   14,  859],
       [ 103,   11,  876],
       [  76,   13, 1718]])}
