# Premier League Score Difference Prediction LGBM

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import seaborn as sns
import random
import optuna
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [None]:
# Read the raw match table from the Kaggle input folder.
data0 = pd.read_csv(
    "../input/premier-league-matches-20142020/matchesall2014-2020.csv"
)
# Echo the column names as a plain list, then display the frame itself.
print(list(data0.columns))
data0

In [None]:
# Print a per-column summary (dtype and non-null count) of the raw table.
data0.info()

In [None]:
# Match-up key "<home>-<away>". Built with vectorised string concatenation
# rather than a Python-level, row-by-row apply(axis=1).
data0['HOME-AWAY'] = data0['Home Team'] + '-' + data0['Away Team']
# Target: goal difference seen from the home side
# (positive = home scored more, 0 = level, negative = away scored more).
data0['SCORE-DIFF'] = data0['Home Team Goals Scored'] - data0['Away Team Goals Scored']
data0

In [None]:
# Frequency of each raw goal difference (before the extremes are capped).
print(data0['SCORE-DIFF'].value_counts())

In [None]:
# Cap the goal difference at +/-4 so that all larger margins share one bucket.
# For integer-valued differences this is exactly the former pair of
# element-wise np.where passes (>3 -> 4, <-3 -> -4), done in one vectorised call.
data0['SCORE-DIFF'] = data0['SCORE-DIFF'].clip(lower=-4, upper=4)
print(data0['SCORE-DIFF'].value_counts())

In [None]:
# NOTE: plain assignment, not a copy - data1 and data0 name the same DataFrame.
data1=data0

In [None]:
# Integer-encode every object-dtype column, using a fresh LabelEncoder per
# column. `df` is only another name for `data0`, so the encoded values are
# written into that same DataFrame.
from sklearn.preprocessing import LabelEncoder
df = data0
for c in [name for name in df.columns if df[name].dtype == 'object']:
    # Learn the label set from the column, then map the column onto it.
    lbl = LabelEncoder().fit(df[c].tolist())
    df[c] = lbl.transform(df[c].to_numpy())
data1 = df

In [None]:
# Reproducible random permutation of the row positions 0..m-1.
m = len(data1)
M = list(range(m))
random.seed(2021)
random.shuffle(M)

# Columns kept out of the feature matrix: the 'Unnamed: 0' column, the
# score/goal columns, and the target itself (SCORE-DIFF is derived from the
# goals-scored columns).
drop0 = [
    'Unnamed: 0',
    'Score',
    'Half Time Score',
    'Home Team Goals Scored',
    'Away Team Goals Scored',
    'Home Team Goals Conceeded',
    'Away Team Goals Conceeded',
    'SCORE-DIFF',
]
dataX = data1.drop(columns=drop0)
dataY0 = data1['SCORE-DIFF']

In [None]:
# Feature names as a plain list; later cells select columns with it.
columns=dataX.columns.to_list()
print(columns)

In [None]:
# The first three quarters of the shuffled positions train; the rest test.
split_at = (m // 4) * 3
train_pos, test_pos = M[:split_at], M[split_at:]

# M holds row *positions*, so the target is sliced with .iloc exactly like the
# features. Plain `dataY0[...]` would treat the integers as index labels,
# which only matches positions while the index is the default 0..m-1 range.
trainX = dataX.iloc[train_pos]
trainY0 = dataY0.iloc[train_pos]

testX = dataX.iloc[test_pos]
testY0 = dataY0.iloc[test_pos]

In [None]:
def objective(trial, data=trainX, target=trainY0):
    """Optuna objective: fit one LightGBM candidate and score it on a hold-out.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: Feature table; defaults to the notebook's ``trainX``.
        target: Labels aligned with ``data``; defaults to ``trainY0``.

    Returns:
        Root-mean-squared error between the predicted and the true labels on
        a fixed 20 % hold-out of ``data`` (lower is better).
    """
    # Fixed random_state: every trial is scored on the same split.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 200),
        # NOTE(review): 'regression' and 'rmse' look like aliases of the same
        # L2 objective in LightGBM, so this choice is presumably a no-op.
        # It is kept because a later plot_contour call references 'objective'.
        'objective': trial.suggest_categorical('objective', ['regression', 'rmse']),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'boosting': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 1e-3, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
        'bagging_freq': trial.suggest_int('bagging_freq', 3, 10),
        # suggest_float replaces the deprecated suggest_uniform.
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        # Logging level, not a model hyper-parameter: fixed instead of sampled
        # so it neither widens the search space nor ends up in best_trial.params.
        'verbosity': -1,
    }
    model = lgb.LGBMClassifier(**param)
    # NOTE(review): the early_stopping_rounds/verbose fit keywords belong to the
    # older LightGBM API; presumably they must become callbacks on LightGBM >= 4.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        early_stopping_rounds=100,
        verbose=False,
    )
    preds = model.predict(test_x)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): same value,
    # but it does not depend on a keyword newer scikit-learn releases dropped.
    rmse = np.sqrt(mean_squared_error(test_y, preds))

    return rmse

In [None]:
# Seed the sampler so the hyper-parameter sampling is repeatable, in line with
# the other seeded steps of this notebook (shuffle, hold-out split, K-fold).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=2021),
)
study.optimize(objective, n_trials=64)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Tabulate the study: one row per trial.
study.trials_dataframe()

In [None]:
# Objective value of every trial, in the order the trials were run.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate view linking each trial's hyper-parameters to its score.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# One panel per hyper-parameter: sampled value against objective value.
optuna.visualization.plot_slice(study)

In [None]:
# Contour of the objective value over the (num_leaves, objective) pair.
optuna.visualization.plot_contour(study, params=['num_leaves','objective'])

In [None]:
# Relative importance of each hyper-parameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Empirical distribution function of the objective values across trials.
optuna.visualization.plot_edf(study)

In [None]:
# Hyper-parameters of the best trial; the cross-validation cell below passes
# them to LGBMClassifier.
Best_trial=study.best_trial.params
print(Best_trial)

In [None]:
# 5-fold cross-validation on the training split with the tuned parameters.
# Each fold fits a fresh classifier, adds its share of the test-split
# predictions to `preds`, and prints the RMSE on its own validation part.
# NOTE(review): `preds` averages the output of predict(), presumably hard
# class labels; the original marker hints that predict_proba was considered.
preds = np.zeros(len(testX))
kf = KFold(n_splits=5, random_state=48, shuffle=True)
for trn_idx, test_idx in kf.split(trainX[columns], trainY0):
    X_tr, X_val = trainX[columns].iloc[trn_idx], trainX[columns].iloc[test_idx]
    y_tr, y_val = trainY0.iloc[trn_idx], trainY0.iloc[test_idx]
    model = lgb.LGBMClassifier(**Best_trial)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    preds += model.predict(testX[columns]) / kf.n_splits   # TODO(review): predict_proba?
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): same value,
    # but it does not depend on a keyword newer scikit-learn releases dropped.
    rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    print(rmse)

In [None]:
# Display the estimator left bound to `model` by the last cross-validation fold.
model

In [None]:
# Score the held-out test split with `model`, i.e. the estimator fitted in the
# final iteration of the cross-validation loop.
from sklearn.metrics import classification_report
ground = np.array(testY0)          # true labels as a NumPy array
y_pred = model.predict(testX)      # predicted labels for the same rows

In [None]:
# Print the per-class classification metrics for the test split.
print(classification_report(ground,y_pred))

In [None]:
# (empty notebook cell - the bare `nan` left by the export was an undefined name)

In [None]:
# (empty notebook cell - the bare `nan` left by the export was an undefined name)