In [1]:
import pandas as pd
from sklearn.linear_model import LassoCV
from statsmodels.tools import add_constant

def read_data(group_name, data_dir="C:/Users/koala/Documents/МГУ/6 семестр/Untitled Folder/new"):
    """Load the per-group panel CSV and index it by player id and season.

    Parameters
    ----------
    group_name : str
        Group label; it is used both as the sub-folder name and as the
        prefix of the CSV file name.
    data_dir : str or os.PathLike, optional
        Folder that contains one sub-folder per group. The default is the
        directory the notebook was originally written against, so existing
        ``read_data(group_name)`` calls behave exactly as before; pass another
        folder to run the notebook on a different machine.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV with a two-level index built from the ``id`` and
        ``Season`` columns.
    """
    file_path = f"{data_dir}/{group_name}/{group_name}_group_all_seasons_dummy_updated_new.csv"
    return pd.read_csv(file_path, index_col=['id', 'Season'])

def build_lasso_regression(df, target, features):
    """Fit a cross-validated Lasso of ``target`` on ``features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the target column and the feature columns.
    target : str
        Name of the dependent-variable column.
    features : list of str
        Names of the regressor columns.

    Returns
    -------
    object
        Whatever ``LassoCV(cv=5, random_state=42).fit(...)`` returns, i.e. the
        fitted estimator.
    """
    # The design matrix is the selected features plus the column that
    # statsmodels' add_constant contributes.
    # NOTE(review): LassoCV presumably fits its own intercept as well, which
    # would explain why no 'const' row survives the non-zero filter in the
    # printed results - confirm before relying on the const coefficient.
    design = add_constant(df[features])
    response = df[target]
    estimator = LassoCV(cv=5, random_state=42)
    return estimator.fit(design, response)

# Groups to model and, for each of them, the candidate regressors offered to the Lasso.
group_names = ['red', 'yellow', 'green']
regressors = {
    'red': ['Age', 'Age^2', 'Gls', 'SoT%', 'Cmp%', 'Ast', 'Tkl%', 'Clr', 'Att.5', 'Succ%', 'PPM', 'CrdY', 'CrdR', 'Won%', 'def', 'mid', 'forw'],
    'yellow': ['Age', 'xG', 'Blocks', 'Tkl+Int', 'Err', 'Def Pen', 'Att.5', 'Succ%', 'Mn/Sub', 'xG+/-90', 'OG', 'Won%', 'def', 'mid', 'forw', 'Age^2'],
    'green': ['Age', 'Sh/90', 'np:G-xG', 'Dist', 'npxG/Sh', 'Cmp%', 'A-xAG', 'Crs', 'TO', 'Tkl%', 'Tkl+Int', 'Err', 'Def Pen', 'Succ%', '44986', 'onGA', 'xG+/-90', 'CrdY', 'CrdR', 'Won%', 'def', 'mid', 'forw', 'Age^2']
}

# Fitted estimator per group, kept so later cells can inspect the models.
lasso_models = {}

for group_name in group_names:
    df = read_data(group_name)
    candidate_features = regressors[group_name]
    lasso_model = build_lasso_regression(df, 'RATING', candidate_features)
    lasso_models[group_name] = lasso_model

    print(f"\nLasso Regression Results for {group_name} group:")
    # Row labels follow the design matrix built in build_lasso_regression:
    # the added constant first, then the regressors in list order.
    row_labels = ['const'] + candidate_features
    coefficients = pd.Series(lasso_model.coef_, index=row_labels, name='Coefficient')
    # Show only the regressors the Lasso kept (non-zero coefficient).
    coef_df = coefficients[coefficients != 0].to_frame()
    print(coef_df)



Lasso Regression Results for red group:
       Coefficient
Age^2     0.004797
Gls       0.272159
SoT%      0.005942
Cmp%      0.123940
Ast       0.315364
Tkl%     -0.007367
Clr       0.015809
Att.5     0.024235
Succ%     0.007319
PPM       2.645793
CrdY      0.018055
Won%      0.007446

Lasso Regression Results for yellow group:
         Coefficient
xG          0.337470
Blocks      0.009266
Tkl+Int     0.014493
Err         0.017769
Def Pen     0.009102
Att.5       0.025955
Succ%       0.015606
Mn/Sub     -0.001260
xG+/-90     2.393429
Won%        0.002479
Age^2       0.005079

Lasso Regression Results for green group:
         Coefficient
Sh/90       0.663461
np:G-xG     0.244772
Dist       -0.080479
Cmp%        0.109724
A-xAG       0.114243
Crs         0.011196
TO          0.248788
Tkl%       -0.018398
Tkl+Int     0.016514
Def Pen     0.009428
Succ%       0.001857
44986       0.019125
onGA       -0.031843
xG+/-90     1.527393
Won%        0.004809
Age^2       0.005346


In [9]:
import pandas as pd
from linearmodels import PanelOLS, RandomEffects
from statsmodels.tools.tools import add_constant
from linearmodels.panel import compare

def build_fixed_effects_regression(df, target, features):
    """Estimate a PanelOLS model with entity and time effects.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel data containing the target and feature columns.
    target : str
        Name of the dependent-variable column.
    features : list of str
        Names of the regressor columns; a constant is added to them.

    Returns
    -------
    object
        The result of calling ``fit()`` on the configured ``PanelOLS`` model.
    """
    exog = add_constant(df[features])
    dependent = df[target]
    # drop_absorbed=True asks the model to drop regressors absorbed by the
    # effects (the notebook output reports 'forw' being removed this way);
    # check_rank=False turns the model's rank check off.
    model_options = dict(
        entity_effects=True,
        time_effects=True,
        drop_absorbed=True,
        check_rank=False,
    )
    return PanelOLS(dependent, exog, **model_options).fit()

def build_random_effects_regression(df, target, features):
    """Estimate a RandomEffects panel model of ``target`` on ``features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel data containing the target and feature columns.
    target : str
        Name of the dependent-variable column.
    features : list of str
        Names of the regressor columns; a constant is added to them.

    Returns
    -------
    object
        The result of calling ``fit()`` on the ``RandomEffects`` model.
    """
    exog = add_constant(df[features])
    dependent = df[target]
    # check_rank=False turns the model's rank check off, as in the
    # fixed-effects builder.
    return RandomEffects(dependent, exog, check_rank=False).fit()

# Regressor sets for the panel models; this rebinds the `regressors` and
# `group_names` names used by the cells that follow.
group_names = ['red', 'yellow', 'green']
regressors = {
    'red': ['Gls', 'SoT%', 'Cmp%', 'Ast', 'Tkl%', 'Clr', 'Att.5', 'Succ%', 'PPM', 'CrdY', 'CrdR', 'Won%', 'def', 'mid', 'forw'],
    'yellow': ['xG', 'Blocks', 'Tkl+Int', 'Err', 'Def Pen', 'Att.5', 'Succ%', 'Mn/Sub', 'xG+/-90', 'OG', 'Won%', 'def', 'mid', 'forw'],
    'green': ['Sh/90', 'np:G-xG', 'Dist', 'npxG/Sh', 'Cmp%', 'A-xAG', 'Crs', 'TO', 'Tkl%', 'Tkl+Int', 'Err', 'Def Pen', 'Succ%', '44986', 'onGA', 'xG+/-90', 'CrdY', 'CrdR', 'Won%', 'def', 'mid', 'forw']
}

for group_name in group_names:
    # read_data comes from the first cell of the notebook.
    df = read_data(group_name)
    panel_features = regressors[group_name]

    # Fit both specifications on the same data so they can be compared.
    fixed_effects_res = build_fixed_effects_regression(df, 'RATING', panel_features)
    random_effects_res = build_random_effects_regression(df, 'RATING', panel_features)

    print(f"\nFixed Effects Regression Results for {group_name} group:")
    print(fixed_effects_res)

    print(f"\nRandom Effects Regression Results for {group_name} group:")
    print(random_effects_res)

    # Side-by-side table of the two fitted models.
    fitted_models = {'FE': fixed_effects_res, 'RE': random_effects_res}
    print(compare(fitted_models))

Variables have been fully absorbed and have removed from the regression:

forw

  fixed_effects_res = fixed_effects_mod.fit()



Fixed Effects Regression Results for red group:
                          PanelOLS Estimation Summary                           
Dep. Variable:                 RATING   R-squared:                        0.2687
Estimator:                   PanelOLS   R-squared (Between):              0.3722
No. Observations:                2464   R-squared (Within):               0.2606
Date:                Sun, Apr 09 2023   R-squared (Overall):              0.3587
Time:                        21:18:54   Log-likelihood                   -4258.9
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      48.047
Entities:                         616   P-value                           0.0000
Avg Obs:                       4.0000   Distribution:                 F(14,1831)
Min Obs:                       4.0000                                           
Max Obs:                       4.0000   F-statistic (robust)


Fixed Effects Regression Results for yellow group:
                          PanelOLS Estimation Summary                           
Dep. Variable:                 RATING   R-squared:                        0.2190
Estimator:                   PanelOLS   R-squared (Between):              0.2759
No. Observations:                2464   R-squared (Within):               0.2106
Date:                Sun, Apr 09 2023   R-squared (Overall):              0.2680
Time:                        21:19:00   Log-likelihood                   -4339.9
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      39.515
Entities:                         616   P-value                           0.0000
Avg Obs:                       4.0000   Distribution:                 F(13,1832)
Min Obs:                       4.0000                                           
Max Obs:                       4.0000   F-statistic (robu


Fixed Effects Regression Results for green group:
                          PanelOLS Estimation Summary                           
Dep. Variable:                 RATING   R-squared:                        0.2209
Estimator:                   PanelOLS   R-squared (Between):              0.2550
No. Observations:                2464   R-squared (Within):               0.2156
Date:                Sun, Apr 09 2023   R-squared (Overall):              0.2502
Time:                        21:19:14   Log-likelihood                   -4336.9
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      24.621
Entities:                         616   P-value                           0.0000
Avg Obs:                       4.0000   Distribution:                 F(21,1824)
Min Obs:                       4.0000                                           
Max Obs:                       4.0000   F-statistic (robus

In [31]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from linearmodels.panel import RandomEffects
from scipy.stats import shapiro


import numpy as np
from scipy import stats

def detect_outliers_z_score(data, threshold=3):
    """Locate observations whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    data : array-like
        Values to screen; z-scores are computed with ``scipy.stats.zscore``.
    threshold : float, optional
        Cut-off on the absolute z-score (default 3). The comparison is
        strict, so a value exactly at the threshold is not flagged.

    Returns
    -------
    tuple of numpy.ndarray
        Positional indices of the flagged observations, one array per
        dimension of ``data`` (the same shape of result ``np.where`` gives
        when called with a single condition).
    """
    magnitudes = np.absolute(stats.zscore(data))
    exceeds = magnitudes > threshold
    # Single-argument np.where(cond) is the same as asarray(cond).nonzero().
    return np.asarray(exceeds).nonzero()

# For every group: refit the random-effects model and screen its residuals
# for observations more than three standard deviations from the mean.
for group_name in group_names:
    df = read_data(group_name)
    random_effects_res = build_random_effects_regression(df, 'RATING', regressors[group_name])

    print(f"\nRandom Effects Regression Results for {group_name} group:")
    print(random_effects_res)

    residuals = random_effects_res.resids
    outliers = detect_outliers_z_score(residuals)
    # detect_outliers_z_score returns a tuple; its first entry holds the
    # positions within `residuals`, not (id, Season) labels.
    outlier_positions = outliers[0]

    print(f"\nOutliers detected for {group_name} group:")
    if len(outlier_positions) == 0:
        print("No outliers detected")
    else:
        print(f"Indices: {outliers[0]}")



Random Effects Regression Results for red group:
                        RandomEffects Estimation Summary                        
Dep. Variable:                 RATING   R-squared:                        0.3602
Estimator:              RandomEffects   R-squared (Between):              0.5292
No. Observations:                2464   R-squared (Within):               0.2003
Date:                Sun, Apr 09 2023   R-squared (Overall):              0.4894
Time:                        23:27:27   Log-likelihood                   -4942.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      91.868
Entities:                         616   P-value                           0.0000
Avg Obs:                       4.0000   Distribution:                 F(15,2448)
Min Obs:                       4.0000                                           
Max Obs:                       4.0000   F-statistic (robust

In [32]:
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

# For every group: refit the random-effects model and run a Breusch-Pagan
# test for heteroskedasticity on its residuals.
for group_name in group_names:
    df = read_data(group_name)
    random_effects_res = build_random_effects_regression(df, 'RATING', regressors[group_name])

    print(f"\nRandom Effects Regression Results for {group_name} group:")
    print(random_effects_res)

    # Conducting Breusch-Pagan test
    residuals = random_effects_res.resids
    # het_breuschpagan pairs residuals and regressors by position, so pick
    # the regressor rows by the residuals' own index rather than trusting the
    # CSV row order.
    # NOTE(review): the panel estimator may sort observations or drop rows
    # with missing values - selecting by label keeps both inputs aligned in
    # either case and is a no-op when the order already matches.
    X = df.loc[residuals.index, regressors[group_name]]
    bp_test = het_breuschpagan(residuals, sm.add_constant(X))

    print(f"\nBreusch-Pagan test for {group_name} group:")
    print(f"LM statistic: {bp_test[0]}, p-value: {bp_test[1]}, f-value: {bp_test[2]}, f_p-value: {bp_test[3]}")

    # Checking for heteroskedasticity using a significance level of 0.05
    # (decision is based on the LM-test p-value, bp_test[1]).
    if bp_test[1] < 0.05:
        print(f"Heteroskedasticity detected for {group_name} group.")
    else:
        print(f"No heteroskedasticity detected for {group_name} group.")



Random Effects Regression Results for red group:
                        RandomEffects Estimation Summary                        
Dep. Variable:                 RATING   R-squared:                        0.3602
Estimator:              RandomEffects   R-squared (Between):              0.5292
No. Observations:                2464   R-squared (Within):               0.2003
Date:                Sun, Apr 09 2023   R-squared (Overall):              0.4894
Time:                        23:30:50   Log-likelihood                   -4942.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      91.868
Entities:                         616   P-value                           0.0000
Avg Obs:                       4.0000   Distribution:                 F(15,2448)
Min Obs:                       4.0000                                           
Max Obs:                       4.0000   F-statistic (robust