# Import

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import f_oneway

In [None]:
# Make printed DataFrames show in full: None removes the row, column and
# per-cell width limits; display.width is set to 0 exactly as before.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = 0
pd.options.display.max_colwidth = None

# EDA Function

In [None]:
def eda(df, name, target='player_match_minutes'):
    """Print and plot a quick exploratory data analysis of ``df``.

    The report consists of, in order: shape, dtype counts, head, ``info``,
    ``describe``, a per-column table of unique/null counts, the lists of
    numerical and categorical columns, two boxplots of the target (with and
    without outliers), a histogram per numerical column, and the
    correlations of the numerical columns with the target (printed, plus a
    heatmap of the 15 strongest ones).

    Args:
        df: pandas DataFrame to analyse.
        name: Label used in the report header.
        target: Name of the target column. Sections that need it are
            skipped when the column is absent (boxplots) or not among the
            numerical columns (correlations).

    Returns:
        None. Everything is printed or shown with matplotlib.
    """
    print(f"\n EDA for {name}")

    print(f"Shape: {df.shape}")

    print("\n Data Types: ")
    print(df.dtypes.value_counts())

    print("\n Head: ")
    print(df.head())

    print("\n Info: ")
    # DataFrame.info() writes the report itself and returns None, so wrapping
    # it in print() would append a stray "None" line.
    df.info()

    print("\n Describe: ")
    print(df.describe())

    print("\n Features Unique & Missing Val: ")
    features = pd.DataFrame({
        'Features': list(df.columns),
        'Number of unique values': df.nunique().to_numpy(),
        'Number of nulls': df.isnull().sum().to_numpy(),
    })
    print(features)

    print("\n Num Ft vs Cat Ft: ")
    num_ft = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
    cat_ft = df.select_dtypes(include=["object", "bool"]).columns.tolist()
    print(f"\n Numerical ft: {num_ft} \n Total = {len(num_ft)}")
    print(f"\n Categorical ft: {cat_ft} \n Total = {len(cat_ft)}")  # check no more cat

    # print("\n Cat ft: ")
    # for col in cat_ft:
    #     unique_values = df[col].nunique()
    #     print(f"{col}: {unique_values} unique values")

    # Boxplots of the target, first with outliers and then with them hidden.
    if target in df.columns:
        plt.figure(figsize=(8, 4))
        sns.boxplot(x=df[target])
        plt.title(f"Boxplot of {target}")
        plt.show()

        plt.figure(figsize=(8, 4))
        sns.boxplot(x=df[target], showfliers=False)
        plt.title(f"Boxplot of {target} (Outliers Hidden)")
        plt.show()
    else:
        print(f"\n Target column '{target}' not found; skipping target boxplots.")

    # Histograms of the numerical features on a 3-column grid.
    if num_ft:
        n_cols = 3
        # Ceiling division: exactly enough rows, no spare empty row when the
        # number of features is a multiple of n_cols.
        n_rows = (len(num_ft) + n_cols - 1) // n_cols
        fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows))
        axs = axs.flatten()
        for ax, col in zip(axs, num_ft):
            sns.histplot(df[col], kde=True, bins=30, ax=ax)
            ax.set_title(f"Distribution of {col}")
        # Remove the unused axes at the end of the last row.
        for ax in axs[len(num_ft):]:
            fig.delaxes(ax)
        plt.tight_layout()
        plt.show()

    # Correlation analysis (only meaningful when the target is numerical).
    if target in num_ft:
        corr = df[num_ft].corr()
        target_corr = corr[target].drop(target).sort_values(ascending=False)

        print(f"\nCorrelations with target ({target}):")
        print(target_corr)

        top_corr_feats = target_corr.abs().sort_values(ascending=False).head(15).index.tolist()
        # Slice the matrix already computed instead of recomputing it; the
        # coefficients are pairwise, so the sub-matrix is unchanged.
        heatmap_cols = top_corr_feats + [target]
        corr_subset = corr.loc[heatmap_cols, heatmap_cols]

        plt.figure()
        sns.heatmap(corr_subset, annot=False, cmap="coolwarm")
        plt.title("Top 15 Feature")
        plt.tight_layout()
        plt.show()

    ##Anova - if there are cat
    # print("\nANOVA Tests:")
    # for col in cat_ft:
    #     groups = [df[target][df[col] == category] for category in df[col].unique()]
    #     anova_result = f_oneway(*groups)
    #     print(f"ANOVA result for {col}: F-statistic = {anova_result.statistic:.2f}, p-value = {anova_result.pvalue:.3f}")
    

In [None]:
# Read the workbook to analyse into a DataFrame.
# NOTE(review): "data files" looks like a placeholder path; confirm the real location.
data = pd.read_excel("data files")


In [None]:
# Run the EDA report on the loaded frame, labelled "data" in the header.
eda(df=data, name="data")