# 0. Imports

In [None]:
#All the libraries and modules required
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import shap
import pymc as pm
import arviz as az

from datetime import datetime

from scipy.stats import f_oneway
from scipy.cluster.hierarchy import linkage, dendrogram
from category_encoders import TargetEncoder

from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, FunctionTransformer, RobustScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, TweedieRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_error, mean_absolute_error, r2_score
from feature_engine.outliers import OutlierTrimmer
from sklearn.feature_selection import SelectKBest, f_regression, RFECV, SequentialFeatureSelector
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, Isomap, TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

In [None]:
# Configure pandas display options; None removes the corresponding limit.
display_options = {
    "display.max_rows": None,
    "display.max_columns": None,
    "display.width": 0,
    "display.max_colwidth": None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# 1. Preprocessing

## 1.1 Basic EDA

In [None]:
# Load the raw dataset; the string below is a placeholder for the CSV path.
df = pd.read_csv(filepath_or_buffer="""Data file name""")

In [None]:
# Per-column summary of the raw data: distinct-value count and null count.
# Built in a single vectorised pass instead of growing the frame one row at
# a time with `.loc`, which re-allocates on every insert.
features = pd.DataFrame({
    'Features': list(df.columns),
    'Number of unique values': df.nunique().to_numpy(),
    'Number of nulls': df.isnull().sum().to_numpy(),
})

features

In [None]:
# Split the column names by dtype: int64/float64 count as numerical,
# object/bool as categorical. Other dtypes land in neither list.
numeric_dtypes = ["int64", "float64"]
categorical_dtypes = ["object", "bool"]

num_ft = df.select_dtypes(include=numeric_dtypes).columns.tolist()
cat_ft = df.select_dtypes(include=categorical_dtypes).columns.tolist()

print(f"\n Numerical ft: {num_ft} \n Total = {len(num_ft)}")
print(f"\n Categorical ft: {cat_ft} \n Total = {len(cat_ft)}")

In [None]:
# Pairwise correlations between the numeric columns, then each column's
# correlation with the target, largest first.
correlations = df[num_ft].corr()
target_correlations = correlations["""target name"""]

print("Correlations with Target:\n")
print(target_correlations.sort_values(ascending=False))

## 1.2 Pre processing

In [None]:
# Remove columns flagged by the correlation check above (those with NaN correlation). NOTE: the list below names four columns, not three - confirm it before enabling.
# df = df.drop(['account_id', 'competition_id', 'player_match_360_minutes', 'player_match_claim_success'], axis=1)

In [None]:
# Recompute the numerical / categorical column lists from the current `df`.
num_ft, cat_ft = (
    df.select_dtypes(include=dtypes).columns.tolist()
    for dtypes in (["int64", "float64"], ["object", "bool"])
)

### 1.2.1 Null values

In [None]:
# Fill missing values: median for numerical columns, most frequent value for
# categorical columns.
# NOTE(review): the imputer is fitted on every row of `df` (target column
# included, if it is numeric) before the train/test split below, so test rows
# influence the fill values - consider fitting on the training rows only.
num_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
cat_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))])

imputer = ColumnTransformer(
    transformers=[
        ('num', num_imputer, num_ft),
        ('cat', cat_imputer, cat_ft),
    ]
)

# The transformer emits numerical columns first, then categorical ones, so
# the rebuilt frame uses that same column order.
imputed_columns = num_ft + cat_ft
df_transformed = imputer.fit_transform(df)
df_imputed = pd.DataFrame(df_transformed, columns=imputed_columns, index=df.index)

# Convert the numerical columns back to numeric dtypes.
df_imputed[num_ft] = df_imputed[num_ft].apply(pd.to_numeric)

In [None]:
# Optional outlier trimming - I don't think it is needed, so it is left disabled.
# trimmer = OutlierTrimmer(capping_method='quantiles', tail='both', fold=0.05)

# df_num_trimmed = trimmer.fit_transform(df_imputed[num_ft])
# trimmed_index = df_num_trimmed.index

# df_trimmed = df_imputed.loc[trimmed_index]


In [None]:
# Per-column summary after imputation: distinct-value count and null count.
# The summary is driven by `df_imputed`'s own columns: `df_imputed` only
# holds the columns in num_ft + cat_ft, so looping over `df.columns` would
# raise KeyError for any column of another dtype.
features = pd.DataFrame({
    'Features': list(df_imputed.columns),
    'Number of unique values': df_imputed.nunique().to_numpy(),
    'Number of nulls': df_imputed.isnull().sum().to_numpy(),
})

features

### 1.2.2 Split Data

In [None]:
# Separate the target column from the predictor columns.
target = """target name"""
y = df_imputed[target]
X = df_imputed.drop(columns=[target])

In [None]:
# Hold out 20% of the rows as a test set; the seed is fixed at 0.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
# Numerical predictors are the numerical columns minus the target. The
# `target` variable defined when X and y were separated is reused here so the
# target name is written in one place only.
num_features = [col for col in num_ft if col != target]

# Standardise numerical predictors.
num_transform = Pipeline([
    ('scaler', StandardScaler())
])

# One-hot encode categorical predictors: categories unseen during fit are
# ignored at transform time, and `min_frequency=0.05` is passed to the
# encoder's infrequent-category handling.
cat_transform = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False, min_frequency=0.05))
])

preprocessor = ColumnTransformer([
    ('num', num_transform, num_features),
    ('cat', cat_transform, cat_ft)
])

In [None]:
# Fit the preprocessor on the training rows and transform them in one step.
X_train_scaled = preprocessor.fit_transform(X=X_train)

In [None]:
# Apply the already-fitted preprocessor to the test rows (no refitting).
X_test_scaled = preprocessor.transform(X=X_test)

# 2. Automated Feature Selection

## 2.1 SelectKBest

In [None]:
# k_val = range(10, 104, 10)
# kbest_results = []

# for k in k_val:
#     selector = SelectKBest(score_func=f_regression, k=k)
#     X_kbest = selector.fit_transform(X_train_scaled, y_train)

#     model = RandomForestRegressor(random_state=1)
#     scores = cross_val_score(model, X_kbest, y_train, cv=5, scoring='r2')

#     kbest_results.append((k, scores.mean()))

#     selected_ft = X_train.columns[selector.get_support()]
#     print(f"\nSelectKBest k={k}")
#     print("Selected features:", selected_ft.tolist())
#     print("Score avg results:")
#     print (scores.mean(), scores.std())


In [None]:
# df_kbest = pd.DataFrame(kbest_results, columns=["k", "r2"])
# df_kbest.plot(x="k", y="r2", kind="line", marker="o", title="SelectKBest Performance")
# plt.show()

## 2.2 RFECV
RFECV takes too long to run, so it is disabled for now; will run it later.

In [None]:
# rf = RandomForestRegressor(random_state=2)
# cv = KFold(n_splits=5, shuffle=True, random_state=2)

# rfecv = RFECV(
#     estimator=rf,
#     step=1,
#     cv=cv,
#     scoring='r2'
# )

# rfecv.fit(X_train_scaled, y_train)

# selected_features_rfecv = X_train.columns[rfecv.support_]
# print("Selected features (RFECV with R²):", selected_features_rfecv.tolist())
# print(f"Optimal number of features: {rfecv.n_features_}")


In [None]:
# plt.figure(figsize=(10, 5))
# plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_, marker='o')
# plt.xlabel("Number of Features Selected")
# plt.ylabel("Cross-Validated R² Score")
# plt.title("RFECV (Random Forest, R² Scoring)")
# plt.grid(True)
# plt.tight_layout()
# plt.show()

## 2.3 SFS

In [None]:
# sfs_results = []
# directions = ['forward', 'backward']
# n_feat = range(20, 104, 20)

# cv = KFold(n_splits=5, shuffle=True, random_state=3)

# for d in directions: 
#     for n in n_feat:
#         model = RandomForestRegressor(random_state=3)

#         sfs = SequentialFeatureSelector(
#             estimator=model,
#             n_features_to_select=n,
#             direction=d,
#         )
        
#         sfs.fit(X_train_scaled, y_train)
#         selected_feats = X_train.columns[sfs.get_support()]
        
#         X_selected = X_train_scaled[selected_feats]
#         scores = cross_val_score(model, X_selected, y_train, cv=cv, scoring='r2')
#         mean_score = scores.mean()
#         std_score = scores.std()
        
#         sfs_results.append({
#             'Direction': d,
#             'Num Features': n,
#             'Mean R2 Score': mean_score,
#             'Std Dev': std_score,
#             'Features': selected_feats.tolist()
#         })
        
# sfs_df = pd.DataFrame(sfs_results)
# print(sfs_df.sort_values(by='Mean R2 Score', ascending=False))
        

## 2.4 Evaluate

In [None]:
# #Make a new model for each with best params

# #SelectKBest
# k_selector = SelectKBest(score_func=f_regression, k=30) #CHANGE
# X_train_kbest = k_selector.fit_transform(X_train_scaled, y_train)
# X_test_kbest = k_selector.transform(X_test_scaled)
# selected_kbest_features = X_train.columns[k_selector.get_support()]

# model_kbest = RandomForestRegressor(random_state=42)
# model_kbest.fit(X_train_kbest, y_train)

# #RFECV
# rfe_selector = RFECV(estimator=RandomForestRegressor(random_state=42), step=1, cv=5, scoring='r2')
# rfe_selector.fit(X_train_scaled, y_train)
# selected_rfecv_features = X_train.columns[rfe_selector.support_]

# X_train_rfe = X_train_scaled[selected_rfecv_features]
# X_test_rfe = X_test_scaled[selected_rfecv_features]

# model_rfe = RandomForestRegressor(random_state=42)
# model_rfe.fit(X_train_rfe, y_train)

# #SFS
# sfs_selector = SequentialFeatureSelector(
#     estimator=RandomForestRegressor(random_state=42),
#     n_features_to_select=25, #Change
#     direction='forward', #Change
# )
# sfs_selector.fit(X_train_scaled, y_train)
# selected_sfs_features = X_train.columns[sfs_selector.get_support()]

# X_train_sfs = X_train_scaled[selected_sfs_features]
# X_test_sfs = X_test_scaled[selected_sfs_features]

# model_sfs = RandomForestRegressor(random_state=42)
# model_sfs.fit(X_train_sfs, y_train)



In [None]:
# for name, model in [('SelectKBest', model_kbest), ('RFE', model_rfe), ('SFS', model_sfs)]:
#     preds = model.predict(X_test_scaled)
#     mae = mean_absolute_error(y_test, preds)
#     r2 = r2_score(y_test, preds)
#     print(f"{name} MAE: {mae:.2f}, R^2: {r2:.3f}")

# 3. Building Model

## 3.1 Selecting Features

In [None]:
# selector = SelectKBest(score_func=f_regression, k=90)
# X_train_selected = selector.fit_transform(X_train_scaled, y_train)
# X_test_selected = selector.transform(X_test_scaled)

# Feature selection is currently disabled, so the preprocessed matrices are
# used as they are. Both names are defined so the test matrix is available
# under the same name the selector branch above would produce.
X_train_selected = X_train_scaled
X_test_selected = X_test_scaled

## 3.2 Grid Search

In [None]:
# # param_grid = {
# #     'n_estimators': [100, 300],
# #     'max_depth': [None, 10, 20],
# #     'max_features': ['sqrt', 0.5],
# #     'min_samples_leaf': [1, 3, 5]
# # }

# rfr = RandomForestRegressor(max_depth = 20, max_features = 0.5, min_samples_leaf = 5, n_estimators = 300, random_state=0)

# # grid_search = GridSearchCV(
# #     estimator=rfr,
# #     param_grid=param_grid,
# #     cv=5,
# #     scoring='r2'
# # )

# # grid_search.fit(X_train_selected, y_train)


In [None]:
# print("Best Parameters:", grid_search.best_params_)
# print("Best CV R2", grid_search.best_score_)

# best_model = grid_search.best_estimator_

In [None]:

# Random forest with fixed hyper-parameters (presumably the grid-search
# result from the disabled cells above - confirm).
rf_params = {
    "n_estimators": 300,
    "max_depth": 20,
    "max_features": 0.5,
    "min_samples_leaf": 5,
    "random_state": 0,
}
rfr = RandomForestRegressor(**rf_params)



In [None]:
# 5-fold cross-validated R^2 of the forest on the training matrix.
scores = cross_val_score(
    estimator=rfr,
    X=X_train_selected,
    y=y_train,
    scoring='r2',
    cv=5,
)

In [None]:
# Report the per-fold scores and their mean.
mean_r2 = scores.mean()
print("cv r2 scores", scores)
print("mean r2", mean_r2)

In [None]:
# Fit the forest on the full training matrix.
rfr.fit(X=X_train_selected, y=y_train)

# 4. Model Interpretation

In [None]:
# selected_ft = pd.Index(preprocessor.get_feature_names_out())[selector.get_support()]

# Names of the columns the model was actually trained on. The model sees the
# preprocessor's output (scaled numerical columns followed by the one-hot
# columns), not the raw `X_train` columns, so the names must come from the
# fitted preprocessor. Using `X_train.columns` gives the wrong number of
# labels as soon as a categorical column expands into several one-hot columns.
# Wrapped in a pandas Index so it stays the same type as before.
selected_ft = pd.Index(preprocessor.get_feature_names_out())

## 4.1 Permutation Importance

In [None]:
# Permutation importance of each model input, 10 shuffles per feature.
# NOTE(review): this is measured on the training data, not on held-out data.
result = permutation_importance(
    estimator=rfr,
    X=X_train_selected,
    y=y_train,
    n_repeats=10,
    random_state=0,
)

In [None]:
# Mean importance per feature, labelled by feature name, plus a copy sorted
# from least to most important for plotting.
importances = pd.Series(data=result.importances_mean, index=selected_ft)
importances_sorted = importances.sort_values()  # ascending is the default

In [None]:
# Horizontal bar chart of the sorted permutation importances.
figure_size = (10, 12)
plt.figure(figsize=figure_size)
importances_sorted.plot.barh()
plt.xlabel('Permutation Importance')
plt.tight_layout()
plt.show()

## 4.2 SHAP

In [None]:
# Build a SHAP explainer for the fitted forest, with the training matrix
# supplied as the second (masker) argument.
explainer = shap.Explainer(model=rfr, masker=X_train_selected)

In [None]:
# Compute SHAP values for every training row, with the additivity check
# switched off.
explain_kwargs = {"check_additivity": False}
shap_values = explainer(X_train_selected, **explain_kwargs)

In [None]:
# Summary plot of the SHAP values, labelled with the feature names.
shap.summary_plot(
    shap_values,
    feature_names=selected_ft,
    features=X_train_selected,
)

In [None]:
# Beeswarm view of the SHAP values computed above.
shap.plots.beeswarm(shap_values=shap_values)

In [None]:
# Waterfall breakdown for the first explained row.
first_explanation = shap_values[0]
shap.plots.waterfall(first_explanation)

In [None]:
# Fallback in case the generic Explainer cells raise an error: use
# TreeExplainer directly. Note that this rebinds `explainer` and `shap_values`.
explainer = shap.TreeExplainer(rfr)
shap_values = explainer.shap_values(X_train_selected)
shap.summary_plot(
    shap_values,
    features=X_train_selected,
    feature_names=selected_ft,
)

## 4.3 PDP

In [None]:
# Names of the three features with the highest permutation importance.
ranked_importances = importances.sort_values(ascending=False)
top_features = ranked_importances.index[:3].tolist()

In [None]:
# Names of the five features with the highest permutation importance
# (replaces any earlier, shorter `top_features` list).
ranked_importances = importances.sort_values(ascending=False)
top_features = ranked_importances.index[:5].tolist()

In [None]:
# Average partial-dependence curves for the top features, evaluated on a
# 50-point grid per feature.
pdp_display = PartialDependenceDisplay.from_estimator(
    estimator=rfr,
    X=X_train_selected,
    features=top_features,
    feature_names=selected_ft,
    kind='average',
    grid_resolution=50,
)
plt.tight_layout()
plt.show()

# 5. Dimensionality Reduction

## 5.1 PCA

In [None]:
# Project the training matrix onto its first two principal components.
n_projection_dims = 2
pca = PCA(n_components=n_projection_dims)
X_pca = pca.fit_transform(X=X_train_selected)

In [None]:
# Report the variance captured by each component, then scatter the 2-D
# projection coloured by the training target.
print(f"Explained variance: {pca.explained_variance_ratio_}")

pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.scatter(pc1, pc2, c=y_train, cmap='viridis', s=10)
plt.title("PCA Projection - linear 2d")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.colorbar(label="""target name""")
plt.tight_layout()
plt.show()

## 5.2 t-sne
I'll try Isomap and MDS later.

In [None]:
# 2-D t-SNE embedding of the training matrix.
# The iteration count is deliberately left at scikit-learn's default (1000)
# instead of being passed as `n_iter=1000`: that keyword was renamed to
# `max_iter` in scikit-learn 1.5 and later removed, so omitting it keeps the
# same setting on both older and newer releases.
tsne = TSNE(n_components=2, random_state=0, perplexity=30)
X_tsne = tsne.fit_transform(X_train_selected)

In [None]:
# Scatter the t-SNE embedding coloured by the training target.
tsne_dim1, tsne_dim2 = X_tsne[:, 0], X_tsne[:, 1]
plt.scatter(tsne_dim1, tsne_dim2, c=y_train, cmap='coolwarm', s=10)
plt.title("t-SNE Projection (2D)")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.colorbar(label="""target name""")
plt.tight_layout()
plt.show()

# 6. Clustering

## 6.1 K-means

In [None]:
# Partition the training rows into four k-means clusters (seed fixed at 0)
# and keep each row's cluster label.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
clusters = kmeans.fit_predict(X=X_train_selected)

In [None]:
# Visualise the k-means cluster labels on the 2-D PCA projection.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.scatter(pc1, pc2, c=clusters, cmap='Set2', s=10)
plt.title("KMeans Clusters on PCA Projection")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.show()

# 7. Final Model 

# 8. Model Evaluation