# [Фінальний проєкт](https://www.edu.goit.global/uk/learn/25315460/23598278/26258114/homework)
___

## Kaggle: [ML: Fundamentals and Applications 2025-03](https://www.kaggle.com/competitions/ml-fundamentals-and-applications-2025-03/leaderboard)

## Імпорт необхідних модулів

In [None]:
import datetime
import os
import warnings
import pandas as pd
import pandas.api.types
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
import matplotlib.gridspec as gridspec
from sklearn.decomposition import PCA

from typing import Sequence, Union, Optional
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.calibration import LabelEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import (
    RandomForestRegressor,
    RandomForestClassifier,
    GradientBoostingRegressor,
    GradientBoostingClassifier,
    VotingClassifier,
)
from sklearn.preprocessing import (
    StandardScaler, 
    OneHotEncoder, 
    PowerTransformer, 
    RobustScaler
)
from sklearn.metrics import (
    balanced_accuracy_score,
    make_scorer,
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    r2_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)

# Silence every warning so the notebook output stays compact.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn / xgboost — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Do not truncate wide frames when they are printed or displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)

## Kaggle metric utilities

In [None]:
"""
This script exists to reduce code duplication across metrics.
"""

import numpy as np
import pandas as pd
import pandas.api.types

from typing import Union


class ParticipantVisibleError(Exception):
    """Error whose message is considered safe to show to competition participants."""


class HostVisibleError(Exception):
    """Marker error that ``safe_call_score`` re-raises under this same type.

    By its name it is meant for the competition host rather than participants.
    """


def treat_as_participant_error(
    error_message: str, solution: Union[pd.DataFrame, np.ndarray]
) -> bool:
    """Decide whether an unexpected error message may be shown to participants.

    Many metrics can raise more errors than can be handled manually. This
    function attempts to identify errors that can be treated as
    ParticipantVisibleError without leaking any competition data.

    If the solution is purely numeric, and there are no numbers in the error
    message, then the error message is sufficiently unlikely to leak usable
    data and can be shown to participants.

    We expect this filter to reject many safe messages. It's intended only to
    reduce the number of errors we need to manage manually.

    Parameters
    ----------
    error_message : str
        Text of the exception raised by the metric.
    solution : pd.DataFrame or np.ndarray
        Ground-truth data the metric was called with.

    Returns
    -------
    bool
        True when the message is judged safe to expose, False otherwise.
        Any ``solution`` that is neither a DataFrame nor an ndarray yields
        False, because its contents cannot be verified as numeric.
    """
    # This check treats bools as numeric
    if isinstance(solution, pd.DataFrame):
        column_dtypes = solution.dtypes.values
        solution_is_all_numeric = all(
            pandas.api.types.is_numeric_dtype(x) for x in column_dtypes
        )
        solution_has_bools = any(
            pandas.api.types.is_bool_dtype(x) for x in column_dtypes
        )
    elif isinstance(solution, np.ndarray):
        solution_is_all_numeric = pandas.api.types.is_numeric_dtype(solution)
        solution_has_bools = pandas.api.types.is_bool_dtype(solution)
    else:
        # Unknown container type: stay conservative instead of failing with
        # an UnboundLocalError on the flags below.
        return False

    if not solution_is_all_numeric:
        return False

    # Any digit in the message could be a leaked ground-truth value.
    if any(char.isnumeric() for char in error_message):
        return False

    # With boolean targets the words "true"/"false" could leak labels too.
    if solution_has_bools:
        lowered = error_message.lower()
        if "true" in lowered or "false" in lowered:
            return False
    return True


def safe_call_score(metric_function, solution, submission, **metric_func_kwargs):
    """Run a metric and convert its failures into the competition error types.

    Errors whose class is named ``ParticipantVisibleError`` or
    ``HostVisibleError`` are re-raised as the corresponding class defined in
    this file. Any other error is shown to participants only when
    ``treat_as_participant_error`` judges its message safe; otherwise the
    original exception is raised again.

    Returns the metric value converted to a plain ``float``.
    """
    try:
        raw_score = metric_function(solution, submission, **metric_func_kwargs)
    except Exception as err:
        message = str(err)
        # Matching by class name also catches same-named classes that were
        # defined in another module or redefined later in this file.
        raised_kind = err.__class__.__name__
        if raised_kind == "ParticipantVisibleError":
            raise ParticipantVisibleError(message)
        if raised_kind == "HostVisibleError":
            raise HostVisibleError(message)
        if treat_as_participant_error(message, solution):
            raise ParticipantVisibleError(message)
        raise err
    # Explicit conversion avoids handing np.float64 scalars to the caller.
    return float(raw_score)


def verify_valid_probabilities(df: pd.DataFrame, df_name: str):
    """Check that every row of ``df`` is a valid probability distribution.

    ``df`` must hold only the target columns (no ID columns). ``df_name`` is
    used in the error messages. A ``ParticipantVisibleError`` is raised when
    the values are not numeric, fall outside [0, 1], or a row does not sum
    to one.
    """
    values_are_numeric = pandas.api.types.is_numeric_dtype(df.values)
    if not values_are_numeric:
        raise ParticipantVisibleError(f"All target values in {df_name} must be numeric")

    smallest_value = df.min().min()
    if smallest_value < 0:
        raise ParticipantVisibleError(
            f"All target values in {df_name} must be at least zero"
        )

    largest_value = df.max().max()
    if largest_value > 1:
        raise ParticipantVisibleError(
            f"All target values in {df_name} must be no greater than one"
        )

    row_totals = df.sum(axis=1)
    if not np.allclose(row_totals, 1):
        raise ParticipantVisibleError(
            f"Target values in {df_name} do not add to one within all rows"
        )


# A class with this name is already declared earlier in this file; this
# statement rebinds the name. ``safe_call_score`` compares class names rather
# than identities, so both declarations are treated alike there.
class ParticipantVisibleError(Exception):
    """Error whose message is considered safe to show to competition participants."""


def score(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    row_id_column_name: str,
    weights_column_name: Optional[str] = None,
    adjusted: bool = False,
) -> float:
    """
    Wrapper for https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html
    Compute the balanced accuracy.

    The balanced accuracy in binary and multiclass classification problems to
    deal with imbalanced datasets. It is defined as the average of recall
    obtained on each class.

    The best value is 1 and the worst value is 0 when ``adjusted=False``.

    The input frames are not modified: the row id (and weights) columns are
    removed from internal copies only.

    Parameters
    ----------
    solution : 1d DataFrame
    Ground truth (correct) target values.

    submission : 1d DataFrame
    Estimated targets as returned by a classifier.

    row_id_column_name : str
    Name of the row id column present in both frames; it is dropped before
    scoring and is not used for sorting or alignment.

    weights_column_name: optional str, the name of the sample weights column in the solution file.

    adjusted : bool, default=False
    When true, the result is adjusted for chance, so that random
    performance would score 0, while keeping perfect performance at a score
    of 1.

    Raises
    ------
    KeyError
    If ``row_id_column_name`` is missing from either frame.

    ValueError
    If ``weights_column_name`` is given but not found in ``solution``.

    ParticipantVisibleError
    If the weights are not numeric or the submission has more than one
    prediction column.

    References
    ----------
    .. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010).
    The balanced accuracy and its posterior distribution.
    Proceedings of the 20th International Conference on Pattern
    Recognition, 3121-24.
    .. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015).
    `Fundamentals of Machine Learning for Predictive Data Analytics:
    Algorithms, Worked Examples, and Case Studies
    <https://mitpress.mit.edu/books/fundamentals-machine-learning-predictive-data-analytics>`_.

    Examples
    --------

    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_true = [0, 1, 0, 0, 1, 0]
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true["id"] = range(len(y_true))
    >>> y_pred = [0, 1, 0, 0, 0, 1]
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred["id"] = range(len(y_pred))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.625
    """
    # Skip sorting and equality checks for the row_id_column since that should
    # already be handled. ``drop`` returns new frames, so the caller's
    # DataFrames keep their columns (the original ``del`` mutated them).
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    sample_weight = None
    if weights_column_name:
        if weights_column_name not in solution.columns:
            raise ValueError(
                f"The solution weights column {weights_column_name} is not found"
            )
        # ``pop`` only touches the local copy created above.
        sample_weight = solution.pop(weights_column_name).values
        if not pandas.api.types.is_numeric_dtype(sample_weight):
            raise ParticipantVisibleError("The solution weights are not numeric")

    if len(submission.columns) > 1:
        raise ParticipantVisibleError(
            f"The submission can only include one column of predictions. Found {len(submission.columns)}"
        )

    solution = solution.values
    submission = submission.values

    score_result = safe_call_score(
        balanced_accuracy_score,
        solution,
        submission,
        sample_weight=sample_weight,
        adjusted=adjusted,
    )

    return score_result

## 1. Завантаження даних

In [None]:
# Input data and output directories, resolved relative to the user's home.
PATH_TO_DATASETS = os.path.expanduser(
    "~/Projects/GoIT/MACHINE-LEARNING-NEO/datasets/final/data"
)
PATH_TO_RESULTS = os.path.expanduser(
    "~/Projects/GoIT/MACHINE-LEARNING-NEO/datasets/final/results"
)

# Only the results directory is created here; the dataset directory is
# expected to exist already.
os.makedirs(PATH_TO_RESULTS, exist_ok=True)
for location in (PATH_TO_DATASETS, PATH_TO_RESULTS):
    print(location)

# CSV files read by the loading cell.
train_file = os.path.join(PATH_TO_DATASETS, "final_proj_data.csv")
test_file = os.path.join(PATH_TO_DATASETS, "final_proj_test.csv")

In [None]:
# "customer churn"
# Name of the label column in the training CSV.
TARGET = "y"

# Load both data sets, training file first.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

# Show the shape and the first rows of each frame.
print(f"Training data shape: {train_df.shape}")
display(train_df.head())
print(f"Test data shape: {test_df.shape}")
display(test_df.head())

## 2. EDA

### Перевірка типів даних і відсутніх значень

In [None]:
# Shapes of both data sets.
for label, frame in (("Training", train_df), ("Test", test_df)):
    print(f"{label} data shape: {frame.shape}")

print("\nData types in training set:")
print(train_df.dtypes.value_counts())

# Per-column missing-value counts and their share of all training rows.
missing_summary = train_df.isnull().sum()
missing_percent = (missing_summary / len(train_df)) * 100
missing_data = pd.DataFrame(
    {"Missing Count": missing_summary, "Missing Percent": missing_percent}
)
# Keep only columns that have gaps, most incomplete first.
has_gaps = missing_data["Missing Count"] > 0
missing_data = missing_data[has_gaps].sort_values("Missing Percent", ascending=False)

print("\nFeatures with missing values:")
print(missing_data)

# Map of missing cells (rows x columns) in the training data.
sns.heatmap(train_df.isna(), cbar=False, cmap="viridis")

### Видаляємо пусті колонки

In [None]:
# Columns whose missing count equals the number of training rows hold no data.
n_train_rows = len(train_df)
columns_to_drop = [
    column for column, n_missing in missing_summary.items() if n_missing == n_train_rows
]

if columns_to_drop:
    print(f"Removing {len(columns_to_drop)} completely empty columns: {columns_to_drop}")
    train_df.drop(columns=columns_to_drop, inplace=True)
    # The test frame may lack some of these columns, hence errors="ignore".
    test_df.drop(columns=columns_to_drop, errors="ignore", inplace=True)

# Missing-value map after the clean-up.
sns.heatmap(train_df.isna(), cbar=False, cmap="viridis")

### Видаляємо пусті рядки

In [None]:
# Count rows in which every column is missing.
empty_rows_train = train_df.isna().all(axis=1).sum()
print(f"Number of completely empty rows in training data: {empty_rows_train} out of {len(train_df)} ({empty_rows_train/len(train_df)*100:.2f}%)")

empty_rows_test = test_df.isna().all(axis=1).sum()
print(f"Number of completely empty rows in test data: {empty_rows_test} out of {len(test_df)} ({empty_rows_test/len(test_df)*100:.2f}%)")

# Empty training rows carry no signal (their target is missing as well).
train_df_cleaned = train_df.dropna(how='all')
# Test rows are intentionally kept: the submission is built positionally with
# range(len(predictions)), so removing a test row would shorten the submission
# and shift the index of every later prediction. The modelling pipeline
# imputes missing values, so empty rows can still be scored.
test_df_cleaned = test_df

print(f"Removed {len(train_df) - len(train_df_cleaned)} rows from training data")
print(f"Removed {len(test_df) - len(test_df_cleaned)} rows from test data")

train_df = train_df_cleaned
test_df = test_df_cleaned

print(f"New training data shape: {train_df.shape}")
print(f"New test data shape: {test_df.shape}")

### Розподіл

In [None]:
# Class counts in category order. The count plot below is forced into the same
# order, so the i-th bar always belongs to the i-th entry of `target_counts`
# (frequency order from value_counts() could mislabel the bars).
target_counts = train_df[TARGET].value_counts().sort_index()

# Histogram/KDE and count plot get separate axes; drawn on one axes they
# overlap because the count plot uses categorical positions.
fig, (hist_ax, count_ax) = plt.subplots(1, 2, figsize=(14, 6))

sns.histplot(train_df[TARGET], kde=True, ax=hist_ax)
hist_ax.set_title("Distribution  Target Variable (customer churn)")
hist_ax.set_xlabel("customer churn")

sns.countplot(x=TARGET, data=train_df, order=target_counts.index, ax=count_ax)
for position, count in enumerate(target_counts):
    percentage = 100 * count / len(train_df)
    # Small vertical offset so the label sits just above the bar.
    count_ax.text(position, count + 5, f"{percentage:.1f}%", ha='center')

plt.tight_layout()
plt.show()

# Ratio of the largest class to the smallest one.
class_imbalance = target_counts.max() / target_counts.min()
print(f"\nClass imbalance ratio (majority:minority): {class_imbalance:.2f}")

## Feature Analysis

In [None]:
# Split the training frame into the feature matrix and the label vector.
X = train_df.drop(columns=[TARGET], errors="ignore")
y = train_df[TARGET]
print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

# Column groups by dtype: 64-bit numbers vs. object (string) columns.
numeric_columns = X.select_dtypes(include=["int64", "float64"]).columns
object_columns = X.select_dtypes(include="object").columns
num_features = list(numeric_columns)
cat_features = list(object_columns)

print(f"\nNumeric features: {len(num_features)}")
print(f"Categorical features: {len(cat_features)}")

### Categorical features

In [None]:
if cat_features:
    # Two plots per row; round up so an even number of features does not
    # reserve an extra, empty grid row.
    n_rows = (len(cat_features) + 1) // 2
    plt.figure(figsize=(15, 10))
    gs = gridspec.GridSpec(n_rows, 2)

    for i, feature in enumerate(cat_features):
        ax = plt.subplot(gs[i // 2, i % 2])

        unique_count = train_df[feature].dropna().nunique()
        # Ten most frequent values; missing values count as their own bar.
        value_counts = train_df[feature].value_counts(dropna=False).head(10)
        value_counts.plot(kind='bar', ax=ax)
        ax.set_title(f"{feature} (unique: {unique_count})")
        ax.tick_params(axis='x', rotation=45)

    # One layout pass after all axes exist instead of one per subplot.
    plt.tight_layout()
    plt.show()

print("Unique values in categorical columns:")
for col in train_df.select_dtypes(include=["object"]).columns:
    print(f"{col}: {train_df[col].nunique()} unique values")

    # unique() keeps NaN, so this list can be one longer than nunique().
    unique_values = train_df[col].unique()
    if len(unique_values) <= 10:
        print(f"    Values: {unique_values}")
    else:
        print(f"    Sample values: {unique_values[:5]} ...")

### Numeric features

In [None]:
# Histograms for at most the first ten numeric features, two per row.
preview_features = num_features[:10]
# Round up; keep at least one row so GridSpec stays valid with no features.
n_rows = max(1, (len(preview_features) + 1) // 2)

plt.figure(figsize=(15, 10))
gs = gridspec.GridSpec(n_rows, 2)

for i, feature in enumerate(preview_features):
    ax = plt.subplot(gs[i // 2, i % 2])
    sns.histplot(train_df[feature].dropna(), kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")

# One layout pass after all axes exist instead of one per subplot.
plt.tight_layout()
plt.show()

# Correlation matrix
plt.figure(figsize=(14, 10))
numeric_data = train_df[num_features].copy()
corr_matrix = numeric_data.corr()
# Hide the upper triangle (including the diagonal): it mirrors the lower one.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, cmap='coolwarm', annot=False, center=0)
plt.title("Correlation Matrix of Numeric Features")
plt.tight_layout()
plt.show()

# Collect every unordered feature pair whose absolute correlation exceeds the
# threshold. NaN correlations compare False and are skipped.
high_corr_threshold = 0.8
corr_columns = corr_matrix.columns
high_corr_features = [
    (corr_columns[i], corr_columns[j], corr_matrix.iloc[i, j])
    for i in range(len(corr_columns))
    for j in range(i + 1, len(corr_columns))
    if abs(corr_matrix.iloc[i, j]) > high_corr_threshold
]
# Sort once, strongest correlation first (the sort is stable, so ties keep
# their discovery order exactly as repeated sorting after each append did).
high_corr_features.sort(key=lambda pair: abs(pair[2]), reverse=True)

if high_corr_features:
    # Scatter plots for up to three of the strongest pairs, coloured by target.
    top_n = min(3, len(high_corr_features))
    fig, axes = plt.subplots(1, top_n, figsize=(15, 5))
    if top_n == 1:
        # plt.subplots returns a bare Axes for a single panel.
        axes = [axes]

    for panel, (feat1, feat2, corr) in zip(axes, high_corr_features[:top_n]):
        sns.scatterplot(x=train_df[feat1], y=train_df[feat2], hue=train_df[TARGET], ax=panel)
        panel.set_title(f"{feat1} vs {feat2}\nCorr: {corr:.2f}")
        panel.set_xlabel(feat1)
        panel.set_ylabel(feat2)

    plt.tight_layout()
    plt.show()

    print("\nHighly correlated feature pairs:")
    for feat1, feat2, corr in high_corr_features:
        print(f"{feat1} and {feat2}: {corr:.2f}")


In [None]:
if high_corr_features:
    # The list holds pairs, not features; several pairs can share a feature.
    print(f"\nProcessing {len(high_corr_features)} highly correlated feature pairs:")
    for feat1, feat2, corr in high_corr_features:
        # Drop the second member of a pair only while BOTH members are still
        # present. If feat1 was removed by an earlier (stronger) pair, feat2
        # no longer duplicates a kept column through this pair and is retained.
        if feat1 in train_df.columns and feat2 in train_df.columns:
            print(f"Removing {feat2} due to high correlation with {feat1} (corr: {corr:.2f})")
            train_df.drop(columns=feat2, errors="ignore", inplace=True)
            test_df.drop(columns=feat2, errors="ignore", inplace=True)
else:
    print("\nNo highly correlated features to remove.")


print(f"New training data shape: {train_df.shape}")
print(f"New test data shape: {test_df.shape}")

## Важливість ознак

### RandomForestClassifier

In [None]:
# Work on a copy so the quick-and-dirty imputation below never touches X.
X_sample = X.copy()

# Numeric gaps -> column median.
for numeric_col in num_features:
    column_median = X_sample[numeric_col].median()
    X_sample[numeric_col] = X_sample[numeric_col].fillna(column_median)

# Categorical gaps -> most frequent value, or "Unknown" when there is none.
for categorical_col in cat_features:
    column_modes = X_sample[categorical_col].mode()
    fill_value = "Unknown" if column_modes.empty else column_modes[0]
    X_sample[categorical_col] = X_sample[categorical_col].fillna(fill_value)

# One-hot encode; the first level of each feature is dropped.
X_encoded = pd.get_dummies(X_sample, columns=cat_features, drop_first=True)

print("\nRunning preliminary feature importance analysis...")
rf_prelim = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_prelim.fit(X_encoded, y)

# Importances of the fitted forest, largest first.
importance_table = pd.DataFrame(
    {"Feature": X_encoded.columns, "Importance": rf_prelim.feature_importances_}
)
feature_importance = importance_table.sort_values("Importance", ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x="Importance", y="Feature", data=feature_importance.head(20))
plt.title("Top 20 Features by Importance (Preliminary - Random Forest)")
plt.tight_layout()
plt.show()

### PCA

In [None]:
# Standardise the encoded features before PCA.
X_scaled = StandardScaler().fit_transform(X_encoded)

# Keep at most 20 components (fewer when there are fewer features).
n_components = min(20, X_scaled.shape[1])
pca = PCA(n_components=n_components)
pca.fit(X_scaled)

# loadings[f, c] is the weight of feature f in component c.
loadings = pca.components_.T
explained_variance_ratio = pca.explained_variance_ratio_

# Per-feature score: absolute loadings weighted by each component's share of
# explained variance, accumulated component by component.
pca_importance = np.zeros(loadings.shape[0])
for component_loadings, variance_share in zip(pca.components_, explained_variance_ratio):
    pca_importance += np.abs(component_loadings) * variance_share

# Rescale so the scores sum to one.
pca_importance = pca_importance / np.sum(pca_importance)

pca_table = pd.DataFrame(
    {"Feature": X_encoded.columns, "PCA_Importance": pca_importance}
)
pca_feature_importance = pca_table.sort_values("PCA_Importance", ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x="PCA_Importance", y="Feature", data=pca_feature_importance.head(20))
plt.title("Top 20 Features by Importance (PCA-based Analysis)")
plt.tight_layout()
plt.show()


In [None]:
# Names of the 20 best features under each ranking.
rf_top_features = feature_importance.head(20)["Feature"].tolist()
pca_top_features = pca_feature_importance.head(20)["Feature"].tolist()

# Features that appear in both top-20 lists.
common_features = set(rf_top_features).intersection(set(pca_top_features))

print(f"\nNumber of common features in top 20 between RF and PCA: {len(common_features)}")
print("Common important features:")
for feature in common_features:
    # Ranks are 1-based positions within each top-20 list.
    rf_rank = 1 + rf_top_features.index(feature)
    pca_rank = 1 + pca_top_features.index(feature)
    print(f"  - {feature} (RF rank: {rf_rank}, PCA rank: {pca_rank})")

# Scale each score by its maximum so both live on a 0..1 scale.
rf_peak = feature_importance["Importance"].max()
feature_importance["RF_Importance_Normalized"] = feature_importance["Importance"] / rf_peak
pca_peak = pca_feature_importance["PCA_Importance"].max()
pca_feature_importance["PCA_Importance_Normalized"] = pca_feature_importance["PCA_Importance"] / pca_peak

# Join both normalised scores on the feature name and average them.
rf_scores = feature_importance[["Feature", "RF_Importance_Normalized"]]
pca_scores = pca_feature_importance[["Feature", "PCA_Importance_Normalized"]]
combined_importance = pd.merge(rf_scores, pca_scores, on="Feature")

combined_importance["Combined_Score"] = (
    combined_importance["RF_Importance_Normalized"]
    + combined_importance["PCA_Importance_Normalized"]
) / 2

combined_importance = combined_importance.sort_values("Combined_Score", ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x="Combined_Score", y="Feature", data=combined_importance.head(20))
plt.title("Top 20 Features by Combined Importance (RF + PCA)")
plt.tight_layout()
plt.show()


## ML pipeline setup

In [None]:
# Sanity check: frame shapes after the column and row removals in earlier cells.
print(f"New training data shape: {train_df.shape}")
print(f"New test data shape: {test_df.shape}")

In [None]:
# Rebuild the modelling matrix from the cleaned training frame (earlier cells
# removed empty and highly correlated columns after X was first created).
X = train_df.drop(columns=[TARGET], errors="ignore")
y = train_df[TARGET]

# Column groups consumed by the ColumnTransformer in the pipeline cell.
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include="object").columns.tolist()

print(f"\nNumeric features: {len(num_features)}")
print(f"Categorical features: {len(cat_features)}")

# X is deliberately left raw (NaNs and string categories intact).
# Imputation and encoding are done by the pipeline's preprocessor, which
#   * is re-fitted inside every cross-validation fold (no leakage from
#     statistics computed on the full training set), and
#   * applies the very same fitted transformation to test_df at predict time.
# Pre-imputing / label-encoding X here would make the pipeline learn integer
# codes during training while test_df still carries the original strings.

In [None]:
# Numeric branch: KNN-based imputation followed by robust scaling.
numeric_steps = [
    ("imputer", KNNImputer(n_neighbors=7)),
    ("scaler", RobustScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time and the
# encoder is limited to 20 output categories per feature.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "encoder",
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, max_categories=20),
    ),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; columns in neither list are dropped.
column_routes = [
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder="drop")

# logreg_pipeline = ImbPipeline(
#     steps=[
#         ("preprocessor", preprocessor),
#         ("feature_selection", SelectKBest(f_classif, k=30)),
#         ("sampling", SMOTE(random_state=42, sampling_strategy=0.5)),
#         (
#             "classifier",
#             LogisticRegression(
#                 max_iter=2000,
#                 class_weight="balanced",
#                 solver="liblinear",
#                 C=0.5,
#                 penalty="l1"
#             ),
#         ),
#     ]
# )

# rf_pipeline = ImbPipeline(
#     steps=[
#         ("preprocessor", preprocessor),
#         ("sampling", SMOTE(random_state=42, sampling_strategy=0.4)),
#         (
#             "classifier",
#             RandomForestClassifier(
#                 class_weight={0: 1, 1: 6},
#                 random_state=42,
#                 n_estimators=200,
#                 max_depth=12,
#                 min_samples_split=3,
#                 min_samples_leaf=2,
#                 bootstrap=True,
#                 max_features='sqrt'
#             ),
#         ),
#     ]
# )

# gb_pipeline = ImbPipeline(
#     steps=[
#         ("preprocessor", preprocessor),
#         ("sampling", SMOTE(random_state=42, sampling_strategy=0.4)),
#         (
#             "classifier",
#             GradientBoostingClassifier(
#                 random_state=42,
#                 n_estimators=200,
#                 learning_rate=0.05,
#                 max_depth=4,
#                 subsample=0.8,
#                 min_samples_split=5,
#                 min_samples_leaf=5
#             ),
#         ),
#     ]
# )

# voting_pipeline = ImbPipeline(
#     steps=[
#         ("preprocessor", preprocessor),
#         ("sampling", SMOTE(random_state=42, sampling_strategy=0.4)),
#         (
#             "classifier",
#             VotingClassifier(
#                 estimators=[
#                     ('lr', LogisticRegression(max_iter=2000, class_weight="balanced", solver="liblinear", C=0.5, penalty="l1")),
#                     ('rf', RandomForestClassifier(class_weight={0: 1, 1: 6}, random_state=42, n_estimators=200, max_depth=12)),
#                     ('gb', GradientBoostingClassifier(random_state=42, n_estimators=200, learning_rate=0.05, max_depth=4))
#                 ],
#                 voting='soft'
#             )
#         ),
#     ]
# )

# xgb_pipeline = ImbPipeline(
#     steps=[
#         ("preprocessor", preprocessor),
#         ("sampling", SMOTE(random_state=42, sampling_strategy=0.4)),
#         (
#             "classifier",
#             XGBClassifier(
#                 n_estimators=200,
#                 learning_rate=0.05,
#                 max_depth=4,
#                 use_label_encoder=False,
#                 eval_metric="logloss",
#                 scale_pos_weight=5,
#                 random_state=42,
#             ),
#         ),
#     ]
# )
# Positions of the categorical columns in the RAW feature frame X.
# Kept for reference only: these positions are not valid for the matrix the
# sampler receives (see below).
cat_indices = [i for i, col in enumerate(X.columns) if col in cat_features]

# The sampler runs AFTER the preprocessor, i.e. on an all-numeric array laid
# out as [scaled numeric block | one-hot columns]. Raw-X positions such as
# `cat_indices` do not address the categorical columns of that array, so
# passing them to SMOTENC marks the wrong columns as categorical. Plain SMOTE
# is used on the encoded matrix instead.
xgb_pipeline = ImbPipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "sampling",
            # Oversample the minority class up to 40% of the majority size.
            SMOTE(random_state=42, sampling_strategy=0.4),
        ),
        (
            "classifier",
            XGBClassifier(
                n_estimators=300,
                learning_rate=0.03,
                max_depth=5,
                subsample=0.9,
                colsample_bytree=0.8,
                min_child_weight=3,
                gamma=0.5,
                # Extra weight on the positive class, on top of the resampling.
                scale_pos_weight=6.5,
                use_label_encoder=False,
                eval_metric="logloss",
                random_state=42,
            ),
        ),
    ]
)

In [None]:
# Hyper-parameter grid; keys address the "classifier" step of the pipeline.
param_grid = {
    "classifier__max_depth": [4, 5, 6],
    "classifier__learning_rate": [0.02, 0.03, 0.05],
    "classifier__scale_pos_weight": [6, 6.5, 7],
}

# Stratified, shuffled 5-fold split with a fixed seed.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by balanced accuracy, using all CPU cores.
grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=2,
)

print("🔍 Пошук найкращої моделі через GridSearchCV...")
grid.fit(X, y)

print("\n✅ Best parameters found:")
print(grid.best_params_)
print(f"🏆 Best balanced accuracy: {grid.best_score_:.4f}")

# Pipeline configured with the best parameters found by the search.
best_model = grid.best_estimator_

## Cross-validation

In [None]:
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# model_pipelines = {
#     # "Logistic Regression": logreg_pipeline,
#     # "Random Forest": rf_pipeline,
#     # "Gradient Boosting": gb_pipeline,
#     # "Voting Ensemble": voting_pipeline,
#     "XGBoostClassifier": xgb_pipeline,
# }

# results = {}

# for name, pipeline in model_pipelines.items():
#     print(f"\nEvaluating {name}...")
#     scores = []
    
#     for train_idx, val_idx in cv.split(X, y):
#         X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
#         y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        
#         pipeline.fit(X_train_fold, y_train_fold)
        
#         y_pred_fold = pipeline.predict(X_val_fold)
        
#         solution_df = pd.DataFrame({
#             'row_id': range(len(y_val_fold)),  
#             'y': y_val_fold.values
#         })
        
#         submission_df = pd.DataFrame({
#             'row_id': range(len(y_pred_fold)),  
#             'prediction': y_pred_fold
#         })
        
#         fold_score = score(
#             solution=solution_df.copy(),
#             submission=submission_df.copy(),
#             row_id_column_name='row_id'
#         )
        
#         scores.append(fold_score)
    
#     results[name] = {
#         "scores": np.array(scores),
#         "mean": np.mean(scores),
#         "std": np.std(scores)
#     }
    
#     print(f"{name} CV Scores: {scores}")
#     print(f"{name} Mean Balanced Accuracy: {results[name]['mean']:.4f} ± {results[name]['std']:.4f}")

# best_model_name = max(results, key=lambda k: results[k]['mean'])
# best_model = model_pipelines[best_model_name]

In [None]:
# The manual cross-validation loop that used to define `best_model_name` and
# `results` is commented out, so both names are undefined here. Report the
# grid-search winner instead: its classifier class and its mean CV score.
best_model_name = type(best_model.named_steps["classifier"]).__name__
print(f"\nBest model: {best_model_name} with balanced accuracy: {grid.best_score_:.4f}")

In [None]:
# Fit the selected pipeline on all training data.
print(f"\nTraining {best_model} on full training dataset...")
best_model.fit(X, y)

# Ensure a positional row id exists on the test frame; it is excluded from
# the features passed to the model below.
if "row_id" not in test_df.columns:
    test_df["row_id"] = range(len(test_df))

print("Generating predictions on test data...")
test_features = test_df.drop(columns=["row_id"], errors="ignore")
test_predictions = best_model.predict(test_features)

# Submission rows: positional index plus the predicted label.
n_predictions = len(test_predictions)
submission_df = pd.DataFrame(
    {"index": range(n_predictions), TARGET: test_predictions}
)

# NOTE(review): the file is written without a header row — confirm this is
# the format the competition expects.
submission_path = os.path.join(PATH_TO_RESULTS, "submission.csv")
submission_df.to_csv(submission_path, index=False, header=False)
print(f"Submission saved to {submission_path}")

# Share of each predicted class.
unique_predictions, counts = np.unique(test_predictions, return_counts=True)
print("\nPrediction distribution on test data:")
for value, count in zip(unique_predictions, counts):
    print(f"  Class {value}: {count} ({count/len(test_predictions)*100:.2f}%)")

print(f"\nSubmission file contains {len(submission_df)} predictions")

In [None]:
# print(f"Test data shape: {test_df.shape}")

# if 'row_id' not in test_df.columns:
#     test_df['row_id'] = range(len(test_df))

# print(f"\nTraining {best_model_name} on full training dataset...")
# best_model.fit(X, y)

# print("Generating predictions on test data...")
# test_predictions = best_model.predict(test_df.drop(columns=['row_id'], errors='ignore'))

# submission_df = pd.DataFrame({
#     'index': range(len(test_predictions)),
#     'y': test_predictions
# })

# submission_path = os.path.join(PATH_TO_RESULTS, "submission.csv")
# submission_df.to_csv(submission_path, index=False, header=False)
# print(f"Submission saved to {submission_path}")

# solution_df = pd.DataFrame({
#     'row_id': range(len(test_df)),
#     'y': np.zeros(len(test_df))
# })

# prediction_df = pd.DataFrame({
#     'row_id': range(len(test_predictions)),
#     'prediction': test_predictions
# })

# unique_predictions, counts = np.unique(test_predictions, return_counts=True)
# print("\nPrediction distribution on test data:")
# for value, count in zip(unique_predictions, counts):
#     print(f"  Class {value}: {count} ({count/len(test_predictions)*100:.2f}%)")

# print(f"\nSubmission file contains {len(submission_df)} predictions")