In [1]:
# !pip3 install -r requirements.txt

In [17]:
import pandas as pd
import numpy as np

import pickle
import json
from tqdm import tqdm

from scipy.stats import ks_2samp

from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Read the notebook configuration once and expose its three sections
# as module-level constants used by the cells below.
with open('config.json', 'r') as f:
    config_data = json.load(f)

PROCESSING_INFO, MODEL_INFO, TRAIN_INFO = (
    config_data[section]
    for section in ('PROCESSING_INFO', 'MODEL_INFO', 'TRAIN_INFO')
)

In [5]:
# CSV file names that are read from the configured "processed_folder".
PATHS = [
    "01-14.04.2023.csv",
    "01-07.11.2023.csv",
    "03-09.09.2023.csv",
    "12-18.12.2023.csv",
]

# One DataFrame per file, in the same order as PATHS.
DATA = [
    pd.read_csv(f'{PROCESSING_INFO["processed_folder"]}{file_name}')
    for file_name in PATHS
]

# Candidate classifiers keyed by display name; all start from library defaults.
MODEL_INFO.update({
    "models": {
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "Logistic Regression": LogisticRegression(),
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
    }
})

In [7]:
def prepare_data(data, train_info, shuffle=True):
    """Concatenate the input frames and optionally reorder the rows group by group.

    With ``shuffle=True`` the rows are regrouped so that every distinct value of
    the ``train_info["groups"]`` column forms one contiguous block. The blocks
    are placed in a pseudo-random order (NumPy seed 42) and the rows inside each
    block are sorted by ``train_info["sort_by"]``. Rows whose group value is
    missing match no group and are therefore not part of the result.

    With ``shuffle=False`` the concatenated rows are returned in their
    original order.

    Args:
        data (list): List of pandas DataFrames to be concatenated.
        train_info (dict): Data setup; the keys "groups" and (when shuffling)
            "sort_by" are read here.
        shuffle (bool, optional): Whether to reorder the groups. Defaults to True.

    Returns:
        pandas.DataFrame: Combined dataset with a fresh 0..n-1 index.
    """
    df = pd.concat(list(data))
    group_col = train_info["groups"]

    ad_sets = df[group_col].unique()
    # This seeds NumPy's *global* RNG (in both modes). It is kept as-is because
    # code that runs afterwards may rely on that state being fixed.
    np.random.seed(42)
    np.random.shuffle(ad_sets)

    if not shuffle:
        return df.reset_index(drop=True)

    # Split the frame into per-group pieces once, then stitch them together
    # with a single concat instead of growing a DataFrame inside a loop.
    rows_by_group = {key: rows for key, rows in df.groupby(group_col, sort=False)}
    ordered_parts = [
        rows_by_group[ad].sort_values(train_info["sort_by"])
        for ad in ad_sets
        if ad in rows_by_group  # missing group values have no rows to contribute
    ]

    if not ordered_parts:
        # Nothing to concatenate: return an empty frame with the same columns/dtypes.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(ordered_parts).reset_index(drop=True)


def stratified_split(data, train_info):
    """Split the data into a training part and a held-out part using group labels.

    A single ``GroupShuffleSplit`` draw (``random_state=42``) is taken, with the
    groups read from the ``train_info["groups"]`` column and the held-out size
    from ``train_info["test_ratio"]``.

    NOTE(review): despite the function name, only the group column is passed to
    the splitter; no class-proportion constraint is requested here.

    Args:
        data (pandas.DataFrame): Input dataset.
        train_info (dict): Data setup; reads "train_fields", "label_field",
            "groups" and "test_ratio".

    Returns:
        tuple: X_train, y_train, X_test, y_test (in that order).
    """
    features = data[train_info["train_fields"]]
    labels = data[train_info["label_field"]]
    group_ids = data[train_info["groups"]]

    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=train_info["test_ratio"],
        random_state=42,
    )
    # Only one split is configured, so take the first (and only) index pair.
    train_pos, test_pos = next(splitter.split(features, labels, groups=group_ids))

    return (
        features.iloc[train_pos],
        labels.iloc[train_pos],
        features.iloc[test_pos],
        labels.iloc[test_pos],
    )


def split_data(data, train_info, test_csv_path="test.csv"):
    """Splits the data into training, validation, and testing sets.

    The test part is split off first using ``train_info["test_ratio"]``. The
    remainder is then split again into training and validation parts, using
    ``val_ratio / (1 - test_ratio)`` as the held-out share so that the
    validation size is expressed relative to the full dataset rather than to
    the remainder.

    Args:
        data (pandas.DataFrame): Input dataset. Its index must be unique,
            because the parts are matched back to ``data`` by index label.
        train_info (dict): Data setup; reads "test_ratio" and "val_ratio" here
            and is forwarded to ``stratified_split``. It is not modified.
        test_csv_path (str or None, optional): Where the test rows are written
            as CSV. Defaults to "test.csv"; pass None to skip writing.

    Returns:
        tuple: Tuple containing X_train, y_train, X_val, y_val, X_test, and y_test.

    Raises:
        AssertionError: If the three parts overlap or do not add up to ``data``.
    """
    X_train_full, y_train_full, X_test, y_test = stratified_split(data, train_info)

    # Rescale the validation share to the rows left after removing the test part
    # and hand it to the splitter through a copy, leaving the caller's dict intact.
    val_share = train_info["val_ratio"] / (1 - train_info["test_ratio"])
    val_info = {**train_info, "test_ratio": val_share}

    X_train, y_train, X_val, y_val = stratified_split(
        data.loc[y_train_full.index, :], val_info
    )

    if test_csv_path is not None:
        data.loc[y_test.index, :].to_csv(test_csv_path, index=False)

    assert set(X_train.index).isdisjoint(set(X_val.index)), "Training and validation data overlap!"
    assert set(X_train.index).isdisjoint(set(X_test.index)), "Training and testing data overlap!"
    assert set(X_val.index).isdisjoint(set(X_test.index)), "Validation and testing data overlap!"
    assert len(data) == len(X_train) + len(X_val) + len(X_test), "Data split is incorrect!"

    return X_train, y_train, X_val, y_val, X_test, y_test


def tune_model(model, hyperparameters, X_train_val, y_train_val, val_fold):
    """Run an accuracy-scored grid search for one model and report the winner.

    Args:
        model: Classifier to tune.
        hyperparameters (dict): Parameter grid handed to ``GridSearchCV``.
        X_train_val: Features covering both the training and validation rows.
        y_train_val: Labels aligned with ``X_train_val``.
        val_fold: Cross-validation specification forwarded as ``cv``.

    Returns:
        tuple: (best estimator, best score, best parameters) as exposed by the
        fitted search object.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=hyperparameters,
        scoring="accuracy",
        cv=val_fold,
    )
    search.fit(X_train_val, y_train_val)

    return search.best_estimator_, search.best_score_, search.best_params_


def show_results(model_results, data_type="Val"):
    """Print one line per model with its score formatted as a percentage.

    Args:
        model_results (dict): Maps model names to their metric value.
        data_type (str, optional): Label used in the heading (e.g., "Val", "Test").
            Defaults to "Val".
    """
    heading = "\n{} Results:".format(data_type)
    print(heading)

    # Name is left-aligned in a 25-character column, score shown with two decimals.
    row_template = "{:<25} {:.2%}"
    for model_name in model_results:
        print(row_template.format(model_name, model_results[model_name]))


def train_and_evaluate(X_train, X_val, y_train, y_val, model_info):
    """Tunes every configured classifier on a fixed train/validation split.

    Training and validation rows are stacked and a ``PredefinedSplit`` marks
    which rows belong to the single validation fold. Each model is tuned with
    ``tune_model``; the estimator it returns is stored as the trained model.

    The returned estimator is not fitted again here: ``tune_model`` uses
    ``GridSearchCV`` with its default refit behaviour, so the best estimator
    has already been refitted on the stacked train+validation rows with the
    best parameters. A second fit would only repeat that work.

    Args:
        X_train: Training features.
        X_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.
        model_info (dict): Holds "models" (name -> estimator) and
            "hyperparameters" (name -> parameter grid).

    Returns:
        tuple: (validation score per model name, tuned estimator per model name).
    """
    val_results = {}
    trained_models = {}

    X_train_val = np.concatenate((X_train, X_val))
    y_train_val = np.concatenate((y_train, y_val))

    # -1 keeps a row in the training portion; 0 assigns it to the only validation fold.
    val_fold = [-1] * len(X_train) + [0] * len(X_val)
    splitter = PredefinedSplit(test_fold=val_fold)

    for name, clf in tqdm(model_info["models"].items(), desc="Tuning models"):
        hyperparameters = model_info["hyperparameters"][name]
        tuned_model, best_score, _ = tune_model(
            clf, hyperparameters, X_train_val, y_train_val, splitter
        )

        trained_models[name] = tuned_model
        val_results[name] = best_score

    return val_results, trained_models


def test(X_test, y_test, trained_models):
    """Tests trained models on test data and returns the results.

    Args:
        X_test (numpy.ndarray): Test features.
        y_test (numpy.ndarray): Test labels.
        trained_models (dict): Dictionary containing trained models.

    Returns:
        dict: Dictionary containing test results for each model.
    """
    test_results = {}

    for name, clf in tqdm(trained_models.items(), desc="Testing models"):
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        test_results[name] = accuracy

    return test_results


def main(database=DATA,
         data_setup=TRAIN_INFO,
         model_setup=MODEL_INFO,
         ):
    """Run the end-to-end workflow: prepare, split, tune, test, report, save.

    Args:
        database: Data to be used in the analysis (forwarded to ``prepare_data``).
        data_setup (dict): Information about the data setup.
        model_setup (dict): Information about the model setup; its "save" entry
            decides whether the trained models are written to disk.
    """
    dataset = prepare_data(database, data_setup)
    splits = split_data(dataset, data_setup)
    X_train, y_train, X_val, y_val, X_test, y_test = splits

    val_scores, fitted_models = train_and_evaluate(X_train, X_val, y_train, y_val, model_setup)
    test_scores = test(X_test, y_test, fitted_models)

    # Validation scores are reported first, then the test scores.
    for label, scores in (("Val", val_scores), ("Test", test_scores)):
        show_results(scores, data_type=label)

    if not model_setup["save"]:
        return
    save_model(fitted_models)


def save_model(models, save_path="model_parameters.pkl"):
    """Pickle the given object to ``save_path``, overwriting any existing file.

    Args:
        models: Object to serialize (``main`` passes the dict of trained models).
        save_path (str, optional): Destination file. Defaults to "model_parameters.pkl".
    """
    with open(save_path, "wb") as handle:
        pickle.dump(models, handle)


def load_model_parameters(load_path="model_parameters.pkl"):
    """Read back the object stored in a pickle file.

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.

    Args:
        load_path (str, optional): File to read. Defaults to "model_parameters.pkl".

    Returns:
        The unpickled object.
    """
    with open(load_path, "rb") as handle:
        return pickle.load(handle)

# Execute the whole workflow with the module-level defaults (DATA, TRAIN_INFO, MODEL_INFO).
main()

In [19]:
# Read the models back from the default pickle path.
# Assumes main() wrote the file, which it only does when MODEL_INFO["save"] is truthy.
load_model_parameters()

{'K-Nearest Neighbors': KNeighborsClassifier(metric='euclidean'),
 'Logistic Regression': LogisticRegression(C=0.1, penalty='l1', solver='liblinear'),
 'Random Forest': RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10),
 'Gradient Boosting': GradientBoostingClassifier(learning_rate=0.2, max_depth=5, min_samples_split=5)}

#### Feature Evaluation

In [18]:
def perform_ks_test(data, train_info, feature_subset, alpha=0.05):
    """Run a two-sample Kolmogorov-Smirnov test per feature between the two label classes.

    The input frames are combined with ``prepare_data``. Rows whose label
    (``train_info["label_field"]``) equals 1 form the first sample and rows
    whose label equals 0 form the second; for every requested feature the two
    samples are compared with ``scipy.stats.ks_2samp``.

    NOTE(review): no train/validation split is involved in this comparison. If
    a train-vs-validation drift check was intended instead, the samples would
    have to come from ``split_data``.

    Args:
        data (list): List of pandas DataFrames, forwarded to ``prepare_data``.
        train_info (dict): Information about the data setup; "label_field" is
            read here, the rest is used by ``prepare_data``.
        feature_subset (list): Fields, on which the KS test is conducted.
        alpha (float, optional): Significance level; the null hypothesis is
            rejected when the p-value is below it. Defaults to 0.05.

    Returns:
        pandas DataFrame: One row per feature with the columns "Feature",
        "KS Statistic", "P-value" and "Result".
    """
    dataset = prepare_data(data, train_info)

    labels = dataset[train_info["label_field"]]
    label_one_rows = dataset[labels == 1]
    label_zero_rows = dataset[labels == 0]

    results = []
    for feature_name in feature_subset:
        ks_statistic, p_value = ks_2samp(
            label_one_rows[feature_name], label_zero_rows[feature_name]
        )

        if p_value < alpha:
            verdict = 'Reject null hypothesis'
        else:
            verdict = 'Fail to reject null hypothesis'

        results.append({
            'Feature': feature_name,
            'KS Statistic': ks_statistic,
            'P-value': p_value,
            'Result': verdict,
        })

    # Explicit columns keep the layout stable even when feature_subset is empty.
    return pd.DataFrame(results, columns=['Feature', 'KS Statistic', 'P-value', 'Result'])

# Feature columns on which the KS test is run.
subset = [
    "purchases_incr",
    "purchases_cumm_incr",
    "roas_sub",
    "roas_sub_incr",
    "cpl_sub_incr",
    "cpm_sub_incr",
    "ctr_sub_incr",
]
results_df = perform_ks_test(DATA, TRAIN_INFO, subset)
results_df

Unnamed: 0,Feature,KS Statistic,P-value,Result
0,purchases_incr,0.057318,8.620212999999999e-38,Reject null hypothesis
1,purchases_cumm_incr,0.062119,2.567587e-44,Reject null hypothesis
2,roas_sub,0.597375,0.0,Reject null hypothesis
3,roas_sub_incr,0.491055,0.0,Reject null hypothesis
4,cpl_sub_incr,0.150186,5.25854e-258,Reject null hypothesis
5,cpm_sub_incr,0.048826,1.551717e-27,Reject null hypothesis
6,ctr_sub_incr,0.072413,4.392954e-60,Reject null hypothesis
