In [1]:
1+1

2

In [2]:
from weight_of_evidence.tree_binner import TreeBinner

In [3]:
from sklearn.pipeline import Pipeline
from category_encoders.woe import WOEEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import yaml

In [4]:
def prepare_data(config):
    data = pd.read_csv(config['data_path'] , 
                       sep=" ", names=config['feature_names'])

    data["response"] = data["response"] - 1

    return data.drop(columns=['response']),data['response']


In [5]:
CONFIG = yaml.safe_load(open('config.yaml','r'))

In [6]:
X, y = prepare_data(CONFIG)

In [None]:
AUTO_BIN_PIPELINE = Pipeline([
    ('tree_binner', TreeBinner(max_depth=5,min_samples_split=4,min_samples_leaf=4,category_type='str')),
    ('woe_encoder', WOEEncoder(regularization=1)),
    ('standard_scaler', StandardScaler()),
    ('logistic_regression', LogisticRegression(max_iter=10_000,C=0.01))
])

In [None]:
text_columns_indices = [index for index, dtype in enumerate(X.dtypes) if dtype == 'object']
numerical_columns_indices = [index for index, dtype in enumerate(X.dtypes) if dtype != 'object']
preprocessor = ColumnTransformer(
    transformers=[
        ('text', OneHotEncoder(handle_unknown='ignore'), text_columns_indices),
        ('num', StandardScaler(with_mean=False), numerical_columns_indices)
    ])

LINEAR_PIPELINE = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=10_000,random_state=42))
])

In [None]:
XGB_Pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

In [None]:
PIPELINES = {'xgb':XGB_Pipeline,'linear':LINEAR_PIPELINE,'auto_bin':AUTO_BIN_PIPELINE}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def find_best_model_params(model, params_grid, X, y, cv=5, n=50, scoring='roc_auc'):
    random_search = RandomizedSearchCV(
        model, 
        param_distributions=params_grid, 
        n_iter=n, 
        cv=cv, 
        scoring=scoring, 
        n_jobs=-1, 
        verbose=1, 
        random_state=42
    )
    random_search.fit(X, y)
    print(f'Best score: {random_search.best_score_:.3f}')
    print(f'Best parameters: {random_search.best_params_}')
    return random_search.best_params_

In [None]:
TREE_BIN_PARAMS_GRID = {
    'tree_binner__max_depth': sp_randint(2, 6),
    'tree_binner__min_samples_leaf': sp_randint(1, 5),
    'tree_binner__min_impurity_decrease': sp_uniform(0, 1),
    'woe_encoder__regularization': sp_uniform(0, 1),
    'logistic_regression__C': np.logspace(-3, 2, 6),
}

In [7]:
test = pd.DataFrame(
        {
            "a": ["a", "a", "b", "b", "c", "d"],
            "b": ["x", "y", "x", "x", "z", "z"],
            "c": ["dog", "cat", "dog", "fish", "fish", "fish"],
            "d": ["blue", "red", "green", "green", "yellow", "yellow"],
            "e": ["sheep", "sheep", "sheep", "sheep", "sheep", "sheep"],
        }
    )

In [13]:
N = len(test)

In [20]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd


In [43]:
class Otherizer(BaseEstimator, TransformerMixin):
    def __init__(self, threshold: float = 0.1):
        """
        :param threshold: The threshold below which a value is replaced with 'other'. Default is 0.1, i.e., any value
                          that appears in less than 10% of columns is replaced.
        """
        self.threshold = threshold
        self.common_strings = {}

    def fit(self, X, y=None):
        """
        Fit the transformer on the input data and identify common strings for each column.
        """
        N = len(X)
        for col in X.select_dtypes("object").columns:
            counts = pd.Series(X[col]).value_counts()
            common_strings = counts[(counts / N) >= self.threshold].index
            self.common_strings[col] = list(common_strings)
        return self

    def transform(self, X):
        """
        Transform the input data, replacing uncommon strings with 'other'.
        """
        X_transformed = pd.DataFrame(X.copy())
        for col, common_strings in self.common_strings.items():
            print(col)
            print(common_strings)
            X_transformed[col] = np.where(X[col].isin(common_strings), col, "other")
        return X_transformed


In [46]:
ot = Otherizer(0.2)

In [47]:
ot.fit_transform(test)

a
['a', 'b']
b
['x', 'z']
c
['fish', 'dog']
d
['green', 'yellow']
e
['sheep']


Unnamed: 0,a,b,c,d,e
0,a,b,c,other,e
1,a,other,other,other,e
2,a,b,c,d,e
3,a,b,c,d,e
4,other,b,c,d,e
5,other,b,c,d,e


In [32]:
ot.common_strings

{'a': ['a', 'b'],
 'b': ['x', 'z'],
 'c': ['fish', 'dog'],
 'd': ['green', 'yellow'],
 'e': ['sheep']}

In [17]:
counts = pd.Series(test['d']).value_counts()
common_strings = counts[(counts / N) >= 1].index

In [18]:
common_strings

Index([], dtype='object')

In [None]:
BEST_PARAMS_WOE = find_best_model_params(AUTO_BIN_PIPELINE, TREE_BIN_PARAMS_GRID, X_train, y_train,n=200)

In [None]:
PIPELINES['auto_bin'] = AUTO_BIN_PIPELINE.set_params(**BEST_PARAMS_WOE)

In [None]:
for name, pipeline in PIPELINES.items():
    print(f'Pipeline: {name}')
    print(f'CV score: {cross_val_score(pipeline, X_train, y_train, scoring="roc_auc", cv=5).mean():.3f}')

In [None]:


#BEST_PARAMS_XGB = find_best_model_params(AUTO_BIN_PIPELINE, CONFIG['params_grid'], X_train, y_train)

In [None]:
cross_val_score(AUTO_BIN_PIPELINE, X_train, y_train, scoring='roc_auc', cv=5).mean()

In [None]:
X.select_dtypes('object').nunique()

In [None]:
cross_val_score(BASELINE_PIPELINE, X, y, scoring='roc_auc', cv=5).mean()