In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression
import pickle
import json
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
import h3
import shapely
import geopandas as gpd

## Загружаем трейн данные

In [None]:
# Data files are read from ../data relative to the notebook; the seed is reused
# by the StratifiedKFold splitters and the CatBoost classifiers below.
data_dir = Path("..") / "data"
random_state = 20240225

In [None]:
# Load the transaction table.
transactions_df = pd.read_parquet(data_dir / "transactions.parquet")
# `std` is missing where there was only one transaction; treat that as zero spread.
# The result is assigned back on purpose: `df[col].fillna(..., inplace=True)` works
# on a temporary column selection, raises a FutureWarning in pandas 2.2 and no
# longer updates the parent frame once Copy-on-Write is enabled.
transactions_df["std"] = transactions_df["std"].fillna(0)

In [None]:
# Load the target table (read below through its `customer_id` and `h3_09` columns).
target_path = data_dir / "target.parquet"
target_df = pd.read_parquet(target_path)

In [None]:
# Collect the list of all available hexes.
def _read_hex_list(path):
    """Return the non-empty lines of a newline-separated hex list file.

    `splitlines()` is used instead of slicing off the last character, so the
    final hex is not truncated when the file has no trailing newline.
    """
    with open(path, "r") as file:
        return [line.strip() for line in file.read().splitlines() if line.strip()]


# All 1658 geolocations where cash withdrawal is possible; these are the locations
# that have to be labelled in the solution.
# Three of them have no transactions: set(hexses_target).difference(transactions_df.h3_09)
hexses_target = _read_hex_list(data_dir / "hexses_target.lst")
assert set(hexses_target) == set(target_df.h3_09)

# All 8154 h3_09 geolocations present in transactions_df.
hexses_data = _read_hex_list(data_dir / "hexses_data.lst")

# sorted() keeps the row order reproducible: iteration order of a set of strings
# can change from one kernel run to the next.
all_hexses = gpd.GeoDataFrame({"h3_09": sorted(set(hexses_target) | set(hexses_data))})
# Hex outline as a polygon built from the h3 cell boundary.
all_hexses["geometry"] = all_hexses["h3_09"].apply(
    lambda x: shapely.geometry.Polygon(h3.h3_to_geo_boundary(x, geo_json=True))
)
# The two values returned by h3.h3_to_geo are stored as `lat` and `lon`, in that order.
all_hexses[["lat", "lon"]] = all_hexses["h3_09"].apply(lambda x: pd.Series(h3.h3_to_geo(x)))


In [None]:
# target.groupby(by='h3_09').size().hist(bins=30)
# target.groupby(by='customer_id').size().hist(bins=30)

# Обучаем модель

In [None]:
class BaseTransform:
    """Identity feature transform that defines the interface for real transforms."""

    def __init__(self, filepath=None):
        """Create the transform; when `filepath` is truthy, call `load(filepath)`."""
        if filepath:
            self.load(filepath)

    def fit(self, transactions):
        """Learn nothing; subclasses override this to collect state."""
        return None

    def transform(self, transactions):
        """Return `transactions` unchanged (the very same object)."""
        return transactions

    def save(self, filepath):
        """Placeholder: nothing is written."""
        return None

    def load(self, filepath):
        """Placeholder: nothing is read."""
        return None

class BaseModel:
    """Base class for models that predict a probability per customer and hex.

    Subclasses implement `fit`, `predict`, `save` and `load`; `score` is shared.
    """

    def __init__(self, transform=None, filepath=None):
        """Store the feature transform and, when `filepath` is truthy, call `load`.

        Args:
            transform: object with `fit`/`transform` methods; a `BaseTransform`
                (identity) is used when omitted.
            filepath: optional location passed to `load`.
        """
        self.transform = transform if transform else BaseTransform()
        if filepath:
            self.load(filepath)

    def fit(self, transactions, target):
        """Train the model; no-op in the base class."""
        pass

    def predict(self, transactions):
        """Return predictions; no-op in the base class.

        `score` expects a DataFrame with a `customer_id` column plus one
        probability column per hex.
        """
        pass

    def score(self, transactions, target, return_raw_score=False):
        """Binary cross-entropy summed over hexes, averaged over customers.

        Labels are aligned to the predictions by customer id and hex name rather
        than by position. A customer that has predictions but no row in `target`
        is scored against all-zero labels; target rows of customers without a
        prediction are ignored.

        Args:
            transactions: input forwarded to `predict`.
            target: DataFrame with `customer_id` and `h3_09` columns, one row per
                positive (customer, hex) pair.
            return_raw_score: when True also return the per-customer losses, the
                per-hex losses and the prediction frame.

        Returns:
            The mean per-customer loss, or the tuple
            `(score, row_score, col_score, predict)` when `return_raw_score` is True.

        Raises:
            ValueError: if a hex from `target` or `hexses_target` has no
                prediction column.
        """
        observed = (
            target
            .assign(customer_id=lambda x: x.customer_id.astype(int))
            .pipe(lambda x: pd.pivot(x.assign(v=1.), index='customer_id', columns='h3_09', values='v'))
            .fillna(0)
        )

        predict = (
            self.predict(transactions)
            # Same integer cast as for the target, so both sides share one key type.
            .assign(customer_id=lambda x: x.customer_id.astype(int))
            .pipe(lambda x: x.reindex(sorted(x.columns), axis=1))  # sort the columns
            .sort_values(by='customer_id')
            .set_index("customer_id")
        )

        # Every hex that has to be scored needs a prediction column.
        expected_columns = observed.columns.union(pd.Index(hexses_target))
        missing = expected_columns.difference(predict.columns)
        if len(missing) > 0:
            raise ValueError(f"predictions are missing for {len(missing)} hexes, e.g. {list(missing[:5])}")

        # Align by label instead of relying on both frames having identical
        # shape and ordering; absent (customer, hex) pairs are negatives.
        labels = observed.reindex(index=predict.index, columns=predict.columns, fill_value=0.0).to_numpy()

        # The loss matrix is computed once and reduced along both axes.
        clipped = predict.clip(1e-8, 1 - 1e-8)
        loss = -np.log(clipped) * labels - np.log(1 - clipped) * (1 - labels)
        row_score = loss.sum(axis=1)
        col_score = loss.sum(axis=0)

        score = row_score.mean()
        if return_raw_score:
            return score, row_score, col_score, predict
        else:
            return score

    def save(self, filepath: Path):
        """Persist the model; no-op in the base class."""
        pass

    def load(self, filepath: Path):
        """Restore the model; no-op in the base class."""
        pass

In [None]:
# TODO: add features: total customer spend, number of customer transactions,
# number of hexes in which the customer has transactions.
# TODO: round 'sum', 'avg', 'min', 'max' to integers.
class SimpleFeaturesTransform(BaseTransform):
    """Build one feature row per customer from the transaction table.

    Columns produced by `transform`:
      * `max_median`, `min_median`, `avg_median`, `full_sum`: medians of the
        `max`, `min`, `avg` and `sum` columns over the customer's rows;
      * `mcc_<code>`, `dt_<id>`, `hex_<h3>_count`: share of the customer's
        `count_distinct` total per MCC code, datetime id and target hex;
      * `hex_<h3>_sum`: share of the customer's `sum` total per target hex.
    """

    def __init__(self, filepath=None):
        """Create the transform; when `filepath` is truthy, call `load(filepath)`."""
        super().__init__(filepath)

    def fit(self, transactions):
        """Remember the MCC codes and datetime ids seen in `transactions`.

        Together with the hexes from `hexses_target` they define a zero-filled
        row template, so every transformed frame has the same columns.
        """
        self.mcc_codes = set(transactions.mcc_code.unique())
        self.datetime_ids = set(transactions.datetime_id.unique())
        self.hexes = set(hexses_target)
        template = {f"mcc_{ mcc_code }": 0 for mcc_code in self.mcc_codes}
        template.update({f"dt_{ datetime_id }": 0 for datetime_id in self.datetime_ids})
        template.update({f"hex_{ h3_09 }_count": 0 for h3_09 in self.hexes})
        template.update({f"hex_{ h3_09 }_sum": 0 for h3_09 in self.hexes})
        self.template = template

    @staticmethod
    def _shares(group, key, value, total):
        """Sum `value` per `key` and divide by `total`; all zeros when `total` is 0."""
        sums = group.groupby(by=key)[value].sum()
        if not total:
            # Avoid inf/NaN features for a customer whose total is zero.
            return pd.Series(0.0, index=sums.index)
        return sums / total

    def transform(self, transactions):
        """Return a DataFrame indexed by customer id with alphabetically sorted columns.

        Requires `fit` to have been called. Keys that were not seen in `fit`
        (or hexes outside `hexses_target`) are skipped, so no new columns appear.
        """
        features = []
        row_labels = []
        for customer_id, group in transactions.groupby(by="customer_id"):
            row_labels.append(customer_id)
            chunk = self.template.copy()
            chunk["max_median"] = group['max'].median()
            chunk["min_median"] = group['min'].median()
            chunk["avg_median"] = group['avg'].median()
            # NOTE(review): the name says "full_sum" but the value is the median of
            # `sum`; left unchanged so the features stay comparable with earlier
            # runs. Confirm the intent.
            chunk["full_sum"] = group['sum'].median()

            # Customer totals are loop-invariant, so compute them once.
            total_count = group["count_distinct"].sum()
            total_sum = group["sum"].sum()

            # Which MCC codes the purchases fall into.
            for mcc_code, share in self._shares(group, "mcc_code", "count_distinct", total_count).items():
                if mcc_code in self.mcc_codes:
                    chunk[f"mcc_{ mcc_code }"] = share
            # At what time of day the purchases were made.
            for datetime_id, share in self._shares(group, "datetime_id", "count_distinct", total_count).items():
                if datetime_id in self.datetime_ids:
                    chunk[f"dt_{ datetime_id }"] = share
            # In which locations the purchases were made, by number of purchases.
            for h3_09, share in self._shares(group, "h3_09", "count_distinct", total_count).items():
                if h3_09 in self.hexes:
                    chunk[f"hex_{ h3_09 }_count"] = share
            # Same, by amount spent (original note: remove, it does not help).
            for h3_09, share in self._shares(group, "h3_09", "sum", total_sum).items():
                if h3_09 in self.hexes:
                    chunk[f"hex_{ h3_09 }_sum"] = share
            features.append(chunk)
        return pd.DataFrame(features, index=row_labels).pipe(lambda x: x.reindex(sorted(x.columns), axis=1))  # sort the columns

    def save(self, filepath):
        """Placeholder: nothing is written."""
        pass

    def load(self, filepath):
        """Placeholder: nothing is read."""
        pass

class CBModel(BaseModel):
    """One CatBoost binary classifier per hex, with a mean-rate fallback."""

    def __init__(self, transform=None, filepath=None):
        """Create an empty model.

        `self.models` is created before `BaseModel.__init__` runs, so that a
        `load` triggered by `filepath` is not overwritten afterwards.
        """
        self.models = dict()
        super().__init__(transform, filepath)

    def fit(self, transactions, target, min_positives=5):
        """Fit the transform and train a classifier for each sufficiently frequent hex.

        Args:
            transactions: transaction table passed to the feature transform.
            target: DataFrame with `customer_id` and `h3_09` columns, one row per
                positive (customer, hex) pair.
            min_positives: a hex gets its own classifier only when it has more
                than this many positive customers; other hexes fall back to
                their mean label in `predict`.
        """
        # Start from scratch: classifiers kept from a previous `fit` call would
        # otherwise still be used for hexes that are below the threshold now.
        self.models = dict()

        self.transform.fit(transactions)
        X = self.transform.transform(transactions)

        labels = (
            target
            .assign(customer_id=lambda x: x.customer_id.astype(int))
            .pipe(lambda x: pd.pivot(x.assign(v=1.), index='customer_id', columns='h3_09', values='v'))
            # Add ATM locations missing from the target; there are none, but just in case.
            .pipe(lambda x: pd.concat([x, pd.DataFrame({col: np.zeros(len(x)) for col in hexses_target if col not in x.columns}, index=x.index)], axis=1))
            .pipe(lambda x: x.reindex(sorted(x.columns), axis=1))  # sort the columns
            .fillna(0)
            # Align label rows with feature rows by customer id; the classifier
            # pairs X and y by position. Customers without target rows become
            # all-zero rows.
            .reindex(X.index.astype(int), fill_value=0.0)
        )
        self.means = labels.mean().to_dict()

        for col in tqdm(labels.columns, desc="Train models"):
            y = labels[col]
            if y.sum() > min_positives:
                model = CatBoostClassifier(iterations=5, random_state=random_state, verbose=0, boosting_type="Plain", allow_writing_files=False)
                model.fit(X, y)
                self.models[col] = model

    def predict(self, transactions):
        """Return a DataFrame with `customer_id` plus one probability column per target hex.

        Hexes without a trained classifier get the constant mean label seen in `fit`.
        """
        X = self.transform.transform(transactions)
        # Collect the columns first and build the frame once, instead of
        # inserting columns one by one into an existing frame.
        columns = {"customer_id": X.index}
        for col in tqdm(hexses_target, desc="Inference models"):
            if col in self.models:
                columns[col] = self.models[col].predict_proba(X)[:, 1]
            else:
                columns[col] = np.full(len(X), self.means[col])
        return pd.DataFrame(columns)

    def save(self, filepath: Path):
        """Placeholder: nothing is written."""
        pass

    def load(self, filepath: Path):
        """Placeholder: nothing is read."""
        pass

In [None]:
# To speed up research, first optimise the score on reduced data (25% of all customers).
# If a change improves the score on this sample, check the effect on the full data.
# NOTE: this cell overwrites target_df / transactions_df with the reduced frames, so
# running it again subsamples the already reduced data; reload the data first.
fast_mode = True

if fast_mode:
    num_quantiles = 10
    n_splits = 4
    # Bin customers by their number of target rows; the bins are the strata.
    customers = target_df.groupby(by='customer_id').size().to_frame("cnt")
    customers["bin"] = pd.qcut(customers["cnt"], num_quantiles, duplicates='drop')
    customer_idxs = customers.index.values
    groups = pd.Categorical(customers["bin"]).codes
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Keep only the test part of the first split, i.e. 1/n_splits of the customers.
    ids_train, ids_test = next(kfold.split(groups, groups))
    customers_small_list = customers.index.values[ids_test]

    target_df = target_df.loc[target_df.customer_id.isin(customers_small_list)]
    transactions_df = transactions_df.loc[transactions_df.customer_id.isin(customers_small_list)]

In [None]:
# Stratified 3-fold cross-validation over customers; the strata are quantile
# bins of the number of target rows per customer.
scores = []
preds = []
row_scores = []
col_scores = []
num_quantiles = 10
n_splits = 3
customers = (
    pd.DataFrame({"cnt": target_df.groupby(by='customer_id').size()})
    .pipe(lambda x: x.assign(bin=pd.qcut(x.cnt, num_quantiles, duplicates='drop')))
)
customer_idxs = customers.index.values
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
groups = pd.Categorical(customers.bin).codes
for fold, (ids_train, ids_test) in enumerate(kfold.split(groups, groups)):
    customers_train = customers.iloc[ids_train].index.values
    customers_test = customers.iloc[ids_test].index.values

    transactions_train = transactions_df.loc[transactions_df.customer_id.isin(customers_train)]
    target_train = target_df.loc[target_df.customer_id.isin(customers_train)]
    transactions_test = transactions_df.loc[transactions_df.customer_id.isin(customers_test)]
    target_test = target_df.loc[target_df.customer_id.isin(customers_test)]

    # A fresh model per fold: with one shared instance, per-hex classifiers
    # trained in an earlier fold (on customers that are in this fold's test
    # split) could still be used for prediction.
    model = CBModel(transform=SimpleFeaturesTransform())
    model.fit(transactions_train, target_train)

    score, row_score, col_score, pred = model.score(transactions_test, target_test, return_raw_score=True)
    col_score = col_score.to_frame()
    col_score["fold"] = fold
    # Short fold summary instead of dumping whole arrays of customer ids.
    print(
        f"fold {fold}: "
        f"train customers={transactions_train.customer_id.nunique()}, "
        f"test customers={transactions_test.customer_id.nunique()}, "
        f"score={score}"
    )
    scores.append(score)
    row_scores.append(row_score)
    col_scores.append(col_score)
    preds.append(pred)
# Combine the per-fold results into single frames used by the cells below.
preds = pd.concat(preds)
row_scores = pd.concat(row_scores).reset_index()
row_scores.columns = ["customer_id", "score"]
col_scores = pd.concat(col_scores).reset_index()
col_scores.columns = ["h3_09", "score", "fold"]

In [None]:
# Скоры на первой итерации кросс-валидации:
# 11.017292698892238 56:24 boosting_type="Ordered" iterations=5
# 12.076262937269291 43:40 boosting_type="Ordered" iterations=3
# 10.874190489487992 10:14 boosting_type="Plain" iterations=5    customers_small_list: 12.324778899109338 11.89429185482667 11.637939590418044
# 12.324067891640993

In [None]:
# Results are stored only for a run on the full data, never for the fast-mode sample.
assert not fast_mode
model_name = "catboost_plain_5iter"
model_dir = Path("..") / "models" / model_name
# exist_ok=False: fail instead of overwriting the results of an earlier run.
model_dir.mkdir(parents=True, exist_ok=False)
for frame, filename in ((row_scores, "row_scores.csv"), (col_scores, "col_scores.csv")):
    frame.to_csv(model_dir / filename, index=False)
preds.reset_index().to_parquet(model_dir / "preds.parquet", index=False)

In [None]:
# Quick look at the feature frame built from the first 5000 training transactions.
transactions_sample = transactions_train.iloc[:5000]
t = SimpleFeaturesTransform()
t.fit(transactions_sample)
X = t.transform(transactions_sample)
X

# На подумать

In [None]:
# Idea: use information from neighbouring hexes somehow; neighbours are found
# by the distance between hex centroids.
# The guard keeps the cell re-runnable: once h3_09 is the index, calling
# set_index("h3_09") again would raise a KeyError.
if "h3_09" in all_hexses.columns:
    all_hexses = all_hexses.set_index("h3_09")
all_hexses["centroid"] = all_hexses.centroid

# Five hexes closest to the chosen one (the hex itself comes first, at distance 0).
# Distances are in the units of the geometry coordinates; presumably degrees,
# since no CRS is set in the visible code. Confirm before using as metres.
target_hex = all_hexses.loc["8911aa7abd7ffff", "centroid"]
all_hexses["centroid"].distance(target_hex).sort_values().iloc[:5].reset_index()