### Описание полей

Label - целевая метка
period - год и месяц
subject_type - тип региона
subject_name - название региона
city_name - название города
hex - код гексогона
hex_lat - широта гексогона
hex_lon - долгота гексогона
f1-f30 - дополнительная информация о клиенте

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklift.metrics import uplift_at_k
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier

In [2]:
df_train = pd.read_csv('./data/train.csv', delimiter=';')
df_test = pd.read_csv('./data/test.csv', delimiter=';')

# df_train = pd.read_csv('./data/train_catugra.csv', delimiter=',')
# df_test = pd.read_csv('./data/test_catugra.csv', delimiter=',')

df_train.info(), df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294258 entries, 0 to 294257
Data columns (total 38 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   label         294258 non-null  int64  
 1   period        294258 non-null  object 
 2   subject_type  294258 non-null  object 
 3   subject_name  294258 non-null  object 
 4   city_name     294247 non-null  object 
 5   hex           294258 non-null  object 
 6   hex_lat       294258 non-null  float64
 7   hex_lon       294258 non-null  float64
 8   f1            258144 non-null  float64
 9   f2            262930 non-null  float64
 10  f3            265786 non-null  float64
 11  f4            279564 non-null  float64
 12  f5            279564 non-null  float64
 13  f6            281984 non-null  float64
 14  f7            148829 non-null  float64
 15  f8            280209 non-null  float64
 16  f9            280209 non-null  float64
 17  f10           294258 non-null  float64
 18  f11 

(None, None)

In [3]:
df_train

Unnamed: 0,label,period,subject_type,subject_name,city_name,hex,hex_lat,hex_lon,f1,f2,...,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30
0,1,2020-05-01,Город,Москва,Москва,8611aa7a7ffffff,55.729458,37.516569,0.00101,0.00103,...,0.13027,0.00000,0.00000,,,,,,0.01737,0.0
1,1,2020-05-01,Город,Москва,Москва,8611aa01fffffff,55.975851,37.237085,0.00000,0.00027,...,0.08756,0.00000,0.00000,,,,,,0.01152,0.0
2,1,2020-05-01,Город,Москва,Москва,861181b6fffffff,55.622721,37.695121,0.00339,0.00313,...,0.09243,0.00000,0.00000,0.11053,0.57895,0.00526,0.02105,0.00000,0.01540,0.0
3,1,2020-05-01,Город,Москва,Москва,8611aa017ffffff,55.941586,37.157487,0.00048,0.00054,...,0.10192,0.00000,0.00049,,,,,,0.01495,0.0
4,1,2020-05-01,Город,Москва,Москва,8611aa637ffffff,55.797494,37.676200,0.00164,0.00179,...,0.09620,0.00000,0.00000,0.14444,0.64444,0.01111,0.04444,0.01111,0.01266,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294253,0,2020-12-01,Край,Хабаровский,Рощино,8614d6497ffffff,48.364768,134.996232,0.00111,0.00100,...,0.41667,0.26667,0.00000,0.33929,0.78571,0.00000,0.01786,0.03571,0.01667,0.0
294254,0,2020-12-01,Край,Приморский,Шкотово,86316c8cfffffff,43.309803,132.345989,0.00075,0.00065,...,0.42424,0.08333,0.00000,0.09449,0.80315,0.00787,0.00000,0.05512,0.02273,0.0
294255,0,2020-12-01,Край,Приморский,Партизанск,862ecdb2fffffff,43.057826,133.155582,0.00018,0.00014,...,0.20000,0.08333,0.00000,0.07547,0.81132,0.00000,0.00000,0.01887,0.00000,0.0
294256,0,2020-12-01,Край,Камчатский,Вилючинск,861616747ffffff,52.908685,158.406443,0.00075,0.00138,...,0.31990,0.08768,0.00000,,,,,,0.06398,0.0


### Подготовка данных

#### Время

In [4]:
df_train['period'] = (pd.to_datetime(df_train['period']) - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
df_test['period'] = (pd.to_datetime(df_test['period']) - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')

df_train

Unnamed: 0,label,period,subject_type,subject_name,city_name,hex,hex_lat,hex_lon,f1,f2,...,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30
0,1,1588291200,Город,Москва,Москва,8611aa7a7ffffff,55.729458,37.516569,0.00101,0.00103,...,0.13027,0.00000,0.00000,,,,,,0.01737,0.0
1,1,1588291200,Город,Москва,Москва,8611aa01fffffff,55.975851,37.237085,0.00000,0.00027,...,0.08756,0.00000,0.00000,,,,,,0.01152,0.0
2,1,1588291200,Город,Москва,Москва,861181b6fffffff,55.622721,37.695121,0.00339,0.00313,...,0.09243,0.00000,0.00000,0.11053,0.57895,0.00526,0.02105,0.00000,0.01540,0.0
3,1,1588291200,Город,Москва,Москва,8611aa017ffffff,55.941586,37.157487,0.00048,0.00054,...,0.10192,0.00000,0.00049,,,,,,0.01495,0.0
4,1,1588291200,Город,Москва,Москва,8611aa637ffffff,55.797494,37.676200,0.00164,0.00179,...,0.09620,0.00000,0.00000,0.14444,0.64444,0.01111,0.04444,0.01111,0.01266,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294253,0,1606780800,Край,Хабаровский,Рощино,8614d6497ffffff,48.364768,134.996232,0.00111,0.00100,...,0.41667,0.26667,0.00000,0.33929,0.78571,0.00000,0.01786,0.03571,0.01667,0.0
294254,0,1606780800,Край,Приморский,Шкотово,86316c8cfffffff,43.309803,132.345989,0.00075,0.00065,...,0.42424,0.08333,0.00000,0.09449,0.80315,0.00787,0.00000,0.05512,0.02273,0.0
294255,0,1606780800,Край,Приморский,Партизанск,862ecdb2fffffff,43.057826,133.155582,0.00018,0.00014,...,0.20000,0.08333,0.00000,0.07547,0.81132,0.00000,0.00000,0.01887,0.00000,0.0
294256,0,1606780800,Край,Камчатский,Вилючинск,861616747ffffff,52.908685,158.406443,0.00075,0.00138,...,0.31990,0.08768,0.00000,,,,,,0.06398,0.0


#### Аналог модели Саши без допданных

In [5]:
N = 5
top5p = int(df_test.shape[0] * 0.05)
random_res = []

for i in range(N):
    preds = df_test.label.sample(df_test.shape[0]).reset_index(drop=True)
    df_test['pred'] = preds
    random_res.append(df_test.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p)

np.mean(random_res), random_res

(0.00968840010473946,
 [0.010473946059177796,
  0.01073579471065724,
  0.00968840010473946,
  0.007331762241424456,
  0.01021209740769835])

In [6]:
cat_features = df_train.select_dtypes(include=['object', 'category']).columns.tolist()
df_train[cat_features] = df_train[cat_features].fillna('Unknown')


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294258 entries, 0 to 294257
Data columns (total 38 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   label         294258 non-null  int64  
 1   period        294258 non-null  int64  
 2   subject_type  294258 non-null  object 
 3   subject_name  294258 non-null  object 
 4   city_name     294258 non-null  object 
 5   hex           294258 non-null  object 
 6   hex_lat       294258 non-null  float64
 7   hex_lon       294258 non-null  float64
 8   f1            258144 non-null  float64
 9   f2            262930 non-null  float64
 10  f3            265786 non-null  float64
 11  f4            279564 non-null  float64
 12  f5            279564 non-null  float64
 13  f6            281984 non-null  float64
 14  f7            148829 non-null  float64
 15  f8            280209 non-null  float64
 16  f9            280209 non-null  float64
 17  f10           294258 non-null  float64
 18  f11 

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop('label',1), df_train['label'], test_size=0.3, random_state=42)


  X_train, X_test, y_train, y_test = train_test_split(df_train.drop('label',1), df_train['label'], test_size=0.3, random_state=42)


In [9]:
model = CatBoostClassifier(cat_features=cat_features, verbose=200)
model.fit(df_train.drop('label',1), df_train['label'])


  model.fit(df_train.drop('label',1), df_train['label'])


Learning rate set to 0.116703
0:	learn: 0.5048202	total: 181ms	remaining: 3m
200:	learn: 0.0867089	total: 20.4s	remaining: 1m 20s
400:	learn: 0.0827169	total: 36.8s	remaining: 54.9s
600:	learn: 0.0796330	total: 53.6s	remaining: 35.6s
800:	learn: 0.0766326	total: 1m 10s	remaining: 17.4s
999:	learn: 0.0738079	total: 1m 26s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x13feb5340>

In [10]:
y_predict = model.predict(X_test)

In [11]:
y_predict

array([0, 0, 0, ..., 0, 0, 0])

In [12]:
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix

def evaluate_results(y_test, y_predict):
    """ Проверка качества модели с помощью DS-метрик"""
    from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score

    f1 = f1_score(y_test, y_predict)
    roc = roc_auc_score(y_test, y_predict)
    precision = precision_score(y_test, y_predict, average='binary')
    recall = recall_score(y_test, y_predict, average='binary')

    return f1, round(roc,2), precision, recall

In [13]:
evaluate_results(y_test, y_predict)

(0.24611610793131644, 0.57, 0.9741100323624595, 0.14085166120729994)

In [14]:
val_df = pd.read_csv('./data/test.csv', delimiter=';')
val_df['period'] = (pd.to_datetime(val_df['period']) - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
val_df[cat_features] = val_df[cat_features].fillna('Unknown')

preds = model.predict_proba(val_df.drop('label', axis=1))[:,1]
val_df['pred'] = preds

In [15]:
top5p = int(val_df.shape[0] * 0.05)
res_response = val_df.sort_values('pred', ascending=False).iloc[:top5p].label.sum()/top5p
res_response

0.096229379418696

In [16]:
average_precision_score(val_df['label'], preds)

0.06964379050964528

### Uplift

In [17]:
from sklearn.utils.validation import check_consistent_length
def plot_uplift_preds_(trmnt_preds, ctrl_preds, log=False, bins=100):
    """Plot histograms of treatment, control and uplift predictions.

    Args:
        trmnt_preds (1d array-like): Predictions for all observations if they are treatment.
        ctrl_preds (1d array-like): Predictions for all observations if they are control.
        log (bool): Logarithm of source samples. Default is False.
        bins (integer or sequence): Number of histogram bins to be used. Default is 100.
            If an integer is given, bins + 1 bin edges are calculated and returned.
            If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last bin.
            In this case, bins is returned unmodified. Default is 100.

    Returns:
        Object that stores computed values.
    """

    # TODO: Add k as parameter: vertical line on plots
    check_consistent_length(trmnt_preds, ctrl_preds)

    if not isinstance(bins, int) or bins <= 0:
        raise ValueError(
            f'Bins should be positive integer. Invalid value for bins: {bins}')

    if log:
        trmnt_preds = np.log(trmnt_preds + 1)
        ctrl_preds = np.log(ctrl_preds + 1)

    fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(20, 7))
    axes[0].hist(
        trmnt_preds, bins=bins, alpha=0.3, color='b', label='Treated', histtype='stepfilled')
    axes[0].set_ylabel('Probability hist')
    axes[0].legend()
    axes[0].set_title('Treatment predictions')

    axes[1].hist(
        ctrl_preds, bins=bins, alpha=0.5, color='y', label='Not treated', histtype='stepfilled')
    axes[1].legend()
    axes[1].set_title('Control predictions')

    axes[2].hist(
        trmnt_preds - ctrl_preds, bins=bins, alpha=0.5, color='green', label='Uplift', histtype='stepfilled')
    axes[2].legend()
    axes[2].set_title('Uplift predictions')

    return axes

In [18]:
train_one = pd.read_csv('./data/train.csv', delimiter=';')
test = pd.read_csv('./data/test.csv', delimiter=';')

In [19]:
train_zero = train_one.copy()
train_one['treatment'] = 1
train_zero['treatment'] = 0

In [21]:
train_all = pd.concat([train_zero, train_one], axis=0).sample(300000)
train_all = train_all.sample(frac=1).reset_index(drop=True)

train_all['period'] = (pd.to_datetime(train_all['period']) - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
train_all[cat_features] = train_all[cat_features].fillna('Unknown')

# train_all = train_all.drop('Unnamed: 0',1)
train_all

Unnamed: 0,label,period,subject_type,subject_name,city_name,hex,hex_lat,hex_lon,f1,f2,...,f22,f23,f24,f25,f26,f27,f28,f29,f30,treatment
0,0,1604188800,Республика,Хакасия,Абакан,8625260efffffff,53.728903,91.390959,0.00078,0.00077,...,0.00388,0.00000,,,,,,0.01357,0.0000,1
1,0,1596240000,Республика,Мордовия,Саранск,8610a0727ffffff,54.203445,45.134350,0.00015,0.00025,...,0.05492,0.00000,,,,,,0.01398,0.0000,1
2,1,1606780800,Край,Камчатский,Петропавловск-Камчатский,861616237ffffff,53.078910,158.684128,0.00018,0.00034,...,0.09138,0.00000,,,,,,0.05918,0.0008,0
3,0,1604188800,Область,Иркутская,Братск,860a91cd7ffffff,56.173292,101.615950,0.00087,0.00077,...,0.00331,0.00066,0.02063,0.60046,0.00306,0.00993,0.88694,0.00927,0.0000,1
4,0,1596240000,Область,Оренбургская,Саракташ,862120d97ffffff,51.780432,56.323167,0.00033,0.00036,...,0.05677,0.00000,0.12097,0.89516,0.00000,0.02419,0.04032,0.01747,0.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,1606780800,Край,Приморский,Владивосток,86316c087ffffff,43.098244,131.960035,0.00029,0.00126,...,0.13818,0.00000,0.11558,0.72362,0.00502,0.01005,0.02513,0.02546,0.0000,1
299996,0,1596240000,Республика,Удмуртская,Камбарка,8610f286fffffff,56.298126,54.221195,0.00006,0.00006,...,0.05251,0.00000,,,,,,0.00228,0.0000,1
299997,0,1590969600,Область,Московская,Химки,8611aa727ffffff,55.869897,37.417020,0.00104,0.00149,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0
299998,0,1590969600,Область,Брянская,Клинцы,8611b421fffffff,52.779633,32.202767,0.00018,0.00015,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.01339,0.0000,1


In [22]:
indices = train_all.index
indices_train, indices_test = train_test_split(train_all.index, test_size = 0.3, random_state = 42)

In [23]:
X_train = train_all.loc[indices_train, :].drop(columns = ['label','treatment'])
y_train = train_all.loc[indices_train, 'label']
treat_train = train_all.loc[indices_train, 'treatment']

X_test = train_all.loc[indices_test, :].drop(columns = ['label','treatment'])
y_test = train_all.loc[indices_test, 'label']
treat_test =  train_all.loc[indices_test, 'treatment']

In [24]:
models_results = {'approach' : [],
                  'uplift@05%': []}

##### SoloMdl

In [25]:
from sklift.models import SoloModel

In [None]:
sm = SoloModel(CatBoostClassifier(iterations=500, thread_count=2, random_state=42, silent=True))
sm = sm.fit(X_train, y_train, treat_train, estimator_fit_params={'cat_features': cat_features})
uplift_sm = sm.predict(X_test)
sm_score_1 = uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_test, strategy='by_group', k=0.05)

# и добавим значения в наш словарь
models_results['approach'].append('SoloModel')
models_results['uplift@05%'].append(sm_score_1)

plot_uplift_preds_(trmnt_preds=sm.trmnt_preds_, ctrl_preds=sm.ctrl_preds_)

##### TwoMdl

In [None]:
from sklift.models import TwoModels

In [None]:
tm = TwoModels(
    estimator_trmnt=CatBoostClassifier(iterations=10, thread_count=2, random_state=42, silent=True),
    estimator_ctrl=CatBoostClassifier(iterations=10, thread_count=2, random_state=42, silent=True),
    method='vanilla'
)

In [None]:
tm = tm.fit(
    X_train, y_train, treat_train,
    estimator_trmnt_fit_params={'cat_features': cat_features},
    estimator_ctrl_fit_params={'cat_features': cat_features}
)

In [None]:
uplift_tm = tm.predict(X_test)

tm_score_1 = uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_test, strategy='by_group', k=0.05)

models_results['approach'].append('TwoModels')
models_results['uplift@05%'].append(tm_score_1)

# Отобразим результаты на графике
plot_uplift_preds_(trmnt_preds=tm.trmnt_preds_, ctrl_preds=tm.ctrl_preds_)

#### Трансформация класса

In [None]:
from sklift.models import ClassTransformation

ct = ClassTransformation(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))
ct = ct.fit(X_train, y_train, treat_train, estimator_fit_params={'cat_features': cat_features})

uplift_ct = ct.predict(X_test)

ct_score_1 = uplift_at_k(y_true=y_test, uplift=uplift_ct, treatment=treat_test, strategy='by_group', k=0.05)

models_results['approach'].append('ClassTransform')
models_results['uplift@05%'].append(ct_score_1)

In [None]:
pd.DataFrame(data=models_results)