# Uplift
## Целиком ноутбук будет выполняться больше 2 недель, ГДЕ пропускать ОТМЕЧЕНО

In [1]:
import pandas as pd
import datetime
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline
import lightgbm as lgbm
from xgboost import XGBClassifier
from tqdm import tqdm 

start=datetime.datetime.now()

def uplift_fit_predict(model, X_train, treatment_train, target_train, X_test):
    """
    Реализация простого способа построения uplift-модели.
    
    Обучаем два одинаковых бинарных классификатора, которые оценивают вероятность target для клиента:
    1. с которым была произведена коммуникация (treatment=1)
    2. с которым не было коммуникации (treatment=0)
    
    В качестве оценки uplift для нового клиента берется разница оценок вероятностей:
    Predicted Uplift = P(target|treatment=1) - P(target|treatment=0)
    """
    X_treatment, y_treatment = X_train[treatment_train == 1, :], target_train[treatment_train == 1]
    X_control, y_control = X_train[treatment_train == 0, :], target_train[treatment_train == 0]
    model_treatment = clone(model).fit(X_treatment, y_treatment)
    model_control = clone(model).fit(X_control, y_control)
    predict_treatment = model_treatment.predict_proba(X_test)[:, 1]
    predict_control = model_control.predict_proba(X_test)[:, 1]
    predict_uplift = predict_treatment - predict_control
    return predict_uplift

def uplift_fit_predict_2_models(model_treatment, model_control, X_train, treatment_train, target_train, X_test):
    """
    Обучаем два разных бинарных классификатора, которые оценивают вероятность target для клиента:
    1. с которым была произведена коммуникация (treatment=1)
    2. с которым не было коммуникации (treatment=0)

    """
    X_treatment, y_treatment = X_train[treatment_train == 1, :], target_train[treatment_train == 1]
    X_control, y_control = X_train[treatment_train == 0, :], target_train[treatment_train == 0]
    model_treatment.fit(X_treatment, y_treatment)
    model_control.fit(X_control, y_control)
    predict_treatment = model_treatment.predict_proba(X_test)[:, 1]
    predict_control = model_control.predict_proba(X_test)[:, 1]
    predict_uplift = predict_treatment - predict_control
    return predict_uplift

def max_absolute_value_2(dataframe):
    """
    Выбор из двух серий того значения с одинаковым индексом, которое больше по модулю

    """
    buff = []
    for i in range(0, dataframe.shape[0]):
        if abs(dataframe.iloc[i, 1]) > abs(dataframe.iloc[i, 0]):
            buff.append(dataframe.iloc[i, 1])
        else:
            buff.append(dataframe.iloc[i, 0])
    return pd.DataFrame({'uplift': list(buff)}) 

def max_absolute_value_3(dataframe):
    """
    Выбор из трех серий того значения с одинаковым индексом, которое больше по модулю

    """
    buff = []
    for i in range(0, dataframe.shape[0]):
        if abs(dataframe.iloc[i, 1]) > abs(dataframe.iloc[i, 0]):
            if abs(dataframe.iloc[i, 1]) > abs(dataframe.iloc[i, 2]):
                buff.append(dataframe.iloc[i, 1])
            else:
                buff.append(dataframe.iloc[i, 2])
        else:
            if abs(dataframe.iloc[i, 0]) > abs(dataframe.iloc[i, 2]):
                buff.append(dataframe.iloc[i, 0])
            else:
                buff.append(dataframe.iloc[i, 2])
    return buff

def max_absolute_value_4(dataframe):
    """
    Выбор из 4ч серий того значения с одинаковым индексом, которое больше по модулю

    """
    buff = []
    for i in range(0, dataframe.shape[0]):
        if abs(dataframe.iloc[i, 1]) > abs(dataframe.iloc[i, 0]):
            if abs(dataframe.iloc[i, 1]) > abs(dataframe.iloc[i, 2]):
                if abs(dataframe.iloc[i, 1]) > abs(dataframe.iloc[i, 3]):
                    buff.append(dataframe.iloc[i, 1])
                else:
                    buff.append(dataframe.iloc[i, 3])
            else:
                if abs(dataframe.iloc[i, 2]) > abs(dataframe.iloc[i, 3]):
                    buff.append(dataframe.iloc[i, 2])
                else:
                    buff.append(dataframe.iloc[i, 3])
        else:
            if abs(dataframe.iloc[i, 0]) > abs(dataframe.iloc[i, 2]):
                if abs(dataframe.iloc[i, 0]) > abs(dataframe.iloc[i, 3]):
                    buff.append(dataframe.iloc[i, 0])
                else:
                    buff.append(dataframe.iloc[i, 3])
            else:
                if abs(dataframe.iloc[i, 0]) > abs(dataframe.iloc[i, 3]):
                    buff.append(dataframe.iloc[i, 2])
                else:
                    buff.append(dataframe.iloc[i, 3])
    return buff

def uplift_fit_predict_2_concurrent_models(model_treatment_1, model_treatment_2, model_control_1, model_control_2, metric,
                                           X_train, treatment_train, target_train, X_test):
    """
    Обучение для каждой группы клиентов по 2 классификатора, предсказание которых определяется по метрике:
    mean - из двух предсказаний построчно выбирается их среднее
    max - из двух предсказаний построчно выбирается их максимум
    min - из двух предсказаний построчно выбирается их минимум
    max_abs - из двух предсказаний построчно выбирается то, которое больше по модулю, с преним знаком

    """
    X_treatment, y_treatment = X_train[treatment_train == 1, :], target_train[treatment_train == 1]
    X_control, y_control = X_train[treatment_train == 0, :], target_train[treatment_train == 0]
    model_treatment_1.fit(X_treatment, y_treatment)
    model_treatment_2.fit(X_treatment, y_treatment)
    model_control_1.fit(X_treatment, y_treatment)
    model_control_2.fit(X_control, y_control)
    
    if metric == 'mean':
        predict_treatment = pd.concat([pd.DataFrame(model_treatment_1.predict_proba(X_test)[:, 1]), 
                                       pd.DataFrame(model_treatment_2.predict_proba(X_test)[:, 1])],
                                      join='outer', axis = 1).mean(axis=1)

        predict_control = pd.concat([pd.DataFrame(model_control_1.predict_proba(X_test)[:, 1]), 
                                     pd.DataFrame(model_control_2.predict_proba(X_test)[:, 1])],
                                    join='outer', axis = 1).mean(axis=1)
    if metric == 'max':
        predict_treatment = pd.concat([pd.DataFrame(model_treatment_1.predict_proba(X_test)[:, 1]), 
                                       pd.DataFrame(model_treatment_2.predict_proba(X_test)[:, 1])],
                                      join='outer', axis = 1).max(axis=1)

        predict_control = pd.concat([pd.DataFrame(model_control_1.predict_proba(X_test)[:, 1]), 
                                     pd.DataFrame(model_control_2.predict_proba(X_test)[:, 1])],
                                    join='outer', axis = 1).max(axis=1)
        
    if metric == 'min':
        predict_treatment = pd.concat([pd.DataFrame(model_treatment_1.predict_proba(X_test)[:, 1]), 
                                       pd.DataFrame(model_treatment_2.predict_proba(X_test)[:, 1])],
                                      join='outer', axis = 1).min(axis=1)

        predict_control = pd.concat([pd.DataFrame(model_control_1.predict_proba(X_test)[:, 1]), 
                                     pd.DataFrame(model_control_2.predict_proba(X_test)[:, 1])],
                                    join='outer', axis = 1).min(axis=1)
        
    if metric == 'max_abs':
        predict_treatment = max_absolute_value_2(pd.concat([pd.DataFrame(model_treatment_1.predict_proba(X_test)[:, 1]), 
                                       pd.DataFrame(model_treatment_2.predict_proba(X_test)[:, 1])],
                                      join='outer', axis = 1))

        predict_control = max_absolute_value_2(pd.concat([pd.DataFrame(model_control_1.predict_proba(X_test)[:, 1]), 
                                     pd.DataFrame(model_control_2.predict_proba(X_test)[:, 1])],
                                    join='outer', axis = 1))
    
    predict_uplift = predict_treatment - predict_control
    return predict_uplift

def uplift_score(prediction, treatment, target, rate=0.3):
    """
    Подсчет Uplift Score
    """
    order = np.argsort(-prediction)
    treatment_n = int((treatment == 1).sum() * rate)
    treatment_p = target[order][treatment[order] == 1][:treatment_n].mean()
    control_n = int((treatment == 0).sum() * rate)
    control_p = target[order][treatment[order] == 0][:control_n].mean()
    score = treatment_p - control_p
    return score

def important_feats_for_model(model, features):
    """
    Определение наиболее важных признаков для модели
    """
    X_train=features.loc[indices_train, :].fillna(0).values,
    treatment_train=df_train.loc[indices_train, 'treatment_flg'].values,
    target_train=df_train.loc[indices_train, 'target'].values,
    X_treatment, y_treatment = X_train[treatment_train == 1], target_train[treatment_train == 1]
    X_control, y_control = X_train[treatment_train == 0], target_train[treatment_train == 0]
    model_treatment = clone(model).fit(X_treatment, y_treatment)
    model_control = clone(model).fit(X_control, y_control)
    print(pd.DataFrame(model_treatment.feature_importances_, features.columns))

In [2]:
# Чтение данных
df_train = pd.read_csv('data/uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('data/uplift_test.csv', index_col='client_id')

# МОЖНО ПРОПУСТИТЬ

In [3]:
# Добавляю данные о покупках и товарах
df_products = pd.read_csv('data/products.csv')
df_purchases = pd.read_csv('data/purchases.csv')
df_clients = pd.read_csv('data/clients.csv', index_col='client_id')

In [4]:
# Извлечение признаков
df_clients['first_redeem_unixtime'] = pd.Series([y.timestamp() for y in 
                                                 [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in 
                                                  df_clients['first_issue_date']]]).values
df_clients['first_issue_unixtime'] = pd.Series([x.timestamp() for x in 
                                                [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in 
                                                 df_clients['first_redeem_date'].fillna('1990-01-01 01:01:01')]]).values
df_clients['issue_redeem_delay'] = df_clients['first_redeem_unixtime'] - df_clients['first_issue_unixtime']

In [5]:
last_cols = ['regular_points_received', 'express_points_received','regular_points_spent', 'express_points_spent', 'purchase_sum','store_id']
all_hist = df_purchases.groupby(['client_id','transaction_id'])[last_cols].last()
last_month = df_purchases[df_purchases['transaction_datetime'] > '2019-02-18'].groupby(['client_id','transaction_id'])[last_cols].last()

features =  pd.concat([all_hist.groupby('client_id')['purchase_sum'].count(),
                       last_month.groupby('client_id')['purchase_sum'].count(),
                       all_hist.groupby('client_id').sum(),
                       all_hist.groupby('client_id')[['store_id']].nunique(),
                       last_month.groupby('client_id').sum(),
                       last_month.groupby('client_id')[['store_id']].nunique(),
                      ],axis = 1)
features.columns = ['total_trans_count','last_month_trans_count']+list(c+"_sum_all" for c in last_cols)+list(c+"_sum_last_month" for c in last_cols)
df_clients[list(features.columns)] = features

In [6]:
temp_df_clients = pd.read_csv('data/clients.csv', index_col='client_id', parse_dates=['first_issue_date','first_redeem_date'])
df_clients['first_issue_date_weekday'] = temp_df_clients['first_issue_date'].dt.weekday
df_clients['first_redeem_date_weekday'] = temp_df_clients['first_redeem_date'].dt.weekday
df_clients['first_issue_date_hour'] = temp_df_clients['first_issue_date'].dt.hour
df_clients['first_redeem_date_hour'] = temp_df_clients['first_redeem_date'].dt.hour
del temp_df_clients, all_hist, last_month

In [7]:
# Часы, в которые клиенты совершали покупки
df_purchases['transaction_hour'] = pd.Series([x.hour for x in 
           [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in 
            df_purchases['transaction_datetime']]])
df_temp = df_purchases[['client_id','transaction_hour']].groupby(['client_id','transaction_hour']).nunique()
users, hours, nums = [], [], []
for i in range(0, len(df_temp.index)):
    users.append(df_temp.index[i][0])
    hours.append(df_temp.index[i][1])
    nums.append(df_temp.values[i][0])
boughts_by_hours = pd.DataFrame(data=zip(users, hours, nums), columns=['client_id', 'hours', 'nums'])
boughts_by_hours_dum = pd.get_dummies(boughts_by_hours['hours'])
boughts_by_hours_dum['client_id'] = boughts_by_hours['client_id']
boughts_by_hours_dum = boughts_by_hours_dum.groupby(['client_id']).sum().astype(int)
boughts_by_hours_dum.columns = ['0h', '1h', '2h', '3h', '4h', '5h', '6h', '7h', '8h', '9h', '10h', '11h', '12h', '13h', 
                                '14h', '15h', '16h', '17h', '18h', '19h', '20h', '21h', '22h', '23h']
df_clients = pd.concat([df_clients, boughts_by_hours_dum], join='outer', axis = 1)

In [8]:
# Дни, в которые клиенты совершали покупки

df_purchases['transaction_day'] = pd.Series([x.weekday() for x in 
           [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in 
            df_purchases['transaction_datetime']]])
df_temp = df_purchases[['client_id','transaction_day']].groupby(['client_id','transaction_day']).nunique()
users, days, nums = [], [], []
for i in range(0, len(df_temp.index)):
    users.append(df_temp.index[i][0])
    days.append(df_temp.index[i][1])
    nums.append(df_temp.values[i][0])
boughts_by_days = pd.DataFrame(data=zip(users, days, nums), columns=['client_id', 'days', 'nums'])
boughts_by_days_dum = pd.get_dummies(boughts_by_days['days'])
boughts_by_days_dum['client_id'] = boughts_by_days['client_id']
boughts_by_days_dum = boughts_by_days_dum.groupby(['client_id']).sum().astype('category')
boughts_by_days_dum.columns = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
df_clients = pd.concat([df_clients, boughts_by_days_dum], join='outer', axis = 1)

In [9]:
del users, hours, days, nums, boughts_by_hours, boughts_by_hours_dum, boughts_by_days, boughts_by_days_dum

In [10]:
# Конверт дат в юникс
dt_list = [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in df_purchases['transaction_datetime']]
df_purchases['transaction_datetime'] = pd.Series([y.timestamp() for y in dt_list]).values
del dt_list

In [11]:
# Пол в категорию
df_clients['gender_M'] = (df_clients['gender'] == 'M').astype('category')
df_clients['gender_F'] = (df_clients['gender'] == 'F').astype('category')
df_clients['gender_U'] = (df_clients['gender'] == 'U').astype('category')

In [12]:
# Количество товаров куплено клиентом
df_clients['qty_items'] = df_purchases[['client_id', 'transaction_id']].groupby(['client_id']).count()
# Потрачено клиентом
df_clients['amount'] = df_purchases[['client_id', 'purchase_sum']].groupby(['client_id']).sum()
# Количество покупок клиентом 
df_clients['n_purchases'] = df_purchases.groupby(['client_id']).transaction_id.nunique()
# Количество магазинов, посещенных клиентом 
df_clients['n_shops'] = df_purchases.groupby(['client_id']).store_id.nunique()
# Максимальная сумма чека клиента
df_clients['max_purch_sum'] = df_purchases.groupby(['client_id']).purchase_sum.max()
# Минимальная сумма чека клиента
df_clients['min_purch_sum'] = df_purchases.groupby(['client_id']).purchase_sum.min().fillna(1).replace(0, 1)

In [13]:
# Наличие алкоголя в чеке
df_clients['has_alco'] = (pd.merge(df_purchases, df_products[['product_id', 'is_alcohol']], on = 'product_id', 
                                   sort=False).groupby(['client_id']).is_alcohol.sum()>0).astype('category')
# Средний чек покупателя
df_clients['avg_check'] = df_clients['amount'] / df_clients['n_purchases']
# Периодичность покупок
df_clients['purch_freq'] = (df_purchases.groupby(['client_id']).transaction_datetime.max() - 
                            df_purchases.groupby(['client_id']).transaction_datetime.min())/df_clients['n_purchases']
# Наличие алкоголя и товаров своей марки в чеке
df_clients['has_alco_and_own'] = ((pd.merge(df_purchases, df_products[['product_id', 'is_alcohol']], on = 'product_id', 
                                   sort=False).groupby(['client_id']).is_alcohol.sum()>0) & 
                                 (pd.merge(df_purchases, df_products[['product_id', 'is_own_trademark']], on = 'product_id', 
                                   sort=False).groupby(['client_id']).is_own_trademark.mean()>0.5)).astype('category')

In [14]:
# Возраст покупателя
df_clients['age_error'] = ((df_clients['age'] <= 10) & (df_clients['age'] > 99)).astype('category')
df_clients['age_1'] = ((df_clients['age'] >10) & (df_clients['age'] <= 18)).astype('category')
df_clients['age_2'] = ((df_clients['age'] >18) & (df_clients['age'] <= 25)).astype('category')
df_clients['age_3'] = ((df_clients['age'] >25) & (df_clients['age'] <= 35)).astype('category')
df_clients['age_4'] = ((df_clients['age'] >35) & (df_clients['age'] <= 45)).astype('category')
df_clients['age_5'] = ((df_clients['age'] >45) & (df_clients['age'] <= 55)).astype('category')
df_clients['age_6'] = ((df_clients['age'] >55) & (df_clients['age'] <= 99)).astype('category')

In [15]:
# Категирии по суммам и количеству купленных наименований
df_clients['client_cat_1'] = (df_clients['qty_items'] <= 45).astype('category')
df_clients['client_cat_2'] = ((df_clients['qty_items'] >45) & (df_clients['qty_items'] <= 85)).astype('category')
df_clients['client_cat_3'] = ((df_clients['qty_items'] >85) & (df_clients['qty_items'] <= 155)).astype('category')
df_clients['client_cat_4'] = (df_clients['qty_items'] > 155).astype('category')

df_clients['client_sum_1'] = (df_clients['min_purch_sum'] <= 50).astype('category')
df_clients['client_sum_2'] = ((df_clients['min_purch_sum'] >50) & (df_clients['min_purch_sum'] <= 150)).astype('category')
df_clients['client_sum_3'] = ((df_clients['min_purch_sum'] >150) & (df_clients['min_purch_sum'] <= 900)).astype('category')
df_clients['client_sum_4'] = (df_clients['min_purch_sum'] > 900).astype('category')
df_clients['client_sum_extra'] = ((df_clients['min_purch_sum'] > 1500) & (df_clients['max_purch_sum'] >15000)).astype('category')

df_clients['sums_relation'] = df_clients['max_purch_sum'] / df_clients['min_purch_sum']

In [16]:
# Любимые и нелюбимые категории товаров
little_df = pd.merge(df_purchases[['client_id', 'product_id']], df_products[['product_id', 'segment_id']], 
                     on = 'product_id', sort=True).groupby(['client_id', 'segment_id']).count().reset_index()
little_df = little_df.sort_values(by = ['client_id', 'product_id'],ascending=False)
little_df_max = little_df.groupby(['client_id']).max().to_dict()
little_df_min = little_df.groupby(['client_id']).min().to_dict()

little_df_max = pd.DataFrame.from_dict(little_df_max, orient='index').transpose()
little_df_min = pd.DataFrame.from_dict(little_df_min, orient='index').transpose()

df_clients['client_fav_cat'] = little_df_max.segment_id
df_clients['client_unfav_cat'] = little_df_min.segment_id
del little_df, little_df_max, little_df_min

In [17]:
df_clients['regular_points_received'] = df_purchases[['client_id', 'transaction_id', 'regular_points_received']].groupby(['client_id']).sum()
df_clients['regular_points_spent'] = df_purchases[['client_id', 'transaction_id', 'regular_points_spent']].groupby(['client_id']).sum().replace(0, 1)
df_clients['regular_points_balance'] = df_clients['regular_points_received'] + df_clients['regular_points_spent']
df_clients['bonuses'] = round(df_clients['amount']/df_clients['regular_points_spent'], 2)
df_clients['max_product_quantity'] = df_purchases[['client_id', 'product_quantity']].groupby(['client_id']).max()

In [18]:
df_clients['express_points_received'] = df_purchases[['client_id', 'transaction_id', 'express_points_received']].groupby(['client_id']).sum()
df_clients['express_points_spent'] = df_purchases[['client_id', 'transaction_id', 'express_points_spent']].groupby(['client_id']).sum().replace(0, 1)
df_clients['express_points_balance'] = df_clients['express_points_received'] + df_clients['express_points_spent']
df_clients['express_bonuses'] = round(df_clients['express_points_spent']/df_clients['amount'], 2)

In [20]:
# Возможные социальные категории
df_clients['possible_pension']=((((df_clients['7h'])+(df_clients['8h'])+(df_clients['9h'])+(df_clients['10h'])+
                                  (df_clients['11h'])+(df_clients['12h']))>0) & (df_clients['age']>60)).astype('category')
df_clients['possible_worker']=(((df_clients['17h']+df_clients['18h']+df_clients['19h']+df_clients['20h']+
                                 df_clients['21h'])>0) & (df_clients['age']>16)).astype('category')
df_clients['possible_hard_worker']=(((df_clients['1h']+df_clients['2h']+df_clients['3h']+df_clients['23h']+
                                      df_clients['22h']+df_clients['21h'])>0) & (df_clients['age']>20) & 
                                    (df_clients['has_alco'].astype(int))>0).astype('category')
df_clients['shops_for_weekend'] = (((df_clients['fri'].astype(int)+df_clients['sat'].astype(int))>1) & 
                                   df_clients['has_alco'].astype(int)>0).astype('category')
df_clients['shops_for_week'] = ((df_clients['fri'].astype(int)+df_clients['sat'].astype(int)+
                                 df_clients['sun'].astype(int))==1).astype('category')

## Нормирование признаков

In [21]:
# Нормирую время
df_clients['norm_first_issue_unixtime'] = df_clients['first_issue_unixtime']/df_clients['first_issue_unixtime'].values.mean()
df_clients['norm_first_redeem_unixtime'] = df_clients['first_redeem_unixtime']/df_clients['first_redeem_unixtime'].values.mean()

In [22]:
# Потрачено клиентом нормированное
df_clients['norm_amount'] = df_clients['amount']/df_clients['amount'].values.mean()

In [23]:
# Количество магазинов, посещенных клиентом и чек нормированные
df_clients['norm_n_shops'] = np.log(df_clients['n_shops'])
df_clients['norm_avg_check'] = np.log(df_clients['avg_check'])
df_clients['norm_qty_items'] = np.log(df_clients['qty_items'])
df_clients['norm_min_purch_sum'] = np.log(df_clients['min_purch_sum'])

In [35]:
features_to_operate = ['total_trans_count', 'last_month_trans_count', 'regular_points_received_sum_all', 
                       'express_points_received_sum_all', 'regular_points_spent_sum_all', 'express_points_spent_sum_all', 
                       'purchase_sum_sum_all', 'purchase_sum_sum_all', 'regular_points_received_sum_last_month', 
                       'express_points_received_sum_last_month', 'regular_points_spent_sum_last_month', 
                       'express_points_spent_sum_last_month', 'purchase_sum_sum_last_month', 'qty_items', 
                       'amount', 'n_purchases', 'max_purch_sum', 'min_purch_sum', 'avg_check', 'sums_relation', 
                       'regular_points_received', 'regular_points_spent', 'regular_points_balance', 'bonuses', 
                       'max_product_quantity', 'express_points_received', 'express_points_spent', 'express_points_balance',
                       'express_bonuses']

In [37]:
for i in range(0, len(features_to_operate)):
    for j in range(0, len(features_to_operate)):
        if i!=j:
            df_clients[features_to_operate[i] + '_sum_' + features_to_operate[j]] = (df_clients[features_to_operate[i]] + df_clients[features_to_operate[j]])
            df_clients[features_to_operate[i] + '_multi_' + features_to_operate[j]] = (df_clients[features_to_operate[i]] * df_clients[features_to_operate[j]])

In [39]:
# Экспорт на случай если все рухнет
df_clients.to_csv('D:/df_clients.csv')
datetime.datetime.now()-start

datetime.timedelta(seconds=5802, microseconds=691259)

In [40]:
# Освобождаю память
del df_products
del df_purchases


# ДО СЮДА

In [3]:
# Чтение после обвала
df_clients = pd.read_csv('D:/df_clients.csv', index_col='client_id')

In [4]:
# Удалить нормированные признаки
# features = list(df_clients.columns[2:-7].values) # удаляю 'first_issue_date', 'first_redeem_date'
features = list(df_clients.columns[2:].values) # удаляю 'first_issue_date', 'first_redeem_date'
#df_clients[features].fillna(0)
features.remove('gender')

# Выбор признаков

## Отбираем важные для модели фичи

In [None]:
# Отбираем важные для модели фичи
import feature_selector
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

In [None]:
from feature_selector import FeatureSelector
# Для treatment_flg
fs = FeatureSelector(df_clients[features].loc[indices_learn, :].fillna(0), df_train.loc[indices_learn, 'treatment_flg'])

In [None]:
# признаки, имеющие более 60% пропусков
fs.identify_missing(missing_threshold = 0.01)

In [None]:
# доли незаполненных значений для каждого параметра
fs.missing_stats.max()

In [None]:
# признаки для удаления
fs.ops['missing']

In [None]:
fs.identify_collinear(correlation_threshold = 0.99)

# признаки коллинеарные
fs.record_collinear

In [None]:
# Heatmap
fs.plot_collinear(plot_all=True)

In [None]:
# Для target
fs = FeatureSelector(df_clients[features].loc[indices_learn, :].fillna(0), df_train.loc[indices_learn, 'target'])
# Oтбор признаков с нулевой важностью для target
fs.identify_zero_importance(task = 'classification', 
                            eval_metric = 'auc', 
                            n_iterations = 10)

# Hормализованные показатели важности plot_n самых значимых признаков
fs.plot_feature_importances(threshold = 0.99)

In [None]:
to_delete = list(fs.ops['zero_importance'])
to_delete

In [16]:
for element in to_delete:
    features.remove(element)

## Исследуем фичи

# МОЖНО ПРОПУСТИТЬ

In [None]:
# Аномалий во взаимосвязях не выявлено
for i in range(0, len(features)):
    for j in range(0, len(features)):
        plt.rcParams['figure.figsize'] = (10,8)
        plt.scatter(df_clients[features[i]].loc[indices_train[:200000]], df_clients[features[j]].loc[indices_train[:200000]],
               c='green', s=3)
        plt.scatter(df_clients[features[i]].loc[indices_test[:200000]], df_clients[features[j]].loc[indices_test[:200000]],
               c='red', s=2)
        plt.ylabel(features[i])
        plt.xlabel(features[j])
        plt.show()

In [None]:
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

for i in range(4, len(features)):
    plt.rcParams['figure.figsize'] = (10,8)
    plt.title('Распределение целевого признака ' + str(features[i]) + 'для групп покупателей')
    plt.subplot(2, 2, 1)
    plt.hist(df_clients[features[i]].loc[indices_train], color='green')
    plt.subplot(2, 2, 2)
    plt.hist(df_clients[features[i]].loc[indices_test], color='red')
    plt.title('Распределение целевого признака ' + str(features[i]) + ' для групп покупателей')
    plt.show()

# Построение предсказания одним алгоритмом для 0 и 1
## GradientBoostingClassifier

In [None]:
# Оценка качества на валидации по подобранным n_estimators = 170, max_depth = 1

indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

valid_uplift = uplift_fit_predict(
    model=GradientBoostingClassifier(n_estimators = 170, max_depth = 1),
    X_train=df_clients[features].loc[indices_learn, :].fillna(0).values,
    treatment_train=df_train.loc[indices_learn, 'treatment_flg'].values,
    target_train=df_train.loc[indices_learn, 'target'].values,
    X_test=df_clients[features].loc[indices_valid, :].fillna(0).values,
)
valid_score = uplift_score(
    valid_uplift,
    treatment=df_train.loc[indices_valid, 'treatment_flg'].values,
    target=df_train.loc[indices_valid, 'target'].values,
)
print('Validation score:', valid_score)

#Max validation score: 0.08205685972385873 = # Public: 0,0908
# 0.07510360333141991 = 0,0788
# 0.0767923502220611 = 0,0837
# 0.07512500377654918 = 0,0859

In [None]:
# Подготовка предсказаний для тестовых клиентов

test_uplift = uplift_fit_predict(
    model=GradientBoostingClassifier(n_estimators = 170, max_depth = 1),
    X_train=df_clients[features].loc[indices_train, :].fillna(0).values,
    treatment_train=df_train.loc[indices_train, 'treatment_flg'].values,
    target_train=df_train.loc[indices_train, 'target'].values,
    X_test=df_clients[features].loc[indices_test, :].fillna(0).values,
)

df_submission = pd.DataFrame({'uplift': test_uplift}, index=df_test.index)
df_submission.to_csv('submission34.csv')


## AdaBoostClassifier

In [None]:
# Оценка качества на валидации по ADABOOST по подобранным learning_rate = 0.1, n_estimators = 100

indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

valid_uplift = uplift_fit_predict(
    model=AdaBoostClassifier(learning_rate = 0.1, n_estimators = 100, n_jobs=-1),
    X_train=df_clients[features].loc[indices_learn, :].fillna(0).values,
    treatment_train=df_train.loc[indices_learn, 'treatment_flg'].values,
    target_train=df_train.loc[indices_learn, 'target'].values,
    X_test=df_clients[features].loc[indices_valid, :].fillna(0).values,
)
valid_score = uplift_score(
    valid_uplift,
    treatment=df_train.loc[indices_valid, 'treatment_flg'].values,
    target=df_train.loc[indices_valid, 'target'].values,
)
print('Validation score:', valid_score)

# Max validation score: 0.08268628458060162 = # Public: 0,0803
# 0.07839801303161226 = 0,0765
# 0.07623342094927343 = 0,0767

In [None]:
# Подготовка предсказаний для тестовых клиентов

test_uplift = uplift_fit_predict(
    model=AdaBoostClassifier(learning_rate = 0.1, n_estimators = 100, n_jobs=-1),
    X_train=df_clients[features].loc[indices_train, :].fillna(0).values,
    treatment_train=df_train.loc[indices_train, 'treatment_flg'].values,
    target_train=df_train.loc[indices_train, 'target'].values,
    X_test=df_clients[features].loc[indices_test, :].fillna(0).values,
)

df_submission = pd.DataFrame({'uplift': test_uplift}, index=df_test.index)
df_submission.to_csv('submission35.csv')

## LGBMClassifier

In [None]:
# Оценка качества на валидации по LGBM

import lightgbm as lgbm
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

valid_uplift = uplift_fit_predict(
    model=lgbm.LGBMClassifier(learning_rate = 0.03, max_depth = 5, num_leaves = 20,
             min_data_in_leaf = 3, application = 'binary',
             subsample = 0.8, colsample_bytree = 0.8,
             reg_alpha = 0.01,data_random_seed = 42,metric = 'binary_logloss',
             max_bin = 416,bagging_freq = 3,reg_lambda = 0.01, n_estimators = 1000),
    X_train=df_clients[features].loc[indices_learn, :].fillna(0).values,
    treatment_train=df_train.loc[indices_learn, 'treatment_flg'].values,
    target_train=df_train.loc[indices_learn, 'target'].values,
    X_test=df_clients[features].loc[indices_valid, :].fillna(0).values,
)
valid_score = uplift_score(
    valid_uplift,
    treatment=df_train.loc[indices_valid, 'treatment_flg'].values,
    target=df_train.loc[indices_valid, 'target'].values,
)
print('Validation score:', valid_score)

# Max validation score: 0.058409997280884585 = 0,0745
# 0.058173962959605974 = 0,0819
# 0.06286884296605144 = 0,0773

In [None]:
# Подготовка предсказаний для тестовых клиентов

test_uplift = uplift_fit_predict(
    model=lgbm.LGBMClassifier(learning_rate = 0.03, max_depth = 5, num_leaves = 20,
             min_data_in_leaf = 3, application = 'binary',
             subsample = 0.8, colsample_bytree = 0.8,
             reg_alpha = 0.01,data_random_seed = 42,metric = 'binary_logloss',
             max_bin = 416,bagging_freq = 3,reg_lambda = 0.01, n_estimators = 1000),
    X_train=df_clients[features].loc[indices_train, :].fillna(0).values,
    treatment_train=df_train.loc[indices_train, 'treatment_flg'].values,
    target_train=df_train.loc[indices_train, 'target'].values,
    X_test=df_clients[features].loc[indices_test, :].fillna(0).values,
)

df_submission = pd.DataFrame({'uplift': test_uplift}, index=df_test.index)
df_submission.to_csv('submission36.csv')


## XGBClassifier

In [None]:
# Оценка качества на валидации по XGBClassifier
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

valid_uplift = uplift_fit_predict(
    model=XGBClassifier(),
    X_train=df_clients[features].loc[indices_learn, :].fillna(0).values,
    treatment_train=df_train.loc[indices_learn, 'treatment_flg'].values,
    target_train=df_train.loc[indices_learn, 'target'].values,
    X_test=df_clients[features].loc[indices_valid, :].fillna(0).values,
)
valid_score = uplift_score(
    valid_uplift,
    treatment=df_train.loc[indices_valid, 'treatment_flg'].values,
    target=df_train.loc[indices_valid, 'target'].values,
)
print('Validation score:', valid_score)

# 0.06186994571840043 = 0,0918
# 0.06186994571840043 = 
# 0.06186994571840043

In [None]:
# Подготовка предсказаний для тестовых клиентов

test_uplift = uplift_fit_predict(
    model=XGBClassifier(),
    X_train=df_clients[features].loc[indices_train, :].fillna(0).values,
    treatment_train=df_train.loc[indices_train, 'treatment_flg'].values,
    target_train=df_train.loc[indices_train, 'target'].values,
    X_test=df_clients[features].loc[indices_test, :].fillna(0).values,
)

df_submission = pd.DataFrame({'uplift': test_uplift}, index=df_test.index)
df_submission.to_csv('submission37.csv')


# Подбор гиперпараметров для моделей по отдельности

In [None]:
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)
X_train=df_clients[features].loc[indices_learn, :].fillna(0).values
treatment_train=df_train.loc[indices_learn, 'treatment_flg'].values
target_train=df_train.loc[indices_learn, 'target'].values
X_test=df_clients[features].loc[indices_valid, :].fillna(0).values
X_treatment, y_treatment = X_train[treatment_train == 1, :], target_train[treatment_train == 1]
X_control, y_control = X_train[treatment_train == 0, :], target_train[treatment_train == 0]

## XGBClassifier

In [None]:
from xgboost import XGBClassifier
parameters = {'max_depth': range(1, 6), 
              'tree_method': ['auto', 'exact', 'approx'],# 'hist', 'gpu_hist'],
              'updater': ['grow_colmaker', 'distcol', 'grow_histmaker'], # 'grow_local_histmaker', # 'grow_skmaker', 'grow_quantile_histmaker', 'sync', 'refresh'], 
              'eta': np.arange(0.01, 0.21, 0.02),
              'num_parallel_tree': [1, 2, 3]}

model=XGBClassifier()

clf = GridSearchCV(model, parameters, cv = 3, verbose=True)
model_treatment_XGBC = clf.fit(X_treatment, y_treatment)

In [None]:
model_control_XGBC = clf.fit(X_control, y_control)

In [None]:
model_treatment_XGBC.best_params_, model_treatment_XGBC.best_score_

In [None]:
model_control_XGBC.best_params_, model_control_XGBC.best_score_

## GradientBoostingClassifier

In [None]:
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)
X_train=df_clients[features].loc[indices_learn, :].fillna(0).values
treatment_train=df_train.loc[indices_learn, 'treatment_flg'].values
target_train=df_train.loc[indices_learn, 'target'].values
X_test=df_clients[features].loc[indices_valid, :].fillna(0).values
X_treatment, y_treatment = X_train[treatment_train == 1, :], target_train[treatment_train == 1]
X_control, y_control = X_train[treatment_train == 0, :], target_train[treatment_train == 0]

In [None]:
parameters = {'n_estimators': range(150, 250, 10),
              'criterion': ['friedman_mse', 'mse', 'mae'],
              'loss': ['deviance', 'exponential'],
              'max_depth': range(1, 4), 
              'learning_rate': np.arange(0.05, 0.14, 0.02), 
              'min_samples_leaf':[1, 2]}

clf = GridSearchCV(GradientBoostingClassifier(), parameters, cv = 3, n_jobs = -1, verbose=True)
model_treatment_gbc = clf.fit(X_treatment, y_treatment)

In [None]:
model_control_gbc = clf.fit(X_control, y_control)
# в каггле

In [None]:
model_treatment_gbc.best_params_, model_treatment_gbc.best_score_

In [None]:
model_control_gbc.best_params_, model_control_gbc.best_score_

## AdaBoostClassifier

In [None]:
parameters = {'n_estimators': range(40, 250, 10), 
              'learning_rate': np.arange(0.01, 0.21, 0.02)}

model=AdaBoostClassifier()

clf = GridSearchCV(model, parameters, cv = 3, n_jobs = -1, verbose=True)
model_treatment_adaboost = clf.fit(X_treatment, y_treatment)

In [None]:
model_control_adaboost = clf.fit(X_control, y_control)

In [None]:
model_treatment_adaboost.best_params_, model_treatment_adaboost.best_score_

In [None]:
model_control_adaboost.best_params_, model_control_adaboost.best_score_

## LGBMClassifier

In [None]:
import lightgbm as lgbm
parameters = {'max_depth': range(1, 4), 
              'n_estimators': range(150, 250, 10),
              'learning_rate': np.arange(0.01, 0.21, 0.02)}

model=lgbm.LGBMClassifier()

clf = GridSearchCV(model, parameters, cv = 3, n_jobs = -1, verbose=True)
model_treatment_LGBM = clf.fit(X_treatment, y_treatment)

In [None]:
model_control_LGBM = clf.fit(X_control, y_control)

In [None]:
model_treatment_LGBM.best_params_, model_treatment_LGBM.best_score_

In [None]:
model_control_LGBM.best_params_, model_control_LGBM.best_score_

# Предсказание по 2 моделям, разным для таргета и тритмента

In [8]:
#features = features[:2000]
#features = features[:94]

In [None]:
start=datetime.datetime.now()
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

models_treatment = [GradientBoostingClassifier(learning_rate=0.11000000000000001, max_depth=3, min_samples_leaf=1, n_estimators=170),
                    lgbm.LGBMClassifier(learning_rate = 0.06999999999999999, max_depth=3, n_estimators=180),
                    AdaBoostClassifier(learning_rate=0.18999999999999997, n_estimators=230),
                    XGBClassifier(eta=0.01, max_depth=4)]

models_control = [GradientBoostingClassifier(learning_rate=0.05, max_depth=3, min_samples_leaf=1, n_estimators=240),
                  lgbm.LGBMClassifier(learning_rate=0.06999999999999999, max_depth=3, n_estimators=180),
                  AdaBoostClassifier(learning_rate=0.10999999999999997, n_estimators=230),
                  XGBClassifier(eta=0.01, max_depth=4)]

from tqdm import tqdm 
i=0
for model_treatment in tqdm(models_treatment):
    for model_control in tqdm(models_control):
        test_uplift = uplift_fit_predict_2_models(model_treatment, model_control,
            X_train=df_clients[features].loc[indices_train, :].fillna(0).values,
            treatment_train=df_train.loc[indices_train, 'treatment_flg'].values,
            target_train=df_train.loc[indices_train, 'target'].values,
            X_test=df_clients[features].loc[indices_test, :].fillna(0).values)
        
        df_submission = pd.DataFrame({'uplift': test_uplift}, index=df_test.index)
        df_submission.to_csv('отправить/3(фичикатегории)/submission3000_'+str(i)+'.csv')
        i += 1
        
print(datetime.datetime.now()-start)

  0%|                                                    | 0/4 [00:00<?, ?it/s]
  0%|                                                    | 0/4 [00:00<?, ?it/s]

# Все 16 решений отравлены, выбираю 4 лучших и комбинирую

## Комбинация из 4 решений

In [None]:
godn_submission1 = pd.read_csv(r'отправить/отправлено/submission1.csv', index_col='client_id')
godn_submission13 = pd.read_csv(r'отправить/отправлено/submission13.csv', index_col='client_id')
godn_submission2_0 = pd.read_csv(r'отправить/отправлено/submission2_0.csv', index_col='client_id')
godn_submission37 = pd.read_csv(r'отправить/отправлено/submission37.csv', index_col='client_id')

In [None]:
effective_subs = pd.concat([godn_submission1, godn_submission13, godn_submission2_0, godn_submission37], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:4].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:4].min(axis=1)
effective_subs['abs_max'] = effective_subs.iloc[:, 0:4].abs().max(axis=1)
effective_subs['abs_min'] = effective_subs.iloc[:, 0:4].abs().min(axis=1)
for i in range(0, 5):
    effective_subs.iloc[:, 4+i].to_csv(r'отправить/отправлено/submissions_compositions'+str(i)+'.csv')
    i+=1

In [None]:
effective_subs = pd.concat([godn_submission1, godn_submission13, godn_submission2_0], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:3].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:3].min(axis=1)
effective_subs['abs_max'] = effective_subs.iloc[:, 0:3].abs().max(axis=1)
effective_subs['abs_min'] = effective_subs.iloc[:, 0:3].abs().min(axis=1)
for i in range(0, 5):
    effective_subs.iloc[:, 3+i].to_csv(r'отправить/submissions_compositions_2'+str(i)+'.csv')
    i+=1

Лучше всего зашли минимум, максимум и среднее по 4м лучшим решениям

## Комбинации из 3 решений

In [None]:
effective_subs = pd.concat([godn_submission1, godn_submission13, godn_submission37], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:3].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:3].min(axis=1)

for i in range(0, 3):
    effective_subs.iloc[:, 3+i].to_csv(r'отправить/submissions_compositions_3'+str(i)+'.csv')
    i+=1
# 0,0912, 0,0861, 0,0928

In [None]:
effective_subs = pd.concat([godn_submission1, godn_submission2_0, godn_submission37], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:3].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:3].min(axis=1)

for i in range(0, 3):
    effective_subs.iloc[:, 3+i].to_csv(r'отправить/submissions_compositions_4'+str(i)+'.csv')
    i+=1
# 0,0941 0,0878 0,0927

In [None]:
effective_subs = pd.concat([godn_submission13, godn_submission2_0, godn_submission37], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:3].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:3].min(axis=1)

for i in range(0, 3):
    effective_subs.iloc[:, 3+i].to_csv(r'отправить/submissions_compositions_5'+str(i)+'.csv')
    i+=1

## Комбинации из 2 решений

In [None]:
effective_subs = pd.concat([godn_submission1, godn_submission13], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:2].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:2].min(axis=1)

for i in range(0, 3):
    effective_subs.iloc[:, 2+i].to_csv(r'отправить/submissions_compositions_6'+str(i)+'.csv')
    i+=1

In [None]:
effective_subs = pd.concat([godn_submission1, godn_submission2_0], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:2].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:2].min(axis=1)

for i in range(0, 3):
    effective_subs.iloc[:, 2+i].to_csv(r'отправить/submissions_compositions_7'+str(i)+'.csv')
    i+=1

In [None]:
effective_subs = pd.concat([godn_submission1, godn_submission37], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:2].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:2].min(axis=1)

for i in range(0, 3):
    effective_subs.iloc[:, 2+i].to_csv(r'отправить/submissions_compositions_8'+str(i)+'.csv')
    i+=1

In [None]:
effective_subs = pd.concat([godn_submission13, godn_submission2_0], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:2].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:2].min(axis=1)

for i in range(0, 3):
    effective_subs.iloc[:, 2+i].to_csv(r'отправить/submissions_compositions_9'+str(i)+'.csv')
    i+=1

In [None]:
effective_subs = pd.concat([godn_submission13, godn_submission37], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:2].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:2].min(axis=1)

for i in range(0, 3):
    effective_subs.iloc[:, 2+i].to_csv(r'отправить/submissions_compositions_10'+str(i)+'.csv')
    i+=1

In [None]:
effective_subs = pd.concat([godn_submission2_0, godn_submission37], join='outer', axis = 1)
effective_subs['mean'] = effective_subs.mean(axis=1)
effective_subs['max'] = effective_subs.iloc[:, 0:2].max(axis=1)
effective_subs['min'] = effective_subs.iloc[:, 0:2].min(axis=1)

for i in range(0, 3):
    effective_subs.iloc[:, 2+i].to_csv(r'отправить/submissions_compositions_11'+str(i)+'.csv')
    i+=1

# 3 решения с лучшим паблик скором объединяю по абсолютному значению (скор не побил)

In [None]:
best_1 = pd.read_csv(r'отправить/submissions_compositions0.csv', index_col='client_id')
best_2 = pd.read_csv(r'отправить/submissions_compositions_102.csv', index_col='client_id')
best_3 = pd.read_csv(r'отправить/submissions_compositions_111.csv', index_col='client_id')
best_df = pd.concat([best_1, best_2, best_3], join='outer', axis = 1) 

In [None]:
best_df = pd.DataFrame({'uplift': list(max_absolute_value_3(best_df))}, index=best_1.index)

In [None]:
best_df.to_csv(r'отправить/best_df.csv')
# 0,0939

# ДО СЮДА

In [18]:
best_1 = pd.read_csv(r'отправить/3 лучших/submissions_compositions0.csv', index_col='client_id')
best_2 = pd.read_csv(r'отправить/3 лучших/submissions_compositions_102.csv', index_col='client_id')
best_3 = pd.read_csv(r'отправить/3 лучших/submissions_compositions_111.csv', index_col='client_id')
best_df = pd.concat([best_1, best_2, best_3], join='outer', axis = 1) 

# 3 решения с лучшим паблик скором запускаю как признаки

In [19]:
df_clients['Predict_1'] = best_1
df_clients['Predict_2'] = best_2
df_clients['Predict_3'] = best_3

In [20]:
features = features + ['Predict_1','Predict_2','Predict_3']
df_clients[features].fillna(0)

Unnamed: 0_level_0,age,first_redeem_unixtime,first_issue_unixtime,issue_redeem_delay,total_trans_count,last_month_trans_count,regular_points_received_sum_all,express_points_received_sum_all,regular_points_spent_sum_all,express_points_spent_sum_all,...,norm_qty_items,norm_min_purch_sum,possible_pension,possible_worker,possible_hard_worker,shops_for_weekend,shops_for_week,Predict_1,Predict_2,Predict_3
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000012768d,45,1.501937e+09,1.515083e+09,-13146559.0,4,2,25.7,0.0,0.0,0.0,...,3.951244,6.037871,0,0,0,0,0,0.000000,0.000000,0.000000
000036f903,72,1.491822e+09,1.492940e+09,-1118613.0,32,8,54.9,60.0,0.0,0.0,...,5.087596,4.060443,1,0,0,1,0,0.000000,0.000000,0.000000
000048b7a6,68,1.544870e+09,6.311449e+08,913725130.0,8,1,26.5,0.0,0.0,0.0,...,4.025352,4.727388,1,0,0,0,0,-0.015392,-0.027889,-0.009250
000073194a,60,1.495533e+09,1.511511e+09,-15978107.0,17,6,74.9,0.0,-96.0,0.0,...,4.406719,4.873975,0,0,0,1,0,0.039149,0.034640,0.037661
00007c7133,67,1.495459e+09,1.546266e+09,-50806825.0,11,1,56.6,0.0,-240.0,0.0,...,4.418841,5.765191,1,1,0,0,0,0.046945,0.044241,0.054794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffece623e,67,1.526213e+09,1.545927e+09,-19713429.0,24,5,38.4,0.0,-68.0,0.0,...,4.762174,3.367296,1,1,0,1,0,0.039210,0.027638,0.050383
ffff3dfff8,56,1.541078e+09,1.544193e+09,-3115615.0,17,5,117.9,0.0,-182.0,0.0,...,4.564348,2.708050,0,1,0,1,0,0.044752,0.040587,0.049524
ffffaab9da,23,1.503676e+09,1.512745e+09,-9068995.0,7,2,34.0,0.0,-6.0,0.0,...,4.094345,5.398163,0,1,0,0,0,0.022525,0.014677,0.063253
ffffeb5619,62,1.512470e+09,6.311449e+08,881324727.0,7,6,117.5,0.0,0.0,0.0,...,4.595120,5.375278,1,1,0,1,0,-0.008957,-0.024093,0.004311


# Предсказание по 2 конкурентным моделям, разным для таргета и тритмента

In [21]:
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)
X_train=df_clients[features].loc[indices_learn, :].fillna(0).values
treatment_train=df_train.loc[indices_learn, 'treatment_flg'].values
target_train=df_train.loc[indices_learn, 'target'].values
X_test=df_clients[features].loc[indices_valid, :].fillna(0).values
X_treatment, y_treatment = X_train[treatment_train == 1, :], target_train[treatment_train == 1]
X_control, y_control = X_train[treatment_train == 0, :], target_train[treatment_train == 0]


models_treatment = [GradientBoostingClassifier(learning_rate=0.11000000000000001, max_depth=3, min_samples_leaf=1, n_estimators=170),
                    lgbm.LGBMClassifier(learning_rate = 0.06999999999999999, max_depth=3, n_estimators=180),
                    XGBClassifier(eta=0.01, max_depth=4, updater='grow_colmaker')]

models_control = [GradientBoostingClassifier(learning_rate=0.05, max_depth=3, min_samples_leaf=1, n_estimators=240),
                 lgbm.LGBMClassifier(learning_rate=0.06999999999999999, max_depth=3, n_estimators=180),
                  XGBClassifier(eta=0.01, max_depth=4)]

for element in tqdm(['mean', 'min']): #'max_abs', 'max' показали результат хуже
    for i in tqdm(range(0, 3)):
        for j in tqdm(range(0,3)):

            test_uplift = uplift_fit_predict_2_concurrent_models(
                model_treatment_1 = models_treatment[i],
                model_treatment_2 = models_treatment[(i+1)%3], 
                model_control_1 = models_control[j], 
                model_control_2 = models_control[(j+1)%3],
                metric = element,
                X_train=df_clients[features].loc[indices_train, :].fillna(0).values,
                treatment_train=df_train.loc[indices_train, 'treatment_flg'].values,
                target_train=df_train.loc[indices_train, 'target'].values,
                X_test=df_clients[features].loc[indices_test, :].fillna(0).values)

            df_submission = pd.DataFrame(data=test_uplift.values, index=df_test.index)
            df_submission.columns = ['uplift']
            df_submission.to_csv(r'отправить/0/submission_'+element+str(i)+str((i+1)%3)+str(j)+str((j+1)%3)+'.csv')
            print('submission_'+element+str(i)+str((i+1)%3)+str(j)+str((j+1)%3)+'.csv')

  0%|                                                    | 0/2 [00:00<?, ?it/s]
  0%|                                                    | 0/3 [00:00<?, ?it/s]

  0%|                                                    | 0/3 [00:00<?, ?it/s]

submission_mean0101.csv




 33%|██████████████▎                            | 1/3 [05:12<10:24, 312.14s/it]

submission_mean0112.csv




 67%|████████████████████████████▋              | 2/3 [08:05<04:30, 270.59s/it]

submission_mean0120.csv




100%|███████████████████████████████████████████| 3/3 [13:37<00:00, 288.81s/it]
 33%|██████████████▎                            | 1/3 [13:37<27:14, 817.12s/it]

  0%|                                                    | 0/3 [00:00<?, ?it/s]

submission_mean1201.csv




 33%|██████████████▎                            | 1/3 [03:56<07:52, 236.29s/it]

submission_mean1212.csv




 67%|████████████████████████████▋              | 2/3 [05:57<03:21, 201.73s/it]

submission_mean1220.csv




100%|███████████████████████████████████████████| 3/3 [10:32<00:00, 223.67s/it]
 67%|████████████████████████████▋              | 2/3 [24:09<12:41, 761.66s/it]

  0%|                                                    | 0/3 [00:00<?, ?it/s]

submission_mean2001.csv




 33%|██████████████▎                            | 1/3 [05:53<11:46, 353.28s/it]

submission_mean2012.csv




 67%|████████████████████████████▋              | 2/3 [09:37<05:14, 314.49s/it]

submission_mean2020.csv




100%|███████████████████████████████████████████| 3/3 [16:14<00:00, 339.44s/it]
 50%|█████████████████████                     | 1/2 [40:24<40:24, 2424.30s/it]
  0%|                                                    | 0/3 [00:00<?, ?it/s]

  0%|                                                    | 0/3 [00:00<?, ?it/s]

submission_min0101.csv




 33%|██████████████▎                            | 1/3 [05:16<10:32, 316.07s/it]

submission_min0112.csv




 67%|████████████████████████████▋              | 2/3 [08:23<04:37, 277.55s/it]

submission_min0120.csv




100%|███████████████████████████████████████████| 3/3 [13:51<00:00, 292.55s/it]
 33%|██████████████▎                            | 1/3 [13:51<27:42, 831.30s/it]

  0%|                                                    | 0/3 [00:00<?, ?it/s]

submission_min1201.csv




 33%|██████████████▎                            | 1/3 [03:53<07:47, 233.79s/it]

submission_min1212.csv




 67%|████████████████████████████▋              | 2/3 [05:54<03:19, 199.88s/it]

submission_min1220.csv




100%|███████████████████████████████████████████| 3/3 [10:31<00:00, 223.10s/it]
 67%|████████████████████████████▋              | 2/3 [24:23<12:51, 771.47s/it]

  0%|                                                    | 0/3 [00:00<?, ?it/s]

submission_min2001.csv




 33%|██████████████▎                            | 1/3 [05:37<11:14, 337.29s/it]

submission_min2012.csv




 67%|████████████████████████████▋              | 2/3 [09:31<05:06, 306.37s/it]

submission_min2020.csv




100%|███████████████████████████████████████████| 3/3 [16:00<00:00, 331.24s/it]
100%|████████████████████████████████████████| 2/2 [1:20:48<00:00, 2424.19s/it]
