In [1]:
!pip install -U xgboost



In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
RANDOM_STATE=42

In [4]:
# Загрузка данных
orders = pd.read_csv(r"C:\Users\ysx12\Downloads\data_secrets_first_cup\Data Secrets First Cup\orders.csv")
train = pd.read_csv(r"C:\Users\ysx12\Downloads\data_secrets_first_cup\Data Secrets First Cup\train_target.csv")
october = pd.read_csv(r"C:\Users\ysx12\Downloads\data_secrets_first_cup\Data Secrets First Cup\clients_promo_october.csv")
mobile = pd.read_csv(r"C:\Users\ysx12\Downloads\data_secrets_first_cup\Data Secrets First Cup\mobile_events.csv")
test = pd.read_csv(r"C:\Users\ysx12\Downloads\data_secrets_first_cup\Data Secrets First Cup\test.csv")
submit = pd.read_csv(r"C:\Users\ysx12\Downloads\data_secrets_first_cup\Data Secrets First Cup\submit.csv")
orders['Date'] = pd.to_datetime(orders['Date'])

In [5]:
orders.head()

Unnamed: 0,OrderUUId,addressId,deliverySectorId,ClientUUId,Date,SaleDate,UnitUUId,NewClient,ClientOrderNumber,ProductUUId,CategoryId,ProductTotalPrice,MenuPrice,OrderState,OrderPaymentType,OrderTotalPrice,OrderType,apply_promo
0,020DD76CCD949AE011EDE031ED5D76FD,,,000D3AAC9DCABB2E11EBE0614C89D044,2023-04-21,2023-04-21T20:51:26.000Z,000D3A2480C380DA11E692CFB9F92484,0,18,000D3A240C71BE9A11E719BE2AB264A6,4,1.0,45.0,4,2,1.0,3,0
1,02164D9B0F67BC3311EE610987AF48C7,0.0,0.0,000D3A21DA51A81411EAF10674007D74,2023-10-02,2023-10-02T21:31:32.000Z,000D3A26B5B080DF11E745FD0474B95C,0,29,11EC9AABAD354BE051CCF1EE1F83A8D0,1,485.0,485.0,4,2,1164.0,1,0
2,02164D9B0F67BC3311EE610987AF48C7,0.0,0.0,000D3A21DA51A81411EAF10674007D74,2023-10-02,2023-10-02T21:31:32.000Z,000D3A26B5B080DF11E745FD0474B95C,0,29,11EB846583CB3BDEA80D1A7F2E184D40,1,589.0,589.0,4,2,1164.0,1,0
3,02164D9B0F67BC3311EE610987AF48C7,0.0,0.0,000D3A21DA51A81411EAF10674007D74,2023-10-02,2023-10-02T21:31:32.000Z,000D3A26B5B080DF11E745FD0474B95C,0,29,000D3A240C71BE9A11E719BE2AB267FD,4,45.0,45.0,4,2,1164.0,1,0
4,02164D9B0F67BC3311EE610987AF48C7,0.0,0.0,000D3A21DA51A81411EAF10674007D74,2023-10-02,2023-10-02T21:31:32.000Z,000D3A26B5B080DF11E745FD0474B95C,0,29,000D3A240C71BE9A11E719BE2AB264A6,4,45.0,45.0,4,2,1164.0,1,0


In [6]:
# Установим дату для расчета
current_date = pd.to_datetime('2023-11-01')

In [7]:
# Функция для генерации признаков на основе данных заказов
def create_features_for_user(df, current_date):
    features = {}
    features['ClientUUId'] = df['ClientUUId'].iloc[0]

    # Признаки по заказам
    last_month_orders = df[df['Date'] >= current_date - pd.DateOffset(months=1)]
    features['num_orders_last_month'] = len(last_month_orders)
    
    df = df.sort_values('Date')
    df['days_since_last_order'] = df['Date'].diff().dt.days
    features['mean_days_between_orders'] = df['days_since_last_order'].mean()
    
    last_order_date = df['Date'].max()
    features['days_since_last_order'] = (current_date - last_order_date).days
    
    first_order_date = df['Date'].min()
    features['days_between_first_and_last_order'] = (last_order_date - first_order_date).days
    
    workdays_orders = df[df['Date'].dt.weekday < 5]  # Заказы по будням
    features['num_orders_workdays'] = len(workdays_orders)
    
    # Признаки использования промокодов
    features['days_since_last_promo'] = (current_date - df[df['apply_promo'] == 1]['Date'].max()).days if df['apply_promo'].any() else 1000
    features['promo_used_last_month'] = int(len(last_month_orders[last_month_orders['apply_promo'] == 1]) > 0)

    return features

In [8]:
os.makedirs("data", exist_ok=True)
# Создание датафрейма признаков на основе заказов
features_list = [create_features_for_user(group, current_date) for _, group in orders.groupby('ClientUUId')]
features_df = pd.DataFrame(features_list)
features_df.to_csv("data/date_features.csv", index=False)

In [9]:
# Подготовка мобильных данных
mobile = pd.get_dummies(mobile, columns=['EventName', 'Platform'], dtype=int, prefix='')
mobile = mobile.groupby('ClientUUId').sum().reset_index()

In [10]:
# Расчёт скидки как разница между полной ценой и ценой по меню
orders['discont'] = orders['MenuPrice'] - orders['ProductTotalPrice']

# Кодирование типа оплаты
orders = pd.get_dummies(orders, columns=['OrderPaymentType'], prefix='pay')

# Группировка по клиенту для вычисления средних и суммарных значений
orders_features = orders.groupby('ClientUUId').agg(
    avg_discont=('discont', 'mean'),        # Средняя скидка
    sum_discont=('discont', 'sum'),         # Общая сумма скидок
    in_delivery_mean=('OrderType', 'mean'), # Средний тип заказа (предполагается, что 1=доставка, 0=самовывоз, если такие значения есть)
    promo_used_mean=('apply_promo', 'mean') # Средняя частота использования промо
).reset_index()

In [11]:
# Обработка данных train и test
train['n_promos'] = train.groupby('ClientUUId')['LocalEndDate'].transform('count')
test['n_promos'] = train.groupby('ClientUUId')['LocalEndDate'].transform('count')
for col in ['OrderPrice', 'Discount']:
    train[f'avg_{col}'] = train.groupby('ClientUUId')[col].transform('mean')
    test[f'avg_{col}'] = test.groupby('ClientUUId')[col].transform('mean')

In [12]:
# Объединение всех признаков в один датафрейм
dates = pd.read_csv('data/date_features.csv')
df = train.merge(orders_features, on='ClientUUId', how='left')\
          .merge(mobile, on='ClientUUId', how='left')\
          .merge(dates, on='ClientUUId', how='left')
df_test = test.merge(orders_features, on='ClientUUId', how='left')\
              .merge(mobile, on='ClientUUId', how='left')\
              .merge(dates, on='ClientUUId', how='left')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27741 entries, 0 to 27740
Data columns (total 37 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ClientUUId                         27741 non-null  object 
 1   Id                                 27741 non-null  int64  
 2   OrderType                          27741 non-null  object 
 3   LocalBeginDate                     27741 non-null  object 
 4   LocalEndDate                       27741 non-null  object 
 5   OrderPrice                         27741 non-null  int64  
 6   Discount                           27741 non-null  int64  
 7   apply_promo                        27741 non-null  int64  
 8   n_promos                           27741 non-null  int64  
 9   avg_OrderPrice                     27741 non-null  float64
 10  avg_Discount                       27741 non-null  float64
 11  avg_discont                        15020 non-null  flo

In [14]:
pd.set_option('display.max_columns', None)

In [15]:
df.head()

Unnamed: 0,ClientUUId,Id,OrderType,LocalBeginDate,LocalEndDate,OrderPrice,Discount,apply_promo,n_promos,avg_OrderPrice,avg_Discount,avg_discont,sum_discont,in_delivery_mean,promo_used_mean,VisitToken,Timestamp,_add_to_cart,_apply_personal_offer,_close_app,_create_order,_open_app,_open_bonusaction,_open_product_card,_remove_from_cart,_screen_cart,_screen_menu,_screen_profile,_android,_ios,num_orders_last_month,mean_days_between_orders,days_since_last_order,days_between_first_and_last_order,num_orders_workdays,days_since_last_promo,promo_used_last_month
0,000D3A20F23EA95811E7B99F3ED09FC8,7,23,2023-11-02T00:00:00.000Z,2023-11-05T23:59:00.000Z,699,200,0,2,974.0,112.5,91.545455,3021.0,1.787879,0.0,70F9C9A4-52F1-4F04-9B77-FBBA375E5BF4DC868203-0...,2023-10-16T17:16:36.127Z2023-10-22T13:40:04.92...,11.0,0.0,43.0,4.0,43.0,0.0,22.0,2.0,8.0,54.0,0.0,0.0,187.0,11.0,2.0625,1.0,66.0,31.0,1000.0,0.0
1,000D3A20F23EA95811E7B99F3ED09FC8,6,123,2023-11-02T00:00:00.000Z,2023-11-05T23:59:00.000Z,1249,25,0,2,974.0,112.5,91.545455,3021.0,1.787879,0.0,70F9C9A4-52F1-4F04-9B77-FBBA375E5BF4DC868203-0...,2023-10-16T17:16:36.127Z2023-10-22T13:40:04.92...,11.0,0.0,43.0,4.0,43.0,0.0,22.0,2.0,8.0,54.0,0.0,0.0,187.0,11.0,2.0625,1.0,66.0,31.0,1000.0,0.0
2,000D3A20F23EA95811E7BD373E79565E,5,23,2023-11-02T00:00:00.000Z,2023-11-05T23:59:00.000Z,799,200,0,2,1024.0,225.0,,,,,4b2a8dbb-f274-4c7d-8a3c-f659367a376eddf56e4e-b...,2023-10-08T13:09:03.8650000Z2023-10-13T15:21:2...,5.0,0.0,16.0,2.0,15.0,0.0,6.0,0.0,11.0,12.0,30.0,97.0,0.0,,,,,,,
3,000D3A20F23EA95811E7BD373E79565E,5,123,2023-11-02T00:00:00.000Z,2023-11-05T23:59:00.000Z,1249,250,0,2,1024.0,225.0,,,,,4b2a8dbb-f274-4c7d-8a3c-f659367a376eddf56e4e-b...,2023-10-08T13:09:03.8650000Z2023-10-13T15:21:2...,5.0,0.0,16.0,2.0,15.0,0.0,6.0,0.0,11.0,12.0,30.0,97.0,0.0,,,,,,,
4,000D3A20F23EA95811E7BDAE71CD3F33,6,23,2023-11-02T00:00:00.000Z,2023-11-05T23:59:00.000Z,699,30,0,2,974.0,140.0,,,,,4c651a9e-dc31-4b33-82d3-67af9132e69a4c651a9e-d...,2023-10-13T16:35:49.3960000Z2023-10-13T16:34:5...,9.0,0.0,19.0,3.0,21.0,0.0,12.0,1.0,11.0,16.0,11.0,103.0,0.0,,,,,,,


In [21]:
columns_to_drop = ['LocalBeginDate', 'LocalEndDate', 'ClientUUId', 'OrderType', 'VisitToken', 'Timestamp']

# Удаляем только те колонки, которые присутствуют в DataFrame
df = df.fillna(0).drop([col for col in columns_to_drop if col in df.columns], axis=1)
df_test = df_test.fillna(0).drop([col for col in columns_to_drop if col in df_test.columns], axis=1)

# Разделение данных на X и y
X = df.drop('apply_promo', axis=1)
y = df['apply_promo']

In [22]:
# Определение категорий для каждой модели
lin_cols = ['days_between_first_and_last_order', 'num_orders_workdays', 'promo_used_last_month']
xg_cols = ['promo_used_last_month']

In [23]:
date_columns = [col for col in df_test.columns if df_test[col].dtype == 'object' and df_test[col].str.contains(r'\d{4}-\d{2}-\d{2}T').any()]
df_test = df_test.drop(columns=date_columns)
df = df.drop(columns=date_columns)

In [24]:
# Линейная модель
kf = StratifiedKFold(n_splits=6, shuffle=True, random_state=RANDOM_STATE)
lin_models = []
test_scores, train_scores = [], []

for train_index, test_index in kf.split(X=X.drop(lin_cols, axis=1), y=y):
    X_train, X_test = X.iloc[train_index].drop(lin_cols, axis=1), X.iloc[test_index].drop(lin_cols, axis=1)
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lin_reg = LinearRegression().fit(X_train, y_train)
    train_scores.append(roc_auc_score(y_train, lin_reg.predict(X_train)))
    test_scores.append(roc_auc_score(y_test, lin_reg.predict(X_test)))
    lin_models.append(lin_reg)

print(f"Linear model mean score: {np.mean(test_scores):.4f} ± {np.std(test_scores):.4f}")

Linear model mean score: 0.7717 ± 0.0245


In [25]:
lin_preds = sum([model.predict(df_test.drop(lin_cols, axis=1)) for model in lin_models]) / len(lin_models)

In [26]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
xg_models = []
xg_test_scores, xg_train_scores = [], []

for train_index, test_index in kf.split(X=X.drop(xg_cols, axis=1), y=y):
    # Разделение данных на тренировочные и тестовые
    X_train, X_test = X.iloc[train_index].drop(xg_cols, axis=1), X.iloc[test_index].drop(xg_cols, axis=1)
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Инициализация модели XGBoost
    xgb = XGBClassifier(eval_metric="auc", n_estimators=1000, max_depth=3, subsample=0.9, random_state=RANDOM_STATE, enable_categorical=True)

    xgb.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=0
    )

    # Сохранение модели и оценка
    xg_models.append(xgb)
    xg_train_scores.append(roc_auc_score(y_train, xgb.predict_proba(X_train)[:, 1]))
    xg_test_scores.append(roc_auc_score(y_test, xgb.predict_proba(X_test)[:, 1]))

# Вывод среднего значения AUC
print(f"XGBoost mean score: {np.mean(xg_test_scores):.4f} ± {np.std(xg_test_scores):.4f}")

XGBoost mean score: 0.8638 ± 0.0209


In [27]:
xg_preds = sum([model.predict_proba(df_test.drop(xg_cols, axis=1))[:, 1] for model in xg_models]) / len(xg_models)

In [29]:
# CatBoost модель
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cat_models = []
cat_test_scores, cat_train_scores = [], []

for train_index, test_index in kf.split(X=X, y=y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = CatBoostClassifier(iterations=2000, depth=2, colsample_bylevel=0.5, eval_metric='AUC', random_seed=42)
    clf.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=0, early_stopping_rounds=50)
    cat_models.append(clf)
    cat_train_scores.append(roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1]))
    cat_test_scores.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

print(f"CatBoost mean score: {np.mean(cat_test_scores):.4f} ± {np.std(cat_test_scores):.4f}")

CatBoost mean score: 0.8090 ± 0.0236


In [30]:
cat_preds = sum([model.predict_proba(df_test)[:, 1] for model in cat_models]) / len(cat_models)


In [31]:
final_preds = 1.2 * lin_preds + 1.8 * xg_preds + cat_preds
submit['target'] = final_preds
submit.to_csv("final_submission.csv", index=False)