In [44]:
import pandas as pd
import numpy as np
import datetime
import glob
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [45]:
# Load the raw order-line table. The file is stored in Windows-1251
# (the DeliveryType column holds Cyrillic text), so the encoding is explicit.
df = pd.read_csv(
    './data/test.csv',
    encoding='windows-1251',
)

In [46]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,Interval,Date,OrderDate,ClientID,ChannelID,OrderID,MaterialID,GroupID,Cluster,OrderCnt,DeliveryType,prepay,count_edit
0,21-22.,04/01/2019,04/01/2019,93307117,2,90102211131,3056480.0,15.0,,1.0,Доставка День в День,0,1
1,21-22.,04/01/2019,04/01/2019,93307117,2,90102211131,2014233.0,12.0,,2.0,Доставка День в День,0,1
2,21-22.,04/01/2019,04/01/2019,93307117,2,90102211131,3226516.0,35.0,,1.0,Доставка День в День,0,1
3,22-0.,05/01/2019,04/01/2019,91590087,2,90102211133,2013713.0,36.0,,1.0,Обычная доставка,0,1
4,22-0.,05/01/2019,04/01/2019,91590087,2,90102211133,3255780.0,24.0,,1.0,Обычная доставка,0,1


In [47]:
# A GroupID / MaterialID must occur in MORE than this many rows to keep its
# own category; every rarer (or missing) id is pooled into the catch-all -1.
MIN_CATEGORY_ROWS = 10000

# Replace missing Cluster values with the string placeholder '-1'.
df["Cluster"] = df["Cluster"].fillna('-1')

# NOTE(review): the frequent-id lists are derived from this file alone. If a
# training file is preprocessed separately, the surviving ids (and therefore
# the dummy columns built later) may not match - confirm against that pipeline.

# Count each id once instead of calling value_counts() twice per column.
group_freq = df['GroupID'].value_counts()
groups_counts = list(group_freq[group_freq > MIN_CATEGORY_ROWS].keys())
# Vectorised membership test replaces a Python-level list lookup per row.
# value_counts() skips NaN, so missing ids fail isin() and also become -1.
df['GroupID'] = df['GroupID'].where(df['GroupID'].isin(groups_counts), -1)

material_freq = df['MaterialID'].value_counts()
materials_counts = list(material_freq[material_freq > MIN_CATEGORY_ROWS].keys())
df['MaterialID'] = df['MaterialID'].where(df['MaterialID'].isin(materials_counts), -1)

In [48]:
# Columns used as the composite grouping key when the item rows are
# collapsed into one row per key combination.
cols = [
    'OrderID',
    'Interval',
    'Date',
    'OrderDate',
    'ClientID',
    'ChannelID',
    'Cluster',
    'DeliveryType',
    'prepay',
    'count_edit',
]

In [49]:
# Collapse the item rows into one row per key combination in `cols`, keeping
# the summed OrderCnt. Only OrderCnt is aggregated: the original summed every
# non-key column (including the MaterialID / GroupID ids) and then threw those
# sums away. `as_index=False` returns the key columns, in `cols` order,
# followed by OrderCnt - the same layout the explicit column list produced.
df_ = df.groupby(cols, as_index=False)['OrderCnt'].sum()

In [50]:
# OrderIDs that are present in the aggregated frame; used below to restrict
# the item rows that get one-hot encoded.
orders = df_.loc[:, 'OrderID']

In [9]:
# One-hot encode the (already bucketed) MaterialID and GroupID of every item
# row whose OrderID appears in `orders`; OrderID is kept as a plain column so
# the indicators can be summed per order afterwards.
df_dummies = pd.get_dummies(
    df.loc[df['OrderID'].isin(orders), ['OrderID', 'MaterialID', 'GroupID']],
    columns=['MaterialID', 'GroupID'],
)

In [35]:
# Number of leading rows that seed the first aggregation chunk.
CHUNK_ROWS = 2500005

# Partition the rows by ORDER rather than by raw row position.
# The previous head(N) / tail(len - N) split had two problems:
#   * an order whose rows sit on both sides of row N was summed into two
#     different partial rows, one per chunk;
#   * when the frame has fewer than N rows, tail() receives a negative count
#     and returns "everything except the first |n| rows", which overlaps the
#     head chunk.
# Here every order that shows up in the first CHUNK_ROWS rows goes into chunk 1
# with ALL of its rows, and the remaining orders go into chunk 2, so each
# OrderID is aggregated exactly once. iloc slicing past the end is safe, so a
# short frame simply yields an empty second chunk.
first_chunk_orders = df_dummies['OrderID'].iloc[:CHUNK_ROWS].unique()
in_first_chunk = df_dummies['OrderID'].isin(first_chunk_orders)

# Sum the indicator columns per order within each chunk.
df_group1 = df_dummies[in_first_chunk].groupby('OrderID').sum()
df_group2 = df_dummies[~in_first_chunk].groupby('OrderID').sum()

In [36]:
# Stack the two per-chunk aggregates and turn the OrderID index back into a
# column. DataFrame.append was removed in pandas 2.0; pd.concat with its
# default arguments produces the same stacked frame.
# drop_duplicates() only removes rows that are identical in every column.
df_group = pd.concat([df_group1, df_group2]).reset_index().drop_duplicates()

In [51]:
# Display the per-order sums of the MaterialID / GroupID indicator columns
# (pandas truncates the rendered view for large frames).
df_group

Unnamed: 0,OrderID,MaterialID_-1.0,MaterialID_2012529.0,MaterialID_2013197.0,MaterialID_2013264.0,MaterialID_2013421.0,MaterialID_2013982.0,MaterialID_2013983.0,MaterialID_3139357.0,MaterialID_3186418.0,...,GroupID_34.0,GroupID_35.0,GroupID_36.0,GroupID_41.0,GroupID_42.0,GroupID_55.0,GroupID_59.0,GroupID_61.0,GroupID_63.0,GroupID_66.0
0,90102211131,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,90102211133,22.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,10.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
2,90102216055,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,90102216081,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,90102216084,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202609,98359057260,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
202610,98359057293,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
202611,98359057392,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
202612,98359057487,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,5.0,0.0


In [53]:
# Attach the per-order indicator sums to the order-level frame. This is an
# inner join on OrderID, so only orders present in both frames are kept.
df_ = pd.merge(df_, df_group, how='inner', on='OrderID')

In [54]:
# Both date columns are day/month/year strings such as '04/01/2019'.
DATE_FORMAT = '%d/%m/%Y'

# Derive the weekday (Monday=0 ... Sunday=6) of the order date and of the
# delivery date. Parsing the whole column with pd.to_datetime is vectorised,
# unlike calling datetime.strptime once per row, and a missing date yields a
# missing weekday instead of raising inside apply().
for date_col in ('OrderDate', 'Date'):
    df_[date_col + '_weekday'] = pd.to_datetime(df_[date_col], format=DATE_FORMAT).dt.weekday

In [55]:
def to_labels(series):
    """Encode the values of ``series`` as integer labels.

    A new ``LabelEncoder`` is fitted on ``series`` itself and the result of
    its ``fit_transform`` is returned. The encoder object is not kept, so the
    value-to-label mapping cannot be reused on other data afterwards.
    """
    return LabelEncoder().fit_transform(series)

In [56]:
# Categorical columns that are replaced in place by their integer labels.
# NOTE(review): this rebinds `cols`, which earlier held the grouping keys;
# re-running the groupby cell after this one would group by these four
# columns instead.
cols = [
    'Interval',
    'ChannelID',
    'DeliveryType',
    'Cluster',
]

for col in cols:
    encoded = to_labels(df_[col])
    df_[col] = encoded

In [57]:
# Display the merged, label-encoded order-level frame
# (pandas truncates the rendered view for large frames).
df_

Unnamed: 0,OrderID,Interval,Date,OrderDate,ClientID,ChannelID,Cluster,DeliveryType,prepay,count_edit,...,GroupID_36.0,GroupID_41.0,GroupID_42.0,GroupID_55.0,GroupID_59.0,GroupID_61.0,GroupID_63.0,GroupID_66.0,OrderDate_weekday,Date_weekday
0,90102211131,17,04/01/2019,04/01/2019,93307117,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4
1,90102211133,19,05/01/2019,04/01/2019,91590087,0,0,1,0,1,...,10.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,4,5
2,90102216055,10,10/01/2019,09/01/2019,100000491,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,3
3,90102216081,23,11/01/2019,10/01/2019,100000491,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3,4
4,90102216084,2,12/01/2019,10/01/2019,91089531,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202609,98359057260,17,31/07/2019,31/07/2019,93252579,0,2,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2
202610,98359057293,17,31/07/2019,31/07/2019,93112429,0,2,0,0,1,...,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2,2
202611,98359057392,21,31/07/2019,31/07/2019,92037052,0,2,0,0,1,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2
202612,98359057487,17,31/07/2019,31/07/2019,92136607,0,2,0,1,1,...,0.0,1.0,0.0,0.0,0.0,1.0,5.0,0.0,2,2


In [58]:
# Drop the raw date strings and the client identifier from the output frame;
# the weekday columns derived from the dates are kept.
df_res = df_.drop(columns=['Date', 'OrderDate', 'ClientID'])

In [59]:
# Persist the preprocessed frame without the positional index column.
df_res.to_csv(
    './data/test_pre.csv',
    index=False,
)