In [1]:
# *A "distinct item" is counted once per order, regardless of how many units of that item the order contains

<h4> Prices and Counters </h4>
<ul>
    <li><i>successful_orders_count</i> - total number of delivered orders placed by the client </li>
    <li><i>avg_unique_items_per_order</i> - average number of distinct items purchased in one order </li>
    <li><i>all_unique_items_purchased</i> - total number of distinct items that have ever been purchased across all orders made </li>
    <li><i>avg_items_per_order</i> - number of items purchased on average in one order </li>
    <li><i>all_items_purchased</i> - total number of items ever purchased by the client across all orders made </li>
    <li><i>avg_item_price</i> - average price of one good in one order </li>
    <li><i>avg_unique_items_price</i> - average price of one distinct good in one order </li>
    <li><i>all_items_price</i> - total price of all items ever purchased by the client </li>
    <li><i>avg_summed_cost_of_items_per_order</i> - average price of all items summed in one order </li>
    <li><i>total_shipping_charges</i> - total cost of all shipping costs ever charged </li>
    <li><i>avg_shipping_charges</i> - average cost of shipping charges per order </li>
    <li><i>total_service_charges</i> - total cost of all service costs ever charged </li>
    <li><i>avg_service_charges</i> - average cost of service charges per order </li>
    <li><i>total_payment</i> - total value paid by the client </li>
    <li><i>avg_payment</i> - average value paid by the client per order </li>
    <li><i>total_profit</i> - total profit for one client </li>
    <li><i>avg_order_profit</i> - average profit per order </li>
    <li><i>avg_item_profit</i> - average profit per item </li><br>
</ul>
<h4>Share of items in a certain category</h4>
<ul>
    <li><i>related products</i></li>
    <li><i>baby food</i></li>
    <li><i>office goods</i></li>
    <li><i>books, disks</i></li>
    <li><i>breastfeeding products</i></li>
    <li><i>textile, knitwear</i></li>
    <li><i>footwear</i></li>
    <li><i>oversized goods</i></li>
    <li><i>toys</i></li>
    <li><i>women things</i></li>
    <li><i>goods for pets</i></li>
    <li><i>cosmetics and hygiene</i></li>
    <li><i>diapers</i></li>
</ul>
<h4>Share of different shipping methods used</h4>
<ul>
    <li><i>pickup</i></li>
    <li><i>express delivery</i></li>
    <li><i>transport company</i></li>
    <li><i>DPD</i></li>
    <li><i>pick point</i></li>
    <li><i>store</i></li>
</ul>
<h4>Share of item size category</h4>
<ul>
    <li><i>Oversized</i></li>
    <li><i>Compact</i></li>
</ul>
<h4>Share of payment method used</h4>
<ul>
    <li><i>cash payment</i></li>
    <li><i>cashless payment</i></li>
</ul>
<h4>Canceled and delivered orders</h4>
<ul>
    <li><i>number_canceled</i> - total number of canceled items (order positions) per client</li>
    <li><i>avg_number_canceled</i> - average share of canceled items in one order, per client</li>
    <li><i>total_delivered</i> - total number of orders with at least one delivered item, per client </li>
    <li><i>avg_delivered</i> - share of the client's orders that have at least one delivered item </li>
</ul>

In [3]:
import pandas as pd
import numpy as np
from scipy import stats

In [4]:
# Load the cleaned order table. Later cells group its rows by client phone and order
# number, so each row is presumably one order position (item line) — TODO confirm.
data = pd.read_parquet('./data/clean/all_positions.parquet')

In [5]:
# List the available columns (the column names are in Russian).
data.columns

Index(['Дата', 'ДатаДоставки', 'НомерЗаказаНаСайте', 'НовыйСтатус',
       'СуммаЗаказаНаСайте', 'СуммаДокумента', 'МетодДоставки', 'ФормаОплаты',
       'Регион', 'Группа2', 'Группа3', 'Группа4', 'Тип', 'Номенклатура',
       'ТипТовара', 'Отменено', 'ПричинаОтмены', 'Количество', 'Цена',
       'СуммаСтроки', 'ЦенаЗакупки', 'МесяцДатыЗаказа', 'ГодДатыЗаказа',
       'ПВЗ_код', 'Статус', 'Гео', 'Маржа', 'СуммаУслуг', 'СуммаДоставки',
       'НомерСтроки', 'КоличествоПроданоКлиенту', 'ДатаЗаказаНаСайте',
       'Телефон_new', 'ЭлектроннаяПочта_new', 'Клиент', 'ID_SKU',
       'ГородМагазина', 'МагазинЗаказа'],
      dtype='object')

<h3>Categories counter</h3>

In [6]:
# Boolean row mask used by every "successful purchase" aggregation below:
# the row's new status is 'Доставлен' (delivered) and the row is not canceled.
is_delivered_row = data['НовыйСтатус'].eq('Доставлен')
is_not_canceled_row = data['Отменено'].eq('Нет')
index_sort = is_delivered_row & is_not_canceled_row

In [7]:
# sum the number of the items within a group (item wise)
def sum_f(x, tag):
    """Count how many elements of ``x`` are equal to ``tag``.

    Parameters
    ----------
    x : array-like
        Values of one group (e.g. the Series pandas passes to an aggregation).
        It is flattened before counting.
    tag : object
        Value to count. ``None`` means "missing value": both ``None`` and
        NaN/NaT entries are counted, because ``== None`` alone would silently
        skip missing values that are stored as NaN.

    Returns
    -------
    numpy integer
        Number of matching elements.
    """
    values = np.asarray(x).reshape(-1)
    if tag is None:
        # pd.isna treats None, NaN and NaT uniformly as missing
        return pd.isna(values).sum()
    return (values == tag).sum()

# set 1 if the type exists, 0 otherwise (order wise)
def max_f(x, tag):
    """Return 1 if at least one element of ``x`` equals ``tag``, otherwise 0.

    Parameters
    ----------
    x : array-like
        Values of one group (e.g. the rows of a single order). It is flattened
        before the check.
    tag : object
        Value to look for.

    Returns
    -------
    int
        1 when ``tag`` occurs in ``x``, 0 when it does not. An empty ``x``
        yields 0 (taking ``.max()`` of an empty array would raise instead).
    """
    values = np.asarray(x).reshape(-1)
    return int(np.any(values == tag))

In [8]:
%%time
# Output column name -> value of 'Группа2' counted under that name.
# The order of this mapping defines the column order of `items_groups`.
item_group_tags = {
    'related products': 'СОПУТСТВУЮЩИЕ ТОВАРЫ',
    'baby food': 'ДЕТСКОЕ ПИТАНИЕ',
    'office goods': 'КАНЦТОВАРЫ, КНИГИ, ДИСКИ',
    # Bug fix: this column used to count 'ДЕТСКОЕ ПИТАНИЕ' (copy-paste of the
    # 'baby food' line), so it duplicated 'baby food' instead of describing books/disks.
    # NOTE(review): books and disks appear to belong to the same source group as
    # office goods, so this column now equals 'office goods'. It is kept only to
    # preserve the existing column set — consider dropping one of the two.
    'books, disks': 'КАНЦТОВАРЫ, КНИГИ, ДИСКИ',
    'breastfeeding products': 'ТОВАРЫ ДЛЯ КОРМЛЕНИЯ',
    'textile, knitwear': 'ТЕКСТИЛЬ, ТРИКОТАЖ',
    'footwear': 'ОБУВЬ',
    'oversized goods': 'КРУПНОГАБАРИТНЫЙ ТОВАР',
    'toys': 'ИГРУШКИ',
    'women things': 'ЖЕНСКИЕ ШТУЧКИ',
    'goods for pets': 'ТОВАРЫ ДЛЯ ЖИВОТНЫХ',
    'cosmetics and hygiene': 'КОСМЕТИКА/ГИГИЕНА',
    'diapers': 'ПОДГУЗНИКИ',
    # rows whose group is missing
    'none': None,
}

# Step 1: for every (client phone, order) count the delivered, non-canceled rows per group.
items_groups = data[index_sort].groupby(['Телефон_new', 'НомерЗаказаНаСайте']).agg({
    'Группа2': [
        # `tag=tag` freezes the current tag inside each lambda
        (name, lambda x, tag=tag: sum_f(x, tag))
        for name, tag in item_group_tags.items()
    ]
})

# Step 2: keep only the readable column names and add the orders up per client phone.
items_groups.columns = items_groups.columns.get_level_values(1)
items_groups = items_groups.groupby(level=0).agg('sum')

CPU times: user 39.1 s, sys: 476 ms, total: 39.6 s
Wall time: 40 s


In [9]:
# Sanity check: one row per client phone, one column per category counter.
items_groups.shape

(67863, 14)

<h3>Delivery method</h3>

In [10]:
# Distinct delivery methods in the full table (all rows, not only delivered ones).
list(set(data['МетодДоставки'].values))

['DPD',
 'Курьерская',
 'Pick point',
 'Самовывоз',
 'Транспортная компания',
 'Магазины']

In [11]:
%%time
# (output column name, raw 'МетодДоставки' value) pairs, in output column order.
shipping_method_tags = [
    ('pickup', 'Самовывоз'),
    ('express delivery', 'Курьерская'),
    ('transport company', 'Транспортная компания'),
    ('DPD', 'DPD'),
    ('pick point', 'Pick point'),
    ('store', 'Магазины'),
]

# Per (client phone, order): 1 if any row of the order used the method, else 0.
per_order_shipping = (
    data[index_sort]
    .groupby(['Телефон_new', 'НомерЗаказаНаСайте'])
    .agg({
        'МетодДоставки': [
            (label, lambda x, tag=tag: max_f(x, tag))
            for label, tag in shipping_method_tags
        ]
    })
)
per_order_shipping.columns = per_order_shipping.columns.get_level_values(1)

# Per client phone: number of orders that used each method.
shipping_groups = per_order_shipping.groupby(level=0).agg('sum')

CPU times: user 16 s, sys: 363 ms, total: 16.4 s
Wall time: 16.3 s


In [12]:
# Sanity check: one row per client phone, one column per shipping method.
shipping_groups.shape

(67863, 6)

In [13]:
# Preview the per-client counts of orders by shipping method.
shipping_groups.head(3)

Unnamed: 0_level_0,pickup,express delivery,transport company,DPD,pick point,store
Телефон_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
49574954-56524849545119,0,0,0,0,0,1
55525753-54505353554970,0,0,0,0,0,1
55525757-49565652494875,0,0,0,0,0,1


In [14]:
# Column totals over all clients: only 'store' is non-zero, i.e. every delivered,
# non-canceled order in this dataset went through a store.
shipping_groups.sum(axis=0) # no shipping method other than 'store' occurs

pickup                   0
express delivery         0
transport company        0
DPD                      0
pick point               0
store                95688
dtype: int64

<h3>Oversized / Undersized stats</h3>

In [15]:
# Distinct item size types in the full table; None marks rows without a type.
list(set(data['ТипТовара']))

['МГТ', 'КГТ', None]

In [16]:
%%time
# Output column name -> raw 'ТипТовара' value counted under it (None = type missing).
item_type_tags = {
    'Oversized': 'КГТ',
    'Compact': 'МГТ',
    'Unknown': None,
}

# Per (client phone, order): number of delivered, non-canceled rows of each size type.
per_order_item_types = (
    data[index_sort]
    .groupby(['Телефон_new', 'НомерЗаказаНаСайте'])
    .agg({
        'ТипТовара': [
            (label, lambda x, tag=tag: sum_f(x, tag))
            for label, tag in item_type_tags.items()
        ]
    })
)
per_order_item_types.columns = per_order_item_types.columns.get_level_values(1)

# Per client phone: totals over all of the client's orders.
item_type_stats = per_order_item_types.groupby(level=0).agg('sum')

CPU times: user 7.57 s, sys: 88.2 ms, total: 7.65 s
Wall time: 7.66 s


In [17]:
# Sanity check: one row per client phone, one column per size type.
item_type_stats.shape

(67863, 3)

In [18]:
# Preview the per-client row counts by item size type.
item_type_stats.head(3)

Unnamed: 0_level_0,Oversized,Compact,Unknown
Телефон_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
49574954-56524849545119,0,1,1
55525753-54505353554970,1,0,0
55525757-49565652494875,0,1,1


<h3>Payment methods</h3>

In [19]:
# Distinct payment methods in the full table.
list(set(data['ФормаОплаты']))

['Безналичная', 'Наличная']

In [20]:
%%time
# (output column name, raw 'ФормаОплаты' value) pairs, in output column order.
payment_method_tags = [
    ('cash payment', 'Наличная'),
    ('cashless payment', 'Безналичная'),
]

# Per (client phone, order): 1 if any row of the order has the payment method, else 0.
per_order_payment = (
    data[index_sort]
    .groupby(['Телефон_new', 'НомерЗаказаНаСайте'])
    .agg({
        'ФормаОплаты': [
            (label, lambda x, tag=tag: max_f(x, tag))
            for label, tag in payment_method_tags
        ]
    })
)
per_order_payment.columns = per_order_payment.columns.get_level_values(1)

# Per client phone: number of orders paid with each method.
payment_method_stats = per_order_payment.groupby(level=0).agg('sum')

CPU times: user 5.46 s, sys: 282 ms, total: 5.74 s
Wall time: 5.57 s


In [21]:
# Sanity check: one row per client phone, one column per payment method.
payment_method_stats.shape

(67863, 2)

<h3>Region</h3>

In [22]:
# Distinct regions in the full table (kept in `regions` and displayed).
regions = list(set(data['Регион']))
regions

['Люберцы (Люберецкий район)',
 'Самара',
 'Екатеринбург',
 'Ростов-на-Дону',
 'Пермь',
 'Санкт-Петербург',
 'Краснодар',
 'Москва',
 'Нижний Новгород',
 'Воронеж']

In [23]:
%%time
# Regions to flag; each output column is named after the raw 'Регион' value it matches.
# The list order is the output column order.
region_names = [
    'Краснодар',
    'Ростов-на-Дону',
    'Нижний Новгород',
    'Воронеж',
    'Люберцы (Люберецкий район)',
    'Санкт-Петербург',
    'Екатеринбург',
    'Москва',
    'Пермь',
    'Самара',
]

# Per (client phone, order): 1 if any row of the order belongs to the region, else 0.
per_order_regions = (
    data[index_sort]
    .groupby(['Телефон_new', 'НомерЗаказаНаСайте'])
    .agg({
        'Регион': [
            (region, lambda x, tag=region: max_f(x, tag))
            for region in region_names
        ]
    })
)
per_order_regions.columns = per_order_regions.columns.get_level_values(1)

# Per client phone: number of orders made in each region.
regions_stats = per_order_regions.groupby(level=0).agg('sum')

CPU times: user 26.5 s, sys: 665 ms, total: 27.2 s
Wall time: 27 s


In [24]:
# Sanity check: one row per client phone, one column per region.
regions_stats.shape

(67863, 10)

In [25]:
# Preview the per-client counts of orders by region.
regions_stats.head(3)

Unnamed: 0_level_0,Краснодар,Ростов-на-Дону,Нижний Новгород,Воронеж,Люберцы (Люберецкий район),Санкт-Петербург,Екатеринбург,Москва,Пермь,Самара
Телефон_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
49574954-56524849545119,0,0,0,0,0,0,0,1,0,0
55525753-54505353554970,0,0,0,0,0,0,0,1,0,0
55525757-49565652494875,0,0,0,0,0,0,0,1,0,0


<h3>Inside order aggregation</h3>

In [26]:
# Order-level statistics, computed only on delivered, non-canceled rows.
# Each entry is: output column -> (source column, aggregation within one order).
order_level_spec = {
    # number of rows in the order with a non-null quantity
    'unique_items_count': ('Количество', 'count'),
    # total quantity over all rows of the order
    'all_items_count': ('Количество', 'sum'),
    # mean of the row prices in the order
    'avg_item_price': ('Цена', 'mean'),
    # mean of the row totals in the order
    # NOTE(review): presumably 'СуммаСтроки' is price x quantity for the row — confirm
    'avg_unique_items_price': ('СуммаСтроки', 'mean'),
    # sum of the row prices in the order
    # NOTE(review): 'Цена' is summed without the quantity; if it is a unit price,
    # rows with quantity > 1 are under-counted — confirm 'СуммаСтроки' was not intended
    'all_items_price': ('Цена', 'sum'),
    # shipping charge; the mean presumably recovers an order-level value that is
    # repeated on every row — verify (same assumption for the next two entries)
    'shipping_charge': ('СуммаДоставки', 'mean'),
    # additional service charges of the order
    'all_additional_charges': ('СуммаУслуг', 'mean'),
    # order amount on the site
    'final_order_price': ('СуммаЗаказаНаСайте', 'mean'),
    # total margin of the order
    'total_margin': ('Маржа', 'sum'),
    # mean margin per row of the order
    'avg_item_margin': ('Маржа', 'mean'),
}

general_stats = (
    data[index_sort]
    .groupby(['Телефон_new', 'НомерЗаказаНаСайте'])
    .agg(**order_level_spec)
)

<h3>Phone number aggregation</h3>

In [27]:
# Client-level statistics: roll the order-level table up to one row per phone.
# Only delivered orders with non-canceled items are present in `general_stats`.
# Each entry is: output column -> (order-level column, aggregation over the client's orders).
client_level_spec = {
    # --- counters ---
    # number of successful orders
    'successful_orders_count': ('all_items_count', 'count'),
    # average number of distinct items per order
    'avg_unique_items_per_order': ('unique_items_count', 'mean'),
    # total number of distinct items over all orders
    'all_unique_items_purchased': ('unique_items_count', 'sum'),
    # average quantity of items per order
    'avg_items_per_order': ('all_items_count', 'mean'),
    # total quantity of items over all orders
    'all_items_purchased': ('all_items_count', 'sum'),

    # --- item prices ---
    # mean of the order-level average prices (orders are weighted equally)
    'avg_item_price': ('avg_item_price', 'mean'),
    'avg_unique_items_price': ('avg_unique_items_price', 'mean'),
    # summed item prices over all orders
    'all_items_price': ('all_items_price', 'sum'),
    # average summed item price per order
    'avg_summed_cost_of_items_per_order': ('all_items_price', 'mean'),

    # --- shipping and service charges ---
    'total_shipping_charges': ('shipping_charge', 'sum'),
    'avg_shipping_charges': ('shipping_charge', 'mean'),
    'total_service_charges': ('all_additional_charges', 'sum'),
    'avg_service_charges': ('all_additional_charges', 'mean'),

    # --- payments ---
    'total_payment': ('final_order_price', 'sum'),
    'avg_payment': ('final_order_price', 'mean'),

    # --- profit ---
    'total_profit': ('total_margin', 'sum'),
    # average profit per order
    'avg_order_profit': ('total_margin', 'mean'),
    # mean of the order-level average item margins
    'avg_item_profit': ('avg_item_margin', 'mean'),
}

users_stats = general_stats.groupby(level=0).agg(**client_level_spec)

In [28]:
# Sanity check: one row per client phone, one column per client-level statistic.
users_stats.shape

(67863, 18)

In [29]:
# Preview the client-level statistics.
users_stats.head(3)

Unnamed: 0_level_0,successful_orders_count,avg_unique_items_per_order,all_unique_items_purchased,avg_items_per_order,all_items_purchased,avg_item_price,avg_unique_items_price,all_items_price,avg_summed_cost_of_items_per_order,total_shipping_charges,avg_shipping_charges,total_service_charges,avg_service_charges,total_payment,avg_payment,total_profit,avg_order_profit,avg_item_profit
Телефон_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
49574954-56524849545119,1,2.0,2,2.0,2.0,2607.5,2607.5,5215.0,5215.0,49.0,49.0,49.0,49.0,5215.0,5215.0,-704.86,-704.86,-704.86
55525753-54505353554970,1,1.0,1,1.0,1.0,493.0,493.0,493.0,493.0,0.0,0.0,0.0,0.0,493.0,493.0,306.0,306.0,306.0
55525757-49565652494875,1,2.0,2,2.0,2.0,181.0,181.0,362.0,362.0,49.0,49.0,49.0,49.0,362.0,362.0,20.88,20.88,20.88


<h3>Cancellations stats</h3>

In [30]:
# Cancellation / delivery statistics, computed on ALL rows (no delivered-only filter),
# so clients without any successful order are included as well.
# NOTE(review): delivery is read from 'Статус' here, while the delivered-rows mask
# defined earlier uses 'НовыйСтатус' — confirm the two columns agree.
per_order_cancellations = data.groupby(['Телефон_new', 'НомерЗаказаНаСайте']).agg({
    'Отменено': [
        # number of canceled rows (items) in the order
        ('number_canceled', lambda x: sum_f(x, 'Да')),
        # share of the order's rows that are canceled
        ('avg_number_canceled', lambda x: np.mean(np.array(x) == 'Да'))
    ],
    'Статус': [
        # 1 if at least one row of the order has status 'Доставлен'
        ('is_delivered', lambda x: max_f(x, 'Доставлен'))
    ]
})
per_order_cancellations.columns = per_order_cancellations.columns.get_level_values(1)

# Roll the order-level flags up to one row per client phone.
cancellation_spec = {
    # total number of canceled items
    'number_canceled': ('number_canceled', 'sum'),
    # average share of canceled items per order
    'avg_number_canceled': ('avg_number_canceled', 'mean'),
    # number of orders with at least one delivered item
    'total_delivered': ('is_delivered', 'sum'),
    # share of delivered orders out of all orders made
    'avg_delivered': ('is_delivered', 'mean'),
}
cancellations_stats = per_order_cancellations.groupby(level=0).agg(**cancellation_spec)

In [31]:
# This frame is built from all rows, so it can have more rows than the
# delivered-only per-client frames above.
cancellations_stats.shape

(130752, 4)

<h3>Put it all together</h3>

In [32]:
# valid phone numbers
valid_phones = list(set(data[index_sort]['Телефон_new']))
cancellations_stats_valid = cancellations_stats.loc[valid_phones]

In [33]:
# Turn the per-client counters into shares.
# The denominators are passed as Series (not `.values`), so pandas matches rows by
# client phone. Dividing by a bare array is purely positional and would silently
# assign the wrong denominator if the row order of the two frames ever differed.
positions_per_client = users_stats['all_unique_items_purchased']
orders_per_client = users_stats['successful_orders_count']

# item-wise counters are divided by the number of purchased positions
items_groups_share = items_groups.div(positions_per_client, axis='rows')
item_type_stats_share = item_type_stats.div(positions_per_client, axis='rows')

# order-wise counters are divided by the number of successful orders
shipping_groups_share = shipping_groups.div(orders_per_client, axis='rows')
payment_method_stats_share = payment_method_stats.div(orders_per_client, axis='rows')

In [34]:
# All per-client frames must have the same number of rows before they are joined.
row_counts = {
    frame.shape[0]
    for frame in (
        users_stats,
        items_groups_share,
        shipping_groups_share,
        item_type_stats_share,
        payment_method_stats_share,
        cancellations_stats_valid,
    )
}
assert len(row_counts) == 1

In [35]:
# Assemble the client-level data mart: join the per-client blocks on the phone index.
# The shipping and payment shares are left out, as in the original cell (the shipping
# totals shown earlier are non-zero only for 'store', so those shares carry no
# information; the reason for excluding payment shares is not visible here — confirm).
data_mart = (
    users_stats
    .join(items_groups_share)
    # .join(shipping_groups_share)
    .join(item_type_stats_share)
    # .join(payment_method_stats_share)
    .join(cancellations_stats_valid)
)

In [36]:
# Name the index 'phone' and drop the first row by position.
# NOTE(review): the first row is presumably a placeholder / invalid phone key — confirm.
# The drop is positional, so re-running this cell removes one more row each time;
# run it exactly once after the join cell.
data_mart = data_mart.rename_axis('phone').iloc[1:]

In [37]:
# Keep only the clients whose row has no missing value in any column.
data_mart = data_mart.dropna()

<h3>Save</h3>

In [38]:
from pathlib import Path

In [39]:
# Create the output directory (and any missing parents); do nothing if it already exists.
Path("./data/mart").mkdir(parents=True, exist_ok=True)

In [40]:
# Persist the client-level data mart; the 'phone' index is the row key.
data_mart.to_parquet('./data/mart/data_mart.parquet')