In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np

from itertools import combinations
from collections import Counter

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
file_path = './data/hakaton.csv'

In [3]:
# Load the raw data. Later cells use `df` directly, so leaving this read
# commented out makes the notebook fail on a fresh kernel (Restart & Run All).
# The file is semicolon-separated.
df = pd.read_csv(file_path, sep=';')

In [4]:
# Month numbers (1-12) grouped by season.
seasons = {
    'winter': [1, 2, 12],
    'spring': list(range(3, 6)),
    'summer': list(range(6, 9)),
    'autumn': list(range(9, 12)),
}

# Hour-of-day buckets keyed by category number.
# Hours 2-5 are intentionally or accidentally absent: they belong to no bucket.
hours = {
    1: list(range(6, 11)),
    2: list(range(11, 16)),
    3: list(range(16, 21)),
    4: [21, 22, 23, 0, 1],
}

In [5]:
def preprocessing_df(df):
    """Add calendar-derived columns to ``df`` and return it.

    The frame is modified in place: ``create_datetime`` is parsed to datetimes
    and the columns ``order_day``, ``order_hour``, ``order_weekday``,
    ``order_month``, ``order_year``, ``day_of_week_name``, ``order_week``
    (ISO week number) and ``hour_category`` are added.

    :param df: DataFrame with a ``create_datetime`` column
    :return: the same DataFrame object with the extra columns
    """
    df['create_datetime'] = pd.to_datetime(df['create_datetime'])
    stamps = df['create_datetime'].dt

    df['order_day'] = stamps.day
    df['order_hour'] = stamps.hour
    df['order_weekday'] = stamps.weekday
    df['order_month'] = stamps.month
    df['order_year'] = stamps.year
    df['day_of_week_name'] = stamps.day_name()
    df['order_week'] = stamps.isocalendar().week

    def bucket_of(hour):
        # First bucket of the module-level `hours` mapping that lists this
        # hour; None when the hour is not listed in any bucket.
        return next(
            (bucket for bucket, members in hours.items() if hour in members),
            None,
        )

    # New column 'hour_category' holding the bucket of each order hour.
    df['hour_category'] = df['order_hour'].apply(bucket_of)
    return df

In [6]:
def buy_together(df, min_items=2):
    """Collapse order lines into one row per order with the list of its items.

    The original filter was ``len(items) >= 0``, which is always true and
    therefore kept every order. An order needs at least two lines before any
    items can be "bought together", so the threshold is now a parameter that
    defaults to 2. Pass ``min_items=0`` to keep every order.

    :param df: DataFrame with 'order_id', 'customer_id' and 'entity_id' columns
               (one row per order line)
    :param min_items: minimum number of order lines an order must have to be kept
    :return: DataFrame with columns 'order_id', 'customer_id' and 'entity_id',
             where 'entity_id' is the list of items of that order
    """
    df_grouped = (
        df.groupby(['order_id', 'customer_id'])['entity_id']
          .agg(list)
          .reset_index()
    )
    has_enough_items = df_grouped['entity_id'].apply(len) >= min_items
    # Re-number the rows so the result keeps a contiguous index after filtering.
    return df_grouped[has_enough_items].reset_index(drop=True)

In [7]:
def popular_products_of_the_day(df, day_date=None, month_date=None, year_date=None, flag=False):
    """
    Pick products to discount during the day, one per hour category.

    Rows are counted per ('hour_category', 'entity_id'). For each hour
    category 1..4 the product in the middle of the count ranking is chosen,
    skipping products already chosen for an earlier category.

    :param df: DataFrame with 'hour_category' and 'entity_id' columns (plus
               'order_day', 'order_month', 'order_year' when a date is given)
    :param day_date: day of month to restrict to
    :param month_date: month to restrict to
    :param year_date: year to restrict to; the date filter is applied only
                      when all three date parts are truthy
    :param flag: False ranks counts ascending, True ranks them descending
    :return: DataFrame with columns 'hour_category', 'entity_id', 'count'
    """
    if day_date and month_date and year_date:
        on_that_date = (
            (df['order_day'] == day_date)
            & (df['order_month'] == month_date)
            & (df['order_year'] == year_date)
        )
        df = df[on_that_date]

    counts = df.groupby(['hour_category', 'entity_id'])['hour_category'].size().reset_index(name='count')
    counts = counts.sort_values(by='count', ascending=not flag)
    counts = counts[counts['count'] >= 1]

    picked = pd.DataFrame(columns=counts.columns)
    used_entities = set()

    for slot in range(1, 5):
        is_candidate = (counts['hour_category'] == slot) & (~counts['entity_id'].isin(used_entities))
        candidates = counts[is_candidate]
        if candidates.empty:
            continue
        # Label of the row in the middle of the ranking (lower middle when even).
        middle_label = candidates.index[(len(candidates) - 1) // 2]
        picked.loc[len(picked)] = counts.loc[middle_label]
        used_entities.add(counts.loc[middle_label, 'entity_id'])

    return picked

In [8]:
def daily_product(df, year=None, month=None, flag=False):
    """
    Pick a "product of the day" for every weekday of one month, never
    repeating a product.

    Two selections are returned:

    1. For each weekday (Monday..Sunday) the product in the middle of that
       day's count ranking, skipping products already chosen for an earlier
       weekday.
    2. The top product per weekday: after ranking all (weekday, product)
       counts in descending order, the first row of each product and then
       the first row of each weekday is kept, ordered Monday..Sunday.

    :param df: DataFrame with 'order_year', 'order_month',
               'day_of_week_name' and 'entity_id' columns
    :param year: year to analyse
    :param month: month to analyse; if either ``year`` or ``month`` is
                  missing, both are replaced by the latest year in ``df``
                  and the latest month of that year
    :param flag: currently unused
    :return: tuple ``(median_pick_df, top_pick_df)``, both with columns
             'day_of_week_name', 'entity_id', 'count'
    """
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    if not year or not month:
        year = df['order_year'].max()
        month = df.loc[df['order_year'] == year, 'order_month'].max()

    # NOTE(review): January is always redirected to December of the previous
    # year, even when the caller passed month=1 explicitly. Kept as in the
    # original; confirm whether this is meant as "January data is incomplete"
    # or as an unfinished "use the previous month" rule.
    if month == 1:
        year -= 1
        month = 12

    in_period = (df['order_year'] == year) & (df['order_month'] == month)
    counts = (
        df[in_period]
        .groupby(['day_of_week_name', 'entity_id'])
        .size()
        .reset_index(name='count')
        .sort_values(by='count', ascending=False)
    )
    counts = counts[counts['count'] >= 1]

    selected_entities = set()
    new_df = pd.DataFrame(columns=counts.columns)

    for day in day_order:
        candidates = counts[(counts['day_of_week_name'] == day)
                            & (~counts['entity_id'].isin(selected_entities))]
        if candidates.empty:
            continue
        # Label of the row in the middle of the ranking (lower middle when even).
        median_index = candidates.index[(len(candidates) - 1) // 2]
        new_df.loc[len(new_df)] = counts.loc[median_index]
        selected_entities.add(counts.loc[median_index, 'entity_id'])

    # Work on an explicit copy: assigning a column into a slice of another
    # frame is what triggers pandas' SettingWithCopyWarning.
    top_per_day = (
        counts.drop_duplicates(subset='entity_id', keep='first')
              .drop_duplicates(subset='day_of_week_name', keep='first')
              .head(7)
              .copy()
    )
    # An ordered categorical makes sort_values follow Monday..Sunday instead
    # of alphabetical order.
    top_per_day['day_of_week_name'] = pd.Categorical(
        top_per_day['day_of_week_name'], categories=day_order, ordered=True
    )
    top_per_day = top_per_day.sort_values('day_of_week_name').reset_index(drop=True)

    return new_df, top_per_day

In [9]:
def find_most_frequent_pairs(df, customer_id=None):
    """
    Find the pairs of products that are most often bought together.

    Only pairs of two different products that occur at least twice are
    returned. When a customer is given but has fewer than 3 such pairs, the
    pairs computed over all customers are returned instead.

    :param df: DataFrame with one row per order and the columns 'entity_id'
               (list of products in the order) and 'customer_id'
    :param customer_id: restrict the search to this customer's orders;
                        ``None`` means all customers
    :return: DataFrame with columns 'pair' (tuple of two product ids, sorted)
             and 'count', ordered by 'count' descending
    """
    # `is not None` so that a legitimate customer id of 0 is not mistaken
    # for "no customer".
    if customer_id is not None:
        find_df = df[df['customer_id'] == customer_id]
    else:
        find_df = df

    # Count every unordered pair per order; a product paired with itself
    # (duplicate order lines) is not a real combination and is skipped.
    pair_counts = Counter(
        pair
        for item_list in find_df['entity_id']
        for pair in combinations(sorted(item_list), 2)
        if pair[0] != pair[1]
    )

    # Explicit dtypes keep the frame well-formed even when no pair was found.
    pair_counts_df = pd.DataFrame({
        'pair': pd.Series(list(pair_counts.keys()), dtype=object),
        'count': pd.Series(list(pair_counts.values()), dtype='int64'),
    })
    pair_counts_df_sorted = pair_counts_df.sort_values(by='count', ascending=False).reset_index(drop=True)
    pair_counts_df_sorted = pair_counts_df_sorted[pair_counts_df_sorted['count'] >= 2]

    # Fall back to the global statistics only for a customer-specific search.
    # Falling back from the global search to itself would recurse forever.
    if customer_id is not None and len(pair_counts_df_sorted) < 3:
        return find_most_frequent_pairs(df)

    return pair_counts_df_sorted

In [10]:
def find_frequent_pairs_with_entity(df, entity_id):
    """
    Return companion products of ``entity_id`` taken from the middle of the
    frequency ranking.

    The pairs containing ``entity_id`` are ranked by 'count' (descending);
    the companions whose rank lies within one position of the median rank
    are returned, i.e. at most three products.

    :param df: DataFrame with columns 'pair' (pair of product ids) and
               'count' (how often the pair occurred)
    :param entity_id: product whose companions are looked up
    :return: NumPy array with the ids of the selected companion products
    """
    def companion_of(pair):
        # The member of the pair that is not the requested product.
        return pair[1] if pair[0] == entity_id else pair[0]

    contains_entity = df['pair'].apply(lambda pair: entity_id in pair)
    matches = df[contains_entity].copy()
    matches['second_item'] = matches['pair'].apply(companion_of)

    ranked = (
        matches[['second_item', 'count']]
        .sort_values(by='count', ascending=False)
        .reset_index(drop=True)
    )

    # Keep only ranks at most one step away from the median rank.
    positions = ranked.index
    centre = np.median(positions)
    near_centre = positions[(positions >= centre - 1) & (positions <= centre + 1)]
    window = ranked.loc[near_centre]

    return window['second_item'].values[:3]

In [11]:
def metrics_client(df, customer_id, entitys_id):
    """
    Suggest products to offer for the given shopping basket.

    For every product in the basket the companion products are looked up in
    ``df`` with ``find_frequent_pairs_with_entity``. The suggestions are
    de-duplicated and products already in the basket are removed.

    :param df: DataFrame of frequent pairs (columns 'pair' and 'count');
               may be the global statistics or those of a single customer
    :param customer_id: id of the customer (currently unused)
    :param entitys_id: shopping basket, an iterable of product ids
    :return: list of suggested product ids
    """
    offered_products_set = set()
    for entity in entitys_id:
        # Use the frame passed by the caller. The original ignored `df` and
        # always read the global `most_frequent_pairs`, so customer-specific
        # statistics could never take effect.
        offered_products_set.update(find_frequent_pairs_with_entity(df, entity))
    return [product for product in offered_products_set if product not in entitys_id]

In [12]:
# Preprocessing: must be executed once the file has been loaded.

# Enrich the raw frame with the date/time columns.
df = preprocessing_df(df)

# Combine the order lines into one row per order.
df_buy_together = buy_together(df)

# Frequently combined products across all customers.
most_frequent_pairs = find_most_frequent_pairs(df_buy_together)

# Frequently combined products for one customer (the customer id is passed in).
most_frequent_pairs_customer = find_most_frequent_pairs(df_buy_together, 9172)

Параметры функции `metrics_client`:

- 1-й параметр — датафрейм с часто сочетаемыми товарами (можно передать часто сочетаемые товары конкретного клиента)
- 2-й параметр — id клиента
- 3-й параметр — корзина покупок (список id товаров)

In [13]:
metrics_client(df=most_frequent_pairs, customer_id=9172, entitys_id=[585])

[499, 476, 92]