# Аcсоциативные правила

В файле `Groceries_dataset.csv` содержится информация о покупках:
 - `uid` — идентификатор покупателя;
 - `date` — дата покупки;
 - `product` — купленный товар.

Необходимо, воспользовавшись этими данными, выяснить, какие пары товаров пользователи чаще всего покупают вместе.

Инструменты: SQL, Python

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from pathlib import Path

In [2]:
with open(Path.home().joinpath('.pgpass')) as auth:
    host, port, table, user, _ = auth.read().rstrip().split(':')
    
engine = create_engine(f'postgresql+psycopg2://{user}:@{host}:{port}/{table}')

In [3]:
df = pd.read_csv('Groceries_dataset.csv', parse_dates=['Date']) \
       .rename(columns={'Member_number': 'uid', 'Date': 'date', 'itemDescription': 'product'})

In [4]:
df.head()

Unnamed: 0,uid,date,product
0,1808,2015-07-21,tropical fruit
1,2552,2015-05-01,whole milk
2,2300,2015-09-19,pip fruit
3,1187,2015-12-12,other vegetables
4,3037,2015-01-02,whole milk


Скрипт для ускоренной записи датафрейма в базу данных PostgreSQL

In [5]:
import csv
from io import StringIO

def psql_insert_copy(table, conn, keys, data_iter):
    dbapi_conn = conn.connection
    
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)
        
        columns = ', '.join(f'"{key}"' for key in keys)
        if table.schema:
            table_name = f'{table.schema}.{table.name}'
        else:
            table_name = table.name
    
        sql = f'COPY {table_name} ({columns}) FROM STDIN WITH CSV'
        cur.copy_expert(sql=sql, file=s_buf)

In [6]:
df.to_sql('groceries', engine, if_exists='replace', method=psql_insert_copy)

Выделим сессии покупок для каждого пользователя

In [7]:
query = """
SELECT gr.uid, gr.date, gr.product, tr.transaction
FROM groceries AS gr, (
  SELECT
    uid
    , date
    , ROW_NUMBER() OVER(ORDER BY date, uid) AS transaction
  FROM groceries
  GROUP BY uid, date
) AS tr
WHERE gr.uid = tr.uid AND gr.date = tr.date;
"""
pd.read_sql(query, engine)

Unnamed: 0,uid,date,product,transaction
0,1808,2015-07-21,tropical fruit,11831
1,2552,2015-05-01,whole milk,10294
2,2300,2015-09-19,pip fruit,13019
3,1187,2015-12-12,other vegetables,14626
4,3037,2015-01-02,whole milk,8006
...,...,...,...,...
38760,4471,2014-08-10,sliced cheese,4902
38761,2022,2014-02-23,candy,1192
38762,1097,2014-04-16,cake bar,2305
38763,1510,2014-03-12,fruit/vegetable juice,1588


Опираясь на данные из таблицы транзакций и список продуктов-кандидатов, встречающихся в базе хотя бы 10 раз, строим таблицу, состояющую из декартова произведения идентификатора транзакции и уникальных продуктов из списка этой транзакции.

Для ускорения записи в БД используем вышезаписанный метод `psql_insert_copy`.

In [12]:
query = """
WITH transactions AS (
  SELECT gr.uid, gr.date, gr.product, tr.transaction
  FROM groceries AS gr, (
    SELECT
      uid
      , date
      , ROW_NUMBER() OVER(ORDER BY date, uid) AS transaction
    FROM groceries
    GROUP BY uid, date
  ) AS tr
  WHERE gr.uid = tr.uid AND gr.date = tr.date
),
candidates AS (
  SELECT product
  FROM transactions
  GROUP BY product
  HAVING count(*) > 10
)

--CREATE TEMP TABLE pairs AS
SELECT
  left_hand.transaction,
  left_hand.product AS left,
  right_hand.product AS right
FROM
  (SELECT product FROM candidates) AS c1,
  (SELECT product FROM candidates) AS c2,
  (SELECT DISTINCT transaction, product FROM transactions) AS left_hand,
  (SELECT DISTINCT transaction, product FROM transactions) AS right_hand
WHERE left_hand.transaction = right_hand.transaction
  AND left_hand.product <> right_hand.product
  AND left_hand.product = c1.product
  AND right_hand.product = c2.product;
"""
pd.read_sql(query, engine) \
  .to_sql('pairs', engine, if_exists='replace', method=psql_insert_copy)

Посчитаем, как частво встречаются вместе в таблице `pairs` составленные пары продуктов, чтобы сопоставить эти цифры с количеством уникальных транзакций и встречаемость отдельного «ведущего» продукта в паре. Эти соотношения и будут искомыми коэффициентами `support` и `confidence`, составляющими мертрику, описывающую совместную встречаемость пары продуктов в покупательской корзине.

In [14]:
query = """
WITH orders AS (
  SELECT
    gr.uid
    , gr.date
    , gr.product
    , tr.transaction
  FROM groceries AS gr, (
    SELECT
      uid
      , date
      , ROW_NUMBER() OVER(ORDER BY date, uid) AS transaction
    FROM groceries
    GROUP BY uid, date
  ) AS tr
  WHERE gr.uid = tr.uid
    AND gr.date = tr.date
),
supports AS (
  SELECT p.left, p.right, COUNT(*) AS both
  FROM pairs AS p
  GROUP BY 1, 2 
)

SELECT
  s.left, s.right
  , s.both::FLOAT / (SELECT COUNT(DISTINCT transaction) FROM orders) AS support
  , s.both::FLOAT / lc.count_left AS confidence
FROM supports AS s
JOIN (
  SELECT
    product
    , count(*) AS count_left
  FROM orders
  GROUP BY product
) AS lc
  ON s.left = lc.product
ORDER BY 3 DESC, 4 DESC;
"""
pairs = pd.read_sql(query, engine)

Пары с уровнем поддержки > 0.05

In [15]:
pairs.query('support > 0.005')

Unnamed: 0,left,right,support,confidence
0,other vegetables,whole milk,0.014837,0.116965
1,whole milk,other vegetables,0.014837,0.088729
2,rolls/buns,whole milk,0.013968,0.121795
3,whole milk,rolls/buns,0.013968,0.083533
4,soda,whole milk,0.011629,0.114927
...,...,...,...,...
69,yogurt,tropical fruit,0.005213,0.058471
70,frankfurter,other vegetables,0.005146,0.132759
71,other vegetables,frankfurter,0.005146,0.040569
72,pork,whole milk,0.005012,0.132509


Пары с одинаковым составом, но разным положением имеют одинаковый показатель `support`, но разный `confidence`. Что позволяет более точно выбрать ведущий товар в паре.

Попробуем найти наиболее частые пары на Python при помощи модуля `apriori` библиотеки `mlxtend`.

In [16]:
from mlxtend.frequent_patterns import apriori

In [17]:
dummy = pd.get_dummies(df['product'])
groceries = df.drop(['product'], axis=1).join(dummy)

In [18]:
products = groceries.columns.difference(set(['uid', 'date']))
data = groceries.groupby(['uid', 'date'], as_index=False)[products].sum() \
                .drop(columns=['uid', 'date'])
data.where(data < 1, 1, inplace=True) # data.applymap(lambda x: 0 if not x else 1)

In [19]:
frequent_items = apriori(data, min_support=0.005, max_len=2, use_colnames=True)
frequent_items.loc[frequent_items['itemsets'].apply(len) > 1].sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
103,0.014837,"(other vegetables, whole milk)"
112,0.013968,"(rolls/buns, whole milk)"
121,0.011629,"(soda, whole milk)"
125,0.011161,"(yogurt, whole milk)"
98,0.010559,"(rolls/buns, other vegetables)"
101,0.009691,"(soda, other vegetables)"
117,0.008955,"(sausage, whole milk)"
123,0.00822,"(tropical fruit, whole milk)"
110,0.008087,"(rolls/buns, soda)"
104,0.008087,"(yogurt, other vegetables)"


Как можно видеть, результат идентичен с вариантом решения на SQL.