# Ассоциативные правила. Алгоритм Априори

В таблице содержится информация о покупках. Необходимо, воспользовавшись этими данными, выяснить, какие пары товаров пользователи чаще всего покупают вместе.

In [3]:
import pandas as pd
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori 

In [4]:
purchases = pd.read_csv('https://stepik.org/media/attachments/lesson/409319/test1_completed.csv')
purchases.head()

Unnamed: 0,id,Товар,Количество
0,17119,Лимон,1.1
1,17119,Лимон оранжевый,0.7
2,17119,Лук-порей,10.0
3,17119,Лук репчатый,2.5
4,17119,Малина свежая,1.0


In [6]:
purchases = purchases.rename({'Товар': 'goods', 'Количество': 'quantity'}, axis=1)

In [7]:
# переводим данные в сводную таблицу
df = pd.pivot_table(purchases, values='quantity',
                    index='id', columns='goods').fillna(0)

In [8]:
df

goods,Абрикос вяленый,Абрикосы молдавские,Авокадо ХАСС,Авокадо стандарт,Алыча вяленая,Ананас Gold,Ананасовые кольца,Апельсины столовые,Арбуз,Арбуз овальный,...,Яблоки Гала,Яблоки Голден,Яблоки Джонаголд,Яблоки Мутсу,Яблоки Симиренко,Яблоки Фуджи,Яблоки Чемпион,Яблоки сезонные,Яблоки сушеные,Ягоды Годжи
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0
17530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0
17618,0.0,0.4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,1.7
17724,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,1.0,0.0,0.0,0.0,0.5,0.00,0.0,0.0,0.0,0.0
17814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0
119110,0.0,0.0,2.8,0.0,0.0,0.0,0.0,0.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,1.0,0.0,0.0,1.7
119206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0
119393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0


In [9]:
# так как количество товара в данном случае не играет роли, перекодируем купленный товар/не купленный в True/False
df = df.applymap(lambda x: 0 if not x else 1)
df.head()

goods,Абрикос вяленый,Абрикосы молдавские,Авокадо ХАСС,Авокадо стандарт,Алыча вяленая,Ананас Gold,Ананасовые кольца,Апельсины столовые,Арбуз,Арбуз овальный,...,Яблоки Гала,Яблоки Голден,Яблоки Джонаголд,Яблоки Мутсу,Яблоки Симиренко,Яблоки Фуджи,Яблоки Чемпион,Яблоки сезонные,Яблоки сушеные,Ягоды Годжи
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17530,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17618,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
17724,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
17814,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
transactions_count = df.size

In [14]:
# используя модуль apriori ищем наиболее встречаемые пары товаров 
frequent_items = apriori(df, min_support=0.05, use_colnames=True)

# так как модуль рассматривает все возможные комбинации, отбираем только те данные, которые для пар товаров
frequent_items['length'] = frequent_items['itemsets'].apply(len)
frequent_items = frequent_items.query('length == 2')

frequent_items.head()

Unnamed: 0,support,itemsets,length
109,0.0608,"(Абрикосы молдавские, Огурцы Луховицкие)",2
110,0.05469,"(Абрикосы молдавские, Укроп)",2
111,0.058356,"(Баклажаны грунтовые, Арбуз)",2
112,0.072411,"(Арбуз, Бананы)",2
113,0.069966,"(Арбуз, Кабачки)",2


In [15]:
# создаем столбик frequency, как произведение support на количество транзакций
frequent_items['frequency'] = frequent_items['support'] * transactions_count
frequent_items = frequent_items.drop(columns=['length', 'support'])

In [16]:
frequent_items = frequent_items.sort_values('frequency', ascending=False)

In [17]:
frequent_items = pd.DataFrame(frequent_items).reset_index(drop=True)
frequent_items[['good_1', 'good_2']] = pd.DataFrame(frequent_items.itemsets.to_list())
frequent_items = frequent_items.drop(columns='itemsets')

In [21]:
frequent_items

Unnamed: 0,frequency,good_1,good_2
0,85769.0,Огурцы Луховицкие,Укроп
1,81192.0,Укроп,Петрушка
2,68655.0,Огурцы Луховицкие,Арбуз
3,64874.0,Огурцы Луховицкие,Кабачки
4,60297.0,Укроп,Кинза
5,59700.0,Укроп,Лук зеленый
6,56914.0,Огурцы Луховицкие,Петрушка
7,56715.0,Огурцы Луховицкие,Лук репчатый
8,56516.0,Баклажаны грунтовые,Кабачки
9,55919.0,Укроп,Кабачки


In [22]:
# отбираем топ-10 пар товаров
frequent_itemsets_top = frequent_itemsets.sort_values('frequency', ascending=False).head(10)
frequent_itemsets_top

Unnamed: 0,itemsets,frequency
153,"(Огурцы Луховицкие, Укроп)",85769.0
158,"(Укроп, Петрушка)",81192.0
117,"(Огурцы Луховицкие, Арбуз)",68655.0
134,"(Огурцы Луховицкие, Кабачки)",64874.0
139,"(Укроп, Кинза)",60297.0
144,"(Укроп, Лук зеленый)",59700.0
151,"(Огурцы Луховицкие, Петрушка)",56914.0
145,"(Огурцы Луховицкие, Лук репчатый)",56715.0
123,"(Баклажаны грунтовые, Кабачки)",56516.0
136,"(Укроп, Кабачки)",55919.0


# Выводы

* Наиболее часто встречаемые пары товаров состоят преимущественно из овощей, самая распространенная пара это "огурцы и укроп". Вероятно это связано с тем, что это самые распространенные ингредиенты для салата.

* В топ-5 пар попал паттерн "огурцы и арбуз", скорее всего это связано с сезонностью.

* Также стоит отметить, что чаще всего одним из товаров в паре являются "огурцы", что можно объяснить тем, что данный овощ является самым распространенным в продуктовой корзине в принципе.

Подобный анализ продуктовой корзины позволяет оптимизировать размещение товаров в магазине, для увеличения выручки и удобства покупателя, если рассматривать онлайн-магазин, то полученные данные можно использовать для выдачи определенных рекомендаций пользователю в зависимости от продуктов в его корзине.
