In [1]:
import numpy as np 
import pandas as pd 

In [2]:
! pip install fpgrowth_py

Collecting fpgrowth_py
  Downloading fpgrowth_py-1.0.0-py3-none-any.whl (5.6 kB)
Installing collected packages: fpgrowth_py
Successfully installed fpgrowth_py-1.0.0


In [22]:
from fpgrowth_py import fpgrowth
import time
from tqdm import tqdm
from collections import defaultdict
import heapq

# Data Preprocessing

In [3]:
itemset_item_trn = pd.read_csv(r'C:\Users\a3179\OneDrive\바탕 화면\datamining_termproject\dataset\itemset_item_training.csv', names = ['itemset', 'item'])
itemset_item_trn

Unnamed: 0,itemset,item
0,8571,18506
1,8571,33644
2,8571,32627
3,5725,14855
4,5725,28037
...,...,...
85817,21913,21835
85818,21913,37137
85819,21913,41653
85820,21913,1682


In [4]:
basket = itemset_item_trn.groupby(['itemset']).agg({'item': lambda s: list(set(s))}) # grouping items from the same itemset

print('Dimension of the new grouped dataset : ', basket.shape)
print('----------')
basket.head()

Dimension of the new grouped dataset :  (22156, 1)
----------


Unnamed: 0_level_0,item
itemset,Unnamed: 1_level_1
0,"[10176, 22555, 21460, 31044]"
2,"[3002, 17798, 33815]"
3,"[35243, 12117, 13117]"
6,"[37817, 6932, 25215, 25031]"
7,"[17648, 27897, 16915, 14190]"


# Association Rule Mining Based on FP-growth Algorithm

In [16]:
freqItem, rules = fpgrowth(basket['item'].values, minSupRatio=0.00001, minConf=0.3)

In [53]:
freqItem

[{31044},
 {21460, 31044},
 {10176, 31044},
 {10176, 21460, 31044},
 {22555, 31044},
 {10176, 22555, 31044},
 {21460, 22555, 31044},
 {10176, 21460, 22555, 31044},
 {17798},
 {3002, 17798},
 {17798, 33815},
 {3002, 17798, 33815},
 {35243},
 {12117, 35243},
 {13117, 35243},
 {12117, 13117, 35243},
 {25215},
 {6932, 25215},
 {25031, 25215},
 {6932, 25031, 25215},
 {25215, 37817},
 {25031, 25215, 37817},
 {6932, 25215, 37817},
 {6932, 25031, 25215, 37817},
 {16915},
 {16915, 17648},
 {16915, 27897},
 {16915, 17648, 27897},
 {14190, 16915},
 {14190, 16915, 27897},
 {14190, 16915, 17648},
 {14190, 16915, 17648, 27897},
 {20815},
 {20815, 22391},
 {7021, 20815},
 {7021, 20815, 22391},
 {20815, 40969},
 {7021, 20815, 40969},
 {20815, 22391, 40969},
 {7021, 20815, 22391, 40969},
 {22271},
 {20915, 22271},
 {5843, 22271},
 {5843, 20915, 22271},
 {15394},
 {15394, 40085},
 {5154},
 {5154, 15394},
 {5154, 40085},
 {5154, 15394, 40085},
 {522},
 {522, 5154},
 {522, 15394},
 {522, 5154, 15394},
 {5

In [49]:
association=pd.DataFrame(rules,columns =['basket','next_item','proba']) 
association=association.sort_values(by='proba',ascending=False)
print('Dimensions of the association table are : ', association.shape)
association

Dimensions of the association table are :  (864, 3)


Unnamed: 0,basket,next_item,proba
346,"{20538, 32199}",{21860},1.000000
544,"{42405, 3327}",{494},1.000000
140,"{13297, 32205}",{42035},1.000000
141,"{42035, 32205}",{13297},1.000000
661,"{23832, 33008}",{12996},1.000000
...,...,...,...
582,{31432},{2233},0.010526
314,{31432},{23169},0.010526
811,{31432},{784},0.010526
806,{31432},{1363},0.010526


# Item Generation for Validation Dataset

In [None]:
def recommend_items(frequent_items, association_rules, items, top_n):
    recommended_items = []
    
    # Association Rule 기반 추천
    matching_rules = association_rules[association_rules['precedents'].apply(lambda x: x.issubset(set(items)) or set(items).issubset(items))]
    matching_rules = matching_rules.sort_values('proba', ascending=False).head(top_n)
    
    for next_product in matching_rules['next_product']:
        recommended_items.extend(list(next_product))
        if len(recommended_items) >= top_n:
            break
        
    # Rule 기반 추천으로 N개를 채우지 못한 경우
    remaining_slots = top_n - len(recommended_items)
    if remaining_slots > 0:
        remaining_items = [item for item in frequent_items if item not in set(recommended_items)]
        # remaining_items = sorted(remaining_items, key=lambda x: frequent_items[x])
        
        for remaining_item in remaining_items:
            for element in remaining_item:
                recommended_items.append(element)
                if len(recommended_items) >= top_n:
                    break


    return recommended_items[:top_n]

In [17]:
itemset_item_val_q = pd.read_csv(r'C:\Users\a3179\OneDrive\바탕 화면\datamining_termproject\dataset\itemset_item_valid_query.csv', names = ['itemset', 'item'])
itemset_item_val_q

Unnamed: 0,itemset,item
0,6998,28322
1,6998,34217
2,6998,12058
3,8941,10176
4,8941,22364
...,...,...
7792,23768,11696
7793,23768,1738
7794,23768,38048
7795,6019,30882


In [18]:
val_basket = itemset_item_val_q.groupby(['itemset']).agg({'item': lambda s: list(set(s))}) # grouping items from the same itemset

print('Dimension of the new grouped dataset : ', val_basket.shape)
print('----------')
val_basket.head()

Dimension of the new grouped dataset :  (2769, 1)
----------


Unnamed: 0_level_0,item
itemset,Unnamed: 1_level_1
1,"[32102, 35006, 29189, 19558]"
4,"[32448, 1795, 12982]"
53,"[22078, 2350, 28628, 29102]"
55,"[10952, 5417, 40596]"
70,"[38824, 37154]"


In [19]:
val_items = val_basket['item']
val_items

itemset
1        [32102, 35006, 29189, 19558]
4                [32448, 1795, 12982]
53        [22078, 2350, 28628, 29102]
55               [10952, 5417, 40596]
70                     [38824, 37154]
                     ...             
27634            [14824, 5260, 32199]
27650            [3337, 20410, 15293]
27665                    [4371, 8580]
27670                  [41739, 38972]
27685    [37617, 23643, 13684, 37319]
Name: item, Length: 2769, dtype: object

In [27]:
top_n = 100
val_recommendation = []

for items in tqdm(val_items):
    recommended_items = recommend_items(freqItem, association, items, top_n)
    val_recommendation.append(recommended_items)

val_recommendation

100%|██████████| 2769/2769 [10:02<00:00,  4.59it/s]


In [28]:
val_recommendation

[[31432,
  34338,
  25194,
  19204,
  13979,
  27447,
  12942,
  7377,
  28955,
  36413,
  8487,
  14190,
  19090,
  4191,
  16926,
  784,
  27596,
  9574,
  16606,
  42035,
  1363,
  36689,
  34643,
  10434,
  231,
  16941,
  38939,
  40090,
  7876,
  13707,
  37284,
  38871,
  706,
  11280,
  12819,
  13708,
  39475,
  27897,
  40969,
  28867,
  41045,
  8525,
  22347,
  8007,
  25860,
  33644,
  38976,
  18481,
  4986,
  23507,
  14230,
  32842,
  26324,
  21506,
  21265,
  14084,
  8155,
  40395,
  35430,
  25186,
  14492,
  17187,
  12820,
  16118,
  6711,
  1554,
  26625,
  40884,
  27999,
  33642,
  6651,
  39609,
  37718,
  28586,
  4886,
  30085,
  35603,
  20422,
  42089,
  31941,
  27761,
  7945,
  18589,
  22392,
  12648,
  21011,
  19353,
  23867,
  2592,
  4725,
  38600,
  19547,
  6655,
  28189,
  7587,
  25264,
  37752,
  7530,
  14536,
  4626],
 [31432,
  34338,
  25194,
  19204,
  13979,
  27447,
  12942,
  7377,
  28955,
  36413,
  8487,
  14190,
  19090,
  4191,
  1

In [32]:
itemset_item_val_a = pd.read_csv(r'C:\Users\a3179\OneDrive\바탕 화면\datamining_termproject\dataset\itemset_item_valid_answer.csv', names = ['itemset', 'item'])
itemset_item_val_a

Unnamed: 0,itemset,item
0,6998,14100
1,8941,37713
2,11717,29703
3,18595,37718
4,23064,42559
...,...,...
2764,396,11105
2765,7525,35558
2766,14719,24249
2767,23768,10764


In [33]:
print(len(val_recommendation))
print(len(itemset_item_val_a))

2769
2769


In [47]:
val_ans = itemset_item_val_a['item']
val_acc = 0
for i in range(len(val_ans)):
    if val_ans[i] in val_recommendation[i]:
        val_acc += 1

val_acc = val_acc/len(val_ans)
print(f'validation accuracy: {val_acc}')

validation accuracy: 0.1105092091007584


In [48]:
val_avg_rank = 0
for i in range(len(val_ans)):
    if val_ans[i] in val_recommendation[i]:
        val_avg_rank += val_recommendation[i].index(val_ans[i])
    else:
        val_avg_rank += 101

val_avg_rank = val_avg_rank/len(val_ans)
print(f'average validation rank: {val_avg_rank}')

average validation rank: 93.5966052726616
