In [1]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from gsp import *
from spmf import Spmf

# Sequential pattern mining

In [2]:
# Load the dataset; the first CSV column becomes the DataFrame index.
data = pd.read_csv(
    'datasets/clean_data.csv',
    index_col=0,
)

In [3]:
# Build a two-way mapping between ProdIDs and consecutive integer codes
# (starting at 1) for the SPMF library.

prod_ids = data['ProdID'].unique()

# ProdID -> integer code. enumerate replaces the hand-maintained counter;
# codes follow the order in which unique() returns the ProdIDs.
prod_to_int = {prod_id: code for code, prod_id in enumerate(prod_ids, start=1)}

# Integer code -> ProdID (reversed dictionary), used to decode mined patterns.
int_to_prod = {code: prod_id for prod_id, code in prod_to_int.items()}

# Integer-coded version of the ProdID column.
data['ProdID_int'] = data['ProdID'].map(prod_to_int)

### Creating the shopping sequences

In [4]:
# Step 1: collect the products of every (customer, basket) pair into a list,
# once with the original ProdIDs and once with their integer codes.
# NOTE(review): groupby sorts by its keys by default, so baskets end up in
# BasketID order — confirm that this matches the chronological purchase order.
per_basket = data.groupby(['CustomerID', 'BasketID'])
basket_items = per_basket['ProdID'].apply(list)
basket_items_int = per_basket['ProdID_int'].apply(list)

# Step 2: regroup on the first index level (CustomerID) so that each customer
# maps to a list of baskets, i.e. one sequence per customer.
sequential_df = basket_items.groupby(level=0).apply(list)
sequential_df_int = basket_items_int.groupby(level=0).apply(list)

In [5]:
# Summary statistics of the sequence database.
print("Number of input sequences:", len(sequential_df))

# An "event" is one basket, so this counts the baskets over all customers.
print("Total number of events:", sum(len(baskets) for baskets in sequential_df))

# Flatten customer -> basket -> product to collect the distinct products.
myset = {
    product
    for baskets in sequential_df
    for basket in baskets
    for product in basket
}

print("Number of distinct products:", len(myset))

Number of input sequences: 4333
Total number of events: 18400
Number of distinct products: 3658


In [6]:
# Display the integer-coded sequences (one list of baskets per CustomerID).
sequential_df_int

CustomerID
12347    [[326, 1066, 894, 45, 85, 269, 270, 84, 409, 2...
12348    [[285, 414, 72, 72, 283, 283, 408, 696, 695, 1...
12349    [[3053, 3587, 1368, 144, 1084, 1044, 161, 305,...
12350    [[1253, 1575, 801, 1020, 1460, 2634, 422, 421,...
12352    [[936, 208, 207, 105, 141, 528, 174, 665, 23, ...
                               ...                        
18280    [[120, 187, 261, 888, 28, 1854, 438, 1034, 212...
18281             [[959, 309, 988, 2814, 2815, 3008, 261]]
18282    [[2867, 2808, 3291, 1269, 264, 1606, 132], [52...
18283    [[1203, 259, 260, 59, 355, 1211, 354, 138, 106...
18287    [[798, 568, 800, 1333, 1330, 1334, 6, 579, 512...
Name: ProdID_int, Length: 4333, dtype: object

## GSP algorithm

In [7]:
# Time a GSP run on a small sample of the customer sequences.
GSP_SAMPLE_SIZE = 100  # number of leading customer sequences passed to apriori
GSP_MIN_SUPPORT = 50   # second positional argument of apriori — presumably the minimum support; confirm against gsp

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock, which can be adjusted
# while the cell is running.
start_time = time.perf_counter()
result_set = apriori(sequential_df.head(GSP_SAMPLE_SIZE), GSP_MIN_SUPPORT, verbose=False)
print("%s seconds" % (time.perf_counter() - start_time))

1.3860788345336914 seconds


In [8]:
# Print each GSP result whose pattern has at least two elements, followed by
# the second field of the result entry (presumably its support — confirm
# against gsp.apriori).
for entry in result_set:
    pattern = entry[0]
    if len(pattern) < 2:
        continue  # skip single-element patterns
    print(pattern)
    print(entry[1])

## PrefixSpan algorithm

In [9]:
# Directory that contains spmf.jar. It can be overridden through the
# SPMF_BIN_DIR environment variable so the notebook is not tied to a single
# machine; the default keeps the original location.
path = os.environ.get("SPMF_BIN_DIR", "/Users/michelezoncheddu/Downloads")

# Relative minimum support: 2.3% of the 4333 input sequences = 100 sequences.
spmf = Spmf("PrefixSpan", input_direct=sequential_df_int.tolist(), arguments=[0.023],
            spmf_bin_location_dir=path)
spmf.run()

# NOTE(review): pickle=True presumably also persists the resulting frame to
# disk — confirm against the spmf wrapper's to_pandas_dataframe documentation.
result = spmf.to_pandas_dataframe(pickle=True)

>/Users/michelezoncheddu/Downloads/spmf.jar
 Total time ~ 5804 ms
 Frequent sequences count : 2306
 Max memory (mb) : 169.270263671875
 minsup = 100 sequences.
 Pattern count : 2306




In [10]:
# Keep only the patterns with at least two elements and a support of at
# least 200 sequences.
has_min_length = result['pattern'].map(len) >= 2
has_min_support = result['sup'] >= 200
frequent_pattern = result[has_min_length & has_min_support]
frequent_pattern

Unnamed: 0,pattern,sup
9,"[1, 1]",407
10,"[1, 1, 1]",224
94,"[14, 14]",291
270,"[66, 66]",276
297,"[66, 258]",207
310,"[66, 286]",206
323,"[66, 288]",212
338,"[66, 3008]",213
532,"[125, 125]",201
590,"[138, 66]",200


In [11]:
# Translate every frequent pattern back to ProdIDs. Each pattern element is a
# whitespace-separated string of integer codes; all codes of a pattern are
# flattened, in order, into a single list of ProdIDs.
frequent_prod = [
    [int_to_prod[int(code)] for element in pattern for code in element.split()]
    for pattern in frequent_pattern['pattern']
]

In [12]:
# Display the frequent patterns decoded back to their original ProdIDs.
frequent_prod

[['85123A', '85123A'],
 ['85123A', '85123A', '85123A'],
 ['84879', '84879'],
 ['20725', '20725'],
 ['20725', '22382'],
 ['20725', '22383'],
 ['20725', '20727'],
 ['20725', '23209'],
 ['22469', '22469'],
 ['85099B', '20725'],
 ['85099B', '85099B'],
 ['85099B', '23203'],
 ['22197', '22197'],
 ['22384', '20725'],
 ['22383', '22383'],
 ['20728', '20725'],
 ['20728', '20728'],
 ['20727', '20725'],
 ['20727', '20727'],
 ['22423', '22423'],
 ['22423', '23245'],
 ['47566', '47566'],
 ['22720', '22720'],
 ['23298', '23298'],
 ['23209', '23209'],
 ['23203', '85099B'],
 ['23203', '23203']]