In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

from gsp import *
from prefixspan import PrefixSpan

# Sequential pattern mining

In [2]:
# Load the dataset from CSV; the first CSV column becomes the DataFrame index.
CLEAN_DATA_CSV = 'datasets/clean_data.csv'
data = pd.read_csv(CLEAN_DATA_CSV, index_col=0)

In [3]:
# Build a two-way mapping between ProdID values and consecutive integer codes
# (starting at 1), for use with the SPMF library.

prod_ids = data['ProdID'].unique()

# ProdID -> integer code, numbered in the order unique() returns the ids.
prod_to_int = {prod_id: code for code, prod_id in enumerate(prod_ids, start=1)}

# Integer code -> ProdID (inverse lookup).
int_to_prod = {code: prod_id for prod_id, code in prod_to_int.items()}

# Store the integer-encoded product id next to the original column.
data['ProdID_int'] = data['ProdID'].map(prod_to_int)

### Creating the shopping sequences

In [4]:
def _sequences_per_customer(frame, column):
    """Return, for each CustomerID, a list of baskets (each a list of `column` values)."""
    # One list of `column` values per (CustomerID, BasketID) pair.
    per_basket = frame.groupby(['CustomerID', 'BasketID'])[column].apply(list)
    # Level 0 of the resulting index is CustomerID: gather each customer's baskets.
    # NOTE(review): baskets follow groupby's default sorted key order (by BasketID);
    # confirm that this matches the chronological order of purchases.
    return per_basket.groupby(level=0).apply(list)


sequential_df = _sequences_per_customer(data, 'ProdID')
sequential_df_int = _sequences_per_customer(data, 'ProdID_int')

In [5]:
# Summary statistics of the sequence database.
n_sequences = len(sequential_df)
# An "event" here is one element (basket) of a customer's sequence.
n_events = sum(len(customer_seq) for customer_seq in sequential_df)

print("Number of input sequences:", n_sequences)
print("Total number of events:", n_events)

# Flatten sequences -> baskets -> products and keep each product once.
myset = {
    product
    for customer_seq in sequential_df
    for basket in customer_seq
    for product in basket
}

print("Number of distinct products:", len(myset))

Number of input sequences: 4333
Total number of events: 18400
Number of distinct products: 3658


## GSP algorithm

In [6]:
# Time a GSP run on a sample: only the first 100 customer sequences are mined
# (presumably to keep the runtime manageable).
# NOTE(review): the second positional argument (50) is presumably the minimum
# support threshold expected by gsp's apriori — confirm against that module.
#
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted while the cell runs.
start_time = time.perf_counter()
result_set = apriori(sequential_df.head(100), 50, verbose=False)
print("%s seconds" % (time.perf_counter() - start_time))

1.5127949714660645 seconds


In [7]:
# Print only the mined entries whose first field contains at least 2 items.
# NOTE(review): entry[0] looks like the pattern and entry[1] its support — confirm
# against the return format of gsp's apriori.
for entry in result_set:
    if len(entry[0]) <= 1:
        continue  # Skip single-item entries.
    print(entry[0], entry[1], sep="\n")

## PrefixSpan algorithm

In [8]:
import os

from spmf import Spmf

# Directory that contains spmf.jar. It can be overridden with the SPMF_BIN_DIR
# environment variable so the notebook runs on other machines; the original
# hard-coded location is kept as the fallback for backward compatibility.
path = os.environ.get("SPMF_BIN_DIR", "/Users/michelezoncheddu/Downloads")

# Relative minimum support: 2.3% of the input sequences, which SPMF reports as
# minsup = 100 sequences for this dataset.
spmf = Spmf("PrefixSpan", input_direct=sequential_df_int.tolist(), arguments=[0.023],
            spmf_bin_location_dir=path)
spmf.run()

# NOTE(review): pickle=True presumably also persists the resulting frame to
# disk — confirm against the spmf-py documentation.
result = spmf.to_pandas_dataframe(pickle=True)

>/Users/michelezoncheddu/Downloads/spmf.jar
 Total time ~ 5865 ms
 Frequent sequences count : 2306
 Max memory (mb) : 188.1513671875
 minsup = 100 sequences.
 Pattern count : 2306




In [9]:
# Keep only frequent patterns with at least 2 elements and a support of at least 200.
has_min_length = result['pattern'].map(len) >= 2
has_min_support = result['sup'] >= 200
result[has_min_length & has_min_support]

Unnamed: 0,pattern,sup
9,"[1, 1]",407
10,"[1, 1, 1]",224
94,"[14, 14]",291
270,"[66, 66]",276
297,"[66, 258]",207
310,"[66, 286]",206
323,"[66, 288]",212
338,"[66, 3008]",213
532,"[125, 125]",201
590,"[138, 66]",200
