# Task 4: Sequential Pattern Mining

In [1]:
import re
import copy
import numpy as np
import pandas as pd
from spmf import Spmf
from operator import neg

In [2]:
def spmf_encode_with_timestamp(dataset):
    """
    Serialise a sequence dataset into the SPMF text format with explicit timestamps.

    Each sequence is a list of events. An event is either a plain list of item
    strings (its position in the sequence is then used as the timestamp) or a
    ``(timestamp, [items])`` tuple. Whether a sequence is timestamped is decided
    by looking at its first event. Spaces inside item names are replaced by
    underscores, and items are numbered in sorted order.

    Example:

    dataset = [
        [(0, ['a']), (1, ['a', 'b', 'c']), (2, ['a', 'c']), (3, ['c'])],
        [(0, ['a']), (1, ['c']), (2, ['b', 'c'])],
        [(0, ['a', 'b']), (1, ['d']), (2, ['c']), (3, ['b']), (4, ['c'])],
        [(0, ['a']), (1, ['c']), (2, ['b']), (3, ['c'])]
    ]

    is encoded as:

    @CONVERTED_FROM_TEXT
    @ITEM=0=a
    @ITEM=1=b
    @ITEM=2=c
    @ITEM=3=d
    <0> 0 -1 <1> 0 1 2 -1 <2> 0 2 -1 <3> 2 -1 -2
    <0> 0 -1 <1> 2 -1 <2> 1 2 -1 -2
    <0> 0 1 -1 <1> 3 -1 <2> 2 -1 <3> 1 -1 <4> 2 -1 -2
    <0> 0 -1 <1> 2 -1 <2> 1 -1 <3> 2 -1 -2

    Returns:
        The encoded dataset as a single string, one line per header entry and
        per sequence, each terminated by a newline.
    """

    def normalise(item):
        # SPMF item names are whitespace-separated tokens, so spaces become underscores
        return item.replace(' ', '_')

    # collect the distinct (normalised) item names over the whole dataset
    vocabulary = set()
    for sequence in dataset:
        for event in sequence:
            itemset = event[1] if isinstance(event, tuple) else event
            vocabulary.update(normalise(item) for item in itemset)

    item_ids = {name: number for number, name in enumerate(sorted(vocabulary))}

    # header: marker line followed by one @ITEM line per item
    lines = ['@CONVERTED_FROM_TEXT']
    lines.extend('@ITEM={}={}'.format(number, name) for name, number in item_ids.items())

    # body: one line per sequence, "<t> ids... -1" per event, closed by "-2"
    for sequence in dataset:
        timed_events = sequence if isinstance(sequence[0], tuple) else enumerate(sequence)
        tokens = []
        for t, event in timed_events:
            tokens.append('<' + str(t) + '>')
            tokens.extend(str(item_ids[normalise(item)]) for item in event)
            tokens.append('-1')
        tokens.append('-2')
        lines.append(' '.join(tokens))

    return '\n'.join(lines) + '\n'

## GSP: Apriori-based Sequential Pattern Mining

In [3]:
dataset = [
    # sequence: list of events
    [['a'], ['a', 'b', 'c'], ['a', 'c'], ['c']],  # event: [list of strings]
    [['a'], ['c'], ['b', 'c']], 
    [['a', 'b'], ['d'], ['c'], ['b'], ['c']], 
    [['a'], ['c'], ['b'], ['c']]
]

In [4]:
def is_subsequence(main_sequence, subsequence):
    """
    Checks whether `subsequence` is a subsequence of `main_sequence`.

    Every itemset of `subsequence` must be contained in some itemset of
    `main_sequence`, with the matching itemsets appearing in the same order.
    The scan is greedy: each itemset is matched against the earliest possible
    position after the previous match. An empty `subsequence` is always contained.
    """
    cursor = 0  # index of the first itemset of main_sequence not consumed yet
    for wanted in subsequence:
        wanted_items = set(wanted)
        # advance to the first remaining itemset that contains all wanted items
        while cursor < len(main_sequence) and not set(main_sequence[cursor]).issuperset(wanted_items):
            cursor += 1
        if cursor == len(main_sequence):
            return False
        cursor += 1  # the next itemset must be matched strictly later
    return True

In [5]:
sequence = [['a'], ['b', 'c'], ['d'], ['a', 'e']]

In [6]:
is_subsequence(sequence, [['a'], ['b', 'c'], ['e']])

True

In [7]:
is_subsequence(sequence, [['a'], ['b', 'd']])

False

In [8]:
def sequence_length(sequence):
    """
    Returns the total number of items in the sequence, i.e. the sizes of all
    its itemsets added together.
    """
    total = 0
    for itemset in sequence:
        total += len(itemset)
    return total

In [9]:
sequence_length([['a'], ['b', 'c'], ['a'], ['b', 'c', 'd']])

7

In [10]:
def supports(sequence, cand_seq, max_span=np.inf, min_gap=0, max_gap=np.inf):
    """
    Checks whether `sequence` contains an occurrence of `cand_seq` that satisfies
    the given time constraints.

    Args:
        sequence: list of events; an event is either a plain collection of items
                  (its index in the sequence is used as timestamp) or a
                  ``(timestamp, items)`` tuple. Events are expected to be in
                  ascending timestamp order (the search stops scanning as soon
                  as an upper bound is exceeded).
        cand_seq: the candidate pattern, a list of itemsets
        max_span: maximum allowed difference between the timestamps of the last
                  and the first matched events
        min_gap: the difference between the timestamps of two consecutive matched
                 events must be strictly greater than this value
        max_gap: the difference between the timestamps of two consecutive matched
                 events must be at most this value

    Returns:
        True if at least one occurrence satisfies all the constraints, False otherwise.
        An empty `cand_seq` is trivially supported.
    """
    if not cand_seq:
        return True

    # normalise both representations to (timestamp, set of items) pairs
    events = [(event[0], set(event[1])) if isinstance(event, tuple) else (idx, set(event))
              for idx, event in enumerate(sequence)]
    pattern = [set(element) for element in cand_seq]

    def can_complete(prev_idx, elem_idx, first_t, failed):
        """
        Tries to match pattern[elem_idx:] after the event at `prev_idx`, which
        matched the previous pattern element. `failed` remembers the
        (prev_idx, elem_idx) states already proven to be dead ends for the
        current starting event, so that each state is explored at most once.
        """
        if elem_idx == len(pattern):
            return True
        if (prev_idx, elem_idx) in failed:
            return False
        prev_t = events[prev_idx][0]
        for j in range(prev_idx + 1, len(events)):
            t, items = events[j]
            # too close to the previous matched element: try a later event
            if not t - prev_t > min_gap:
                continue
            # too far from the previous element or from the first one:
            # later events can only be farther away
            if not t - prev_t <= max_gap or t - first_t > max_span:
                break
            # gaps are measured from the event matched last, and a failed
            # continuation backtracks to try a later event for this element
            if items.issuperset(pattern[elem_idx]) and can_complete(j, elem_idx + 1, first_t, failed):
                return True
        failed.add((prev_idx, elem_idx))
        return False

    for start, (first_t, items) in enumerate(events):
        if items.issuperset(pattern[0]) and can_complete(start, 1, first_t, set()):
            return True
    return False

Examples from Kumar's book, page 437:

In [11]:
sequence = [[1, 3], [3, 4], [4], [5], [6, 7], [8]]

In [12]:
print(supports(sequence, [[3], [4]], max_span=3))
print(supports(sequence, [[3], [6]], max_span=3))
print(supports(sequence, [[1, 3], [6]], max_span=3))

True
True
False


In [13]:
print(supports(sequence, [[3], [6]], min_gap=1))
print(supports(sequence, [[3], [6]], max_gap=3))

True
True


In [14]:
print(supports(sequence, [[6], [8]], min_gap=1))
print(supports(sequence, [[6], [8]], max_gap=3))

False
True


In [15]:
print(supports(sequence, [[1, 3], [6]], min_gap=1))
print(supports(sequence, [[1, 3], [6]], max_gap=3))

True
False


In [16]:
print(supports(sequence, [[1], [3], [8]], min_gap=1))
print(supports(sequence, [[1], [3], [8]], max_gap=3))

False
False


Examples from [Kumar's book slides](https://www-users.cs.umn.edu/~kumar001/dmbook/slides/chap6_advanced_association_analysis.pdf#page=25), slide 50:

In [17]:
supports([[2, 4], [3, 5, 6], [4, 7], [4, 5], [8]], [[6], [5]], max_span=4, min_gap=0, max_gap=2)

True

In [18]:
supports([[1], [2], [3], [4], [5]], [[1], [4]], max_span=4, min_gap=0, max_gap=2)

False

In [19]:
supports([[1], [2, 3], [3, 4], [4, 5]], [[2], [3], [5]], max_span=4, min_gap=0, max_gap=2)

True

In [20]:
# NOTE(review): the last event is written as a (timestamp, items) tuple while the
# other events are plain item lists (timestamped by their index) — presumably to
# place it at time 6 instead of 5; confirm that mixing both forms is intended.
supports([[1, 2], [3], [2, 3], [3, 4], [2, 4], (6, [4, 5])], [[1, 2], [5]], max_span=4, min_gap=0, max_gap=2)

False

In [21]:
def count_support(dataset, cand_seq, max_span=np.inf, min_gap=0, max_gap=np.inf):
    """
    Counts how many sequences of `dataset` contain `cand_seq` under the given
    time constraints.

    When all constraints are left at their defaults the plain containment test
    `is_subsequence` is used (timestamps, if present, are stripped first);
    otherwise each sequence is checked with `supports`.
    """
    unconstrained = max_span == np.inf and min_gap == 0 and max_gap == np.inf
    count = 0
    for seq in dataset:
        if unconstrained:
            # is_subsequence works on bare itemsets, so drop the timestamps
            itemsets = [event[1] for event in seq] if isinstance(seq[0], tuple) else seq
            matched = is_subsequence(itemsets, cand_seq)
        else:
            matched = supports(seq, cand_seq, max_span, min_gap, max_gap)
        if matched:
            count += 1
    return count

In [22]:
count_support(dataset, [['b']])

4

In [23]:
count_support(dataset, [['a'], ['b', 'c']])

2

In [24]:
def gen_cands_for_pair(cand1, cand2):
    """
    Generates one candidate of length k from two candidates of length (k-1).

    The two candidates are joined when `cand1` without its leftmost item equals
    `cand2` without its rightmost item. The result is `cand1` extended by the
    rightmost item of `cand2`: as a new itemset if that item forms an itemset of
    its own in `cand2`, otherwise as part of the last itemset.

    Args:
        cand1: sequence (list of itemsets, each a list of items) of length (k-1)
        cand2: sequence (list of itemsets, each a list of items) of length (k-1)

    Returns:
        The joined candidate as a new list of new lists (it shares no list with
        the inputs), or an empty list if the two candidates cannot be joined.
    """
    if not cand1 or not cand2:
        return []

    # cand1 without its leftmost item
    head_dropped = [list(itemset) for itemset in cand1]
    if len(head_dropped[0]) == 1:
        head_dropped.pop(0)
    else:
        head_dropped[0] = head_dropped[0][1:]

    # cand2 without its rightmost item
    tail_dropped = [list(itemset) for itemset in cand2]
    if len(tail_dropped[-1]) == 1:
        tail_dropped.pop(-1)
    else:
        tail_dropped[-1] = tail_dropped[-1][:-1]

    # the join is only defined when the two remainders coincide
    if head_dropped != tail_dropped:
        return []

    new_cand = [list(itemset) for itemset in cand1]
    last_item = cand2[-1][-1]
    if len(cand2[-1]) == 1:
        # a fresh list, so that the result never aliases cand2's last itemset
        new_cand.append([last_item])
    else:
        new_cand[-1].append(last_item)
    return new_cand

In [25]:
candA = [['a'], ['b', 'c'], ['d']]
candB = [['b', 'c'], ['d', 'e']]
gen_cands_for_pair(candA, candB)

[['a'], ['b', 'c'], ['d', 'e']]

In [26]:
candA = [['a'], ['b', 'c'], ['d']]
candC = [['b', 'c'], ['d'], ['e']]
gen_cands_for_pair(candA, candC)

[['a'], ['b', 'c'], ['d'], ['e']]

In [27]:
candA = [['a'], ['b', 'c'], ['d']]
candD = [['a'], ['b', 'c'], ['e']]
gen_cands_for_pair(candA, candD)

[]

In [28]:
def gen_cands(last_lvl_cands):
    """
    Builds the candidate sequences of length k from the frequent sequences of
    length (k-1).

    For k == 2 the candidates are enumerated directly from the single items:
    every pair inside one event (items in increasing order) and every ordered
    pair spread over two events. For larger k every ordered pair of
    (k-1)-sequences is joined with `gen_cands_for_pair`, and the successful
    joins are returned sorted.
    """
    k = sequence_length(last_lvl_cands[0]) + 1

    if k != 2:
        joined = []
        for left in last_lvl_cands:
            for right in last_lvl_cands:
                merged = gen_cands_for_pair(left, right)
                if merged != []:  # an empty list signals "no join possible"
                    joined.append(merged)
        joined.sort()
        return joined

    # level 2: work on the bare items of the frequent 1-sequences
    singles = [item for cand in last_lvl_cands for itemset in cand for item in itemset]
    same_event = [[[a, b]] for a in singles for b in singles if b > a]
    separate_events = [[[a], [b]] for a in singles for b in singles]
    return same_event + separate_events

Let's assume we know the frequent sequences of level 2:

In [29]:
last_lvl_freq_patterns = [
    [['a', 'b']], 
    [['b', 'c']], 
    [['a'], ['b']], 
    [['a'], ['c']], 
    [['b'], ['c']], 
    [['c'], ['b']], 
    [['c'], ['c']]
]

Then we can generate the candidates for level 3:

In [30]:
new_cands = gen_cands(last_lvl_freq_patterns)
new_cands

[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['a', 'b', 'c']],
 [['b'], ['c'], ['b']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['b']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

In [31]:
def gen_direct_subsequences(sequence):
    """
    Returns every direct subsequence of `sequence`, i.e. every sequence obtained
    by removing exactly one item from one of its events. An event that holds a
    single item disappears entirely when that item is removed.
    """
    shortened = []
    for position, itemset in enumerate(sequence):
        for item_idx in range(len(itemset)):
            reduced = copy.deepcopy(sequence)
            if len(itemset) == 1:
                # removing the only item removes the whole event
                del reduced[position]
            else:
                del reduced[position][item_idx]
            shortened.append(reduced)
    return shortened

In [32]:
def gen_contiguous_direct_subsequences(sequence):
    """
    Returns every direct contiguous subsequence of `sequence`.

    One item is removed at a time, but only where this keeps the subsequence
    contiguous: any item of the first or last event may be removed, whereas an
    item of a middle event may be removed only if that event holds more than
    one item (so the event itself survives).
    """
    last = len(sequence) - 1
    shortened = []
    for position, itemset in enumerate(sequence):
        at_boundary = position in (0, last)
        # dropping a single-item middle event would break contiguity
        if len(itemset) == 1 and not at_boundary:
            continue
        for item_idx in range(len(itemset)):
            reduced = copy.deepcopy(sequence)
            if len(itemset) == 1:
                del reduced[position]
            else:
                del reduced[position][item_idx]
            shortened.append(reduced)
    return shortened

In [33]:
def prune_cands(last_lvl_cands, cands_gen, max_gap=np.inf):
    """
    Prunes the set of candidates generated for length k given all frequent sequences of length (k-1).
    Without maxgap constraint: a candidate k-sequence is pruned if at least one of its (k-1)-subsequences is infrequent.
    With maxgap constraint: a candidate k-sequence is pruned if at least one of its contiguous (k-1)-subsequences is infrequent.

    Args:
        last_lvl_cands: the frequent sequences of length (k-1)
        cands_gen: the candidate sequences of length k
        max_gap: the maxgap constraint; `np.inf` means that no such constraint is set

    Returns:
        The candidates of `cands_gen` (in their original order) that survive the pruning.

    Items must be hashable, since the frequent sequences are indexed in a set.
    """
    def freeze(seq):
        # nested lists are not hashable; nested tuples compare the same way and are
        return tuple(tuple(itemset) for itemset in seq)

    # set membership replaces a linear scan of last_lvl_cands for every subsequence
    frequent = {freeze(seq) for seq in last_lvl_cands}
    subsequences_of = gen_direct_subsequences if max_gap == np.inf else gen_contiguous_direct_subsequences
    return [cand for cand in cands_gen
            if all(freeze(sub) in frequent for sub in subsequences_of(cand))]

We apply this to the candidates generated above:

In [34]:
cands_pruned = prune_cands(last_lvl_freq_patterns, new_cands)
cands_pruned

[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

In [35]:
# relative minimum support: a pattern must occur in at least half of the sequences
min_sup = 0.5
# count, for every surviving candidate, the sequences of the toy dataset that contain it
cands_counts = [(s, count_support(dataset, s)) for s in cands_pruned]
# keep only the candidates whose absolute count reaches the threshold
result_lvl = [(i, count) for i, count in cands_counts if count >= min_sup * len(dataset)]
result_lvl

[([['a'], ['b'], ['c']], 3),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['c']], 4),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b'], ['c']], 2)]

In [36]:
def gsp(dataset, min_sup, max_span=np.inf, min_gap=0, max_gap=np.inf, verbose=False):
    """
    Generalized Sequential Pattern (GSP) mining with optional time constraints.

    Starting from the frequent single items, the frequent sequences are found
    level by level: candidates of length k are generated from the frequent
    sequences of length (k-1), pruned with the Apriori principle, counted
    against the dataset and filtered by support, until a level yields nothing.

    Args:
        dataset: a list of sequences; a sequence is a list of events, each either
                 a collection of items or a (timestamp, items) tuple
        min_sup: minimum relative support (fraction of the sequences in `dataset`)
                 that makes a sequence frequent
        max_span: forwarded to `count_support`; maximum allowed time difference
                  between the latest and the earliest element of a pattern instance
        min_gap: forwarded to `count_support`; intended as the (exclusive) minimum
                 time difference between consecutive elements of a pattern instance
        max_gap: forwarded to `count_support`; intended as the (inclusive) maximum
                 time difference between consecutive elements of a pattern instance.
                 Also selects contiguous-subsequence pruning when it is not `np.inf`
        verbose: if greater than 0 the frequent sequences of each level are printed;
                 if greater than 1 the generated and pruned candidates are printed too

    Returns:
        A list of tuples (s, c), where s is a frequent sequence and c is its count,
        sorted by decreasing count and, for equal counts, by increasing length.
        Single items are counted without time constraints.
    """
    threshold = min_sup * len(dataset)  # absolute support a pattern has to reach

    # level 1: one pass over the dataset to find the frequent single items
    alphabet = sorted({item for sequence in dataset
                       for event in sequence
                       for item in (event[1] if isinstance(event, tuple) else event)})
    level = []
    for item in alphabet:
        pattern = [[item]]
        support = count_support(dataset, pattern)
        if support >= threshold:
            level.append((pattern, support))
    if verbose > 0:
        print('Result, lvl 1: ' + str(level))

    levels = [level]
    depth = 1  # length of the patterns stored in the most recent level
    while levels[-1]:
        frequent_prev = [pattern for pattern, _ in levels[-1]]
        # 1. candidate generation from the frequent sequences of the previous level
        generated = gen_cands(frequent_prev)
        # 2. candidate pruning (Apriori principle)
        surviving = prune_cands(frequent_prev, generated, max_gap)
        # 3. support counting and 4. elimination of the infrequent candidates
        level = []
        for cand in surviving:
            support = count_support(dataset, cand, max_span, min_gap, max_gap)
            if support >= threshold:
                level.append((cand, support))
        depth += 1
        if verbose > 0:
            print('Result, lvl ' + str(depth) + ': ' + str(level))
            if verbose > 1:
                print('Candidates generated, lvl ' + str(depth) + ': ' + str(generated))
                print('Candidates pruned, lvl ' + str(depth) + ': ' + str(surviving))
        levels.append(level)

    # flatten the levels (the last one is empty and contributes nothing)
    found = [entry for level in levels for entry in level]
    found.sort(key=lambda entry: (entry[1], -sequence_length(entry[0])), reverse=True)
    return found

In [37]:
gsp(dataset, min_sup=0.5, verbose=2)

Result, lvl 1: [([['a']], 4), ([['b']], 4), ([['c']], 4)]
Result, lvl 2: [([['a', 'b']], 2), ([['b', 'c']], 2), ([['a'], ['b']], 4), ([['a'], ['c']], 4), ([['b'], ['c']], 3), ([['c'], ['b']], 3), ([['c'], ['c']], 4)]
Candidates generated, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Candidates pruned, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Result, lvl 3: [([['a'], ['b'], ['c']], 3), ([['a'], ['b', 'c']], 2), ([['a'], ['c'], ['b']], 3), ([['a'], ['c'], ['c']], 4), ([['a', 'b'], ['c']], 2), ([['b'], ['c'], ['c']], 2), ([['c'], ['b'], ['c']], 2)]
Candidates generated, lvl 3: [[['a'], ['b'], ['c']], [['a'], ['b', 'c']], [['a'], ['c'], ['b']], [['a'], ['c'], ['c']], [['a', 'b'], ['c']], [['a'

[([['a']], 4),
 ([['b']], 4),
 ([['c']], 4),
 ([['a'], ['b']], 4),
 ([['a'], ['c']], 4),
 ([['c'], ['c']], 4),
 ([['a'], ['c'], ['c']], 4),
 ([['b'], ['c']], 3),
 ([['c'], ['b']], 3),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['c'], ['b']], 3),
 ([['a', 'b']], 2),
 ([['b', 'c']], 2),
 ([['a'], ['b', 'c']], 2),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b'], ['c']], 2),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2)]

In [38]:
spmf_dataset = spmf_encode_with_timestamp(dataset)
print(spmf_dataset)

@CONVERTED_FROM_TEXT
@ITEM=0=a
@ITEM=1=b
@ITEM=2=c
@ITEM=3=d
<0> 0 -1 <1> 0 1 2 -1 <2> 0 2 -1 <3> 2 -1 -2
<0> 0 -1 <1> 2 -1 <2> 1 2 -1 -2
<0> 0 1 -1 <1> 3 -1 <2> 2 -1 <3> 1 -1 <4> 2 -1 -2
<0> 0 -1 <1> 2 -1 <2> 1 -1 <3> 2 -1 -2



In [39]:
# run SPMF's GSP on the encoded toy dataset with the same relative minimum support
spmf = Spmf('GSP', input_direct=spmf_dataset, arguments=[0.5])  # min_sup
spmf.run()
freq_patterns = spmf.to_pandas_dataframe()
# turn every pattern into the (list of itemsets, support) form returned by gsp():
# each event string is split on whitespace into its items
freq_patterns = [([event.split() for event in sequence], sup) 
                 for sequence, sup in zip(freq_patterns.pattern, freq_patterns.sup)]
# same ordering as gsp(): decreasing support, then increasing pattern length
freq_patterns.sort(key=lambda tup: (tup[1], neg(sequence_length(tup[0]))), reverse=True)
freq_patterns

>/home/donato/donato.meoli.95@gmail.com/DataMiningUniPi/DM_Group18_TASK4/spmf.jar
 Total time ~ 5 ms
 Frequent sequences count : 19
 Max memory (mb):12.550148010253906

Post-processing to show result in terms of string values.
Post-processing completed.



[([['a']], 4),
 ([['b']], 4),
 ([['c']], 4),
 ([['a'], ['b']], 4),
 ([['a'], ['c']], 4),
 ([['c'], ['c']], 4),
 ([['a'], ['c'], ['c']], 4),
 ([['b'], ['c']], 3),
 ([['c'], ['b']], 3),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['c'], ['b']], 3),
 ([['a', 'b']], 2),
 ([['b', 'c']], 2),
 ([['a'], ['b', 'c']], 2),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2),
 ([['a'], ['c'], ['b'], ['c']], 2)]

## Loading the new Customer Supermarket dataset

In [40]:
df = pd.read_csv('../dataset/new_customer_supermarket.csv', sep='\t', index_col=0)
df

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,ProdID,ProdDescr,Qta,TotSale
0,539993,2011-04-01 10:00:00,1.95,13313.0,22386,JUMBO BAG PINK POLKADOT,10,19.50
1,539993,2011-04-01 10:00:00,0.42,13313.0,21499,BLUE POLKADOT WRAP,25,10.50
2,539993,2011-04-01 10:00:00,0.42,13313.0,21498,RED RETROSPOT WRAP,25,10.50
3,539993,2011-04-01 10:00:00,2.10,13313.0,22379,RECYCLING BAG RETROSPOT,5,10.50
4,539993,2011-04-01 10:00:00,1.25,13313.0,20718,RED RETROSPOT SHOPPER BAG,10,12.50
...,...,...,...,...,...,...,...,...
363571,581587,2011-09-12 12:50:00,0.85,12680.0,22613,PACK OF SPACEBOY NAPKINS,12,10.20
363572,581587,2011-09-12 12:50:00,2.10,12680.0,22899,CHILDRENS APRON DOLLY GIRL,6,12.60
363573,581587,2011-09-12 12:50:00,4.15,12680.0,23254,CHILDRENS CUTLERY DOLLY GIRL,4,16.60
363574,581587,2011-09-12 12:50:00,4.15,12680.0,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,16.60


In [41]:
df.dtypes

BasketID        int64
BasketDate     object
Sale          float64
CustomerID    float64
ProdID         object
ProdDescr      object
Qta             int64
TotSale       float64
dtype: object

In [42]:
# the unit is spelled out because pandas >= 2.0 rejects the unit-less 'datetime64' dtype
df = df.astype({'BasketDate': 'datetime64[ns]',
                'BasketID': 'object',
                'CustomerID': 'object'})

## Data Modeling

In [43]:
# order the rows chronologically (in place)
df.sort_values('BasketDate', inplace=True)
# day of the year (1-366) of each basket, used below as the event timestamp
# NOTE(review): the year is discarded, so this presumably assumes that all baskets
# fall within a single calendar year — confirm against the data
df['BasketDayOfYear'] = df['BasketDate'].dt.dayofyear
df

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,ProdID,ProdDescr,Qta,TotSale,BasketDayOfYear
20767,542776,2011-01-02 08:23:00,0.30,15240.0,17021,NAMASTE SWAGAT INCENSE,36,10.80,2
20769,542776,2011-01-02 08:23:00,4.95,15240.0,21485,RETROSPOT HEART HOT WATER BOTTLE,3,14.85,2
20768,542776,2011-01-02 08:23:00,3.75,15240.0,21218,RED SPOTTY BISCUIT TIN,5,18.75,2
20766,542776,2011-01-02 08:23:00,2.95,15240.0,22083,PAPER CHAIN KIT RETROSPOT,12,35.40,2
20765,542776,2011-01-02 08:23:00,4.65,15240.0,22835,HOT WATER BOTTLE I AM SO POORLY,4,18.60,2
...,...,...,...,...,...,...,...,...,...
254281,570876,2011-12-10 17:19:00,3.75,16085.0,23394,POSTE FRANCE CUSHION COVER,1,3.75,344
254280,570876,2011-12-10 17:19:00,1.55,16085.0,46000M,POLYESTER FILLER PAD,1,1.55,344
254279,570876,2011-12-10 17:19:00,1.45,16085.0,46000S,POLYESTER FILLER PAD,1,1.45,344
254291,570876,2011-12-10 17:19:00,1.65,16085.0,22469,HEART OF WICKER SMALL,3,4.95,344


In [68]:
# for every customer: {day of year -> distinct products bought on that day}
baskets_sequences = df.groupby('CustomerID').apply(lambda customer: customer.groupby('BasketDayOfYear')['ProdDescr'].unique().to_dict())
# one sequence per customer, as a list of (timestamp, items) events
baskets_sequences = [[(k, v) for k, v in baskets.items()] for baskets in baskets_sequences]
# keep only the customers with purchases on more than one day
baskets_sequences = list(filter(lambda sequence: len(sequence) > 1, baskets_sequences))  # filtering out some customers
baskets_sequences[:1]

[[(26,
   array(['PACK OF SPACEBOY CAKE CASES', 'TEA TIME OVEN GLOVE',
          'RED RETROSPOT OVEN GLOVE', 'RED RETROSPOT OVEN GLOVE DOUBLE',
          'SET RED RETROSPOT TEA TOWELS', 'REGENCY CAKESTAND TIER',
          'TOOTHPASTE TUBE PEN', 'MINI LADLE LOVE HEART RED',
          'CHOCOLATE CALCULATOR', 'SET OF TINS VINTAGE BATHROOM',
          'RED TOADSTOOL LED NIGHT LIGHT', 'DOG PICTURE PLAYING CARDS',
          'BOX OF ASSORTED COLOUR TEASPOONS', 'TEATIME FAIRY CAKE CASES',
          'PACK OF MUSHROOM CAKE CASES', 'SMALL HEART MEASURING SPOONS',
          'SWEETHEART FAIRY CAKE CASES',
          'BLUE NEW BAROQUE CANDLESTICK CANDLE',
          'BLACK CANDELABRA TLIGHT HOLDER', 'WOODLAND CHARLOTTE BAG',
          'AIRLINE BAG VINTAGE JET SET BROWN',
          'AIRLINE BAG VINTAGE JET SET WHITE',
          'PINK NEW BAROQUECANDLESTICK CANDLE',
          'ALARM CLOCK BAKELIKE CHOCOLATE', 'ALARM CLOCK BAKELIKE GREEN',
          'ALARM CLOCK BAKELIKE RED', 'ALARM CLOCK BAKELIKE PINK'

Custom implementation without time constraints:

In [69]:
%time custom_freq_patterns = gsp(baskets_sequences, min_sup=0.07, verbose=False)
list(filter(lambda freq_pattern: sequence_length(freq_pattern[0]) >= 2, custom_freq_patterns))

CPU times: user 47min 59s, sys: 132 ms, total: 47min 59s
Wall time: 47min 59s


[([['WHITE HANGING HEART TLIGHT HOLDER'],
   ['WHITE HANGING HEART TLIGHT HOLDER']],
  366),
 ([['JUMBO BAG RED RETROSPOT'], ['JUMBO BAG RED RETROSPOT']], 301),
 ([['REGENCY CAKESTAND TIER'], ['REGENCY CAKESTAND TIER']], 295),
 ([['ASSORTED COLOUR BIRD ORNAMENT'], ['ASSORTED COLOUR BIRD ORNAMENT']], 274),
 ([['PARTY BUNTING'], ['PARTY BUNTING']], 272),
 ([['LUNCH BAG RED RETROSPOT'], ['LUNCH BAG RED RETROSPOT']], 266),
 ([['GREEN REGENCY TEACUP AND SAUCER', 'ROSES REGENCY TEACUP AND SAUCER']],
  254),
 ([['PAPER CHAIN KIT CHRISTMAS', 'PAPER CHAIN KIT VINTAGE CHRISTMAS']], 247),
 ([['GREEN REGENCY TEACUP AND SAUCER', 'PINK REGENCY TEACUP AND SAUCER']],
  239),
 ([['LUNCH BAG PINK POLKADOT', 'LUNCH BAG RED RETROSPOT']], 231),
 ([['LUNCH BAG BLACK SKULL', 'LUNCH BAG RED RETROSPOT']], 228),
 ([['LUNCH BAG CARS BLUE', 'LUNCH BAG RED RETROSPOT']], 225),
 ([['PARTY BUNTING', 'SPOTTY BUNTING']], 223),
 ([['GARDENERS KNEELING PAD CUP OF TEA', 'GARDENERS KNEELING PAD KEEP CALM']],
  222),
 ([['H

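SPMF implementation without effective time constraints (Hirate-Yamana with gap and span bounds between 0 and 365 days, run with a lower *min_sup* of 0.05):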

In [71]:
spmf_baskets_sequences = spmf_encode_with_timestamp(baskets_sequences)
spmf = Spmf('HirateYamana', input_direct=spmf_baskets_sequences,  # http://www.philippe-fournier-viger.com/spmf/hirateyamana.pdf
            arguments=[0.05, 0, 365, 0, 365])  # min_sup, min_gap, max_gap, min_span, max_span
spmf.run()
spmf_freq_patterns = spmf.to_pandas_dataframe()
# strip the '<t> ' time marker of each event before splitting it into items and
# restoring the spaces in the item names; `\d+` (in a raw string) is needed
# because a marker may have more than one digit, e.g. '<12> '
spmf_freq_patterns = [([[item.replace('_', ' ') for item in re.sub(r'<\d+> ', '', event).split()] for event in sequence], sup) 
                      for sequence, sup in zip(spmf_freq_patterns.pattern, spmf_freq_patterns.sup)]
# same ordering as gsp(): decreasing support, then increasing pattern length
spmf_freq_patterns.sort(key=lambda tup: (tup[1], neg(sequence_length(tup[0]))), reverse=True)
list(filter(lambda freq_pattern: sequence_length(freq_pattern[0]) >= 2, spmf_freq_patterns))

>/home/donato/donato.meoli.95@gmail.com/DataMiningUniPi/DM_Group18_TASK4/spmf.jar
 Total time ~ 14606 ms
 Frequent sequences count : 527

Post-processing to show result in terms of string values.
Post-processing completed.



[([['GREEN REGENCY TEACUP AND SAUCER', 'PINK REGENCY TEACUP AND SAUCER']],
  167),
 ([['LUNCH BAG PINK POLKADOT', 'LUNCH BAG RED RETROSPOT']], 160),
 ([['GREEN REGENCY TEACUP AND SAUCER', 'ROSES REGENCY TEACUP AND SAUCER']],
  159),
 ([['JUMBO BAG PINK POLKADOT', 'JUMBO BAG RED RETROSPOT']], 158),
 ([['LUNCH BAG RED RETROSPOT', 'LUNCH BAG BLACK SKULL']], 153),
 ([['PAPER CHAIN KIT CHRISTMAS', 'PAPER CHAIN KIT VINTAGE CHRISTMAS']], 151),
 ([['ROSES REGENCY TEACUP AND SAUCER', 'GREEN REGENCY TEACUP AND SAUCER']],
  150),
 ([['LUNCH BAG CARS BLUE', 'LUNCH BAG RED RETROSPOT']], 150),
 ([['LUNCH BAG RED RETROSPOT', 'LUNCH BAG PINK POLKADOT']], 150),
 ([['LUNCH BAG CARS BLUE', 'LUNCH BAG PINK POLKADOT']], 149),
 ([['ALARM CLOCK BAKELIKE RED', 'ALARM CLOCK BAKELIKE GREEN']], 147),
 ([['LUNCH BAG SUKI DESIGN', 'LUNCH BAG SPACEBOY DESIGN']], 147),
 ([['GARDENERS KNEELING PAD KEEP CALM', 'GARDENERS KNEELING PAD CUP OF TEA']],
  146),
 ([['ALARM CLOCK BAKELIKE GREEN', 'ALARM CLOCK BAKELIKE RED']]

Custom implementation with time constraints:
- the overall duration of the pattern instance must be at most of 1 month;
- each element of the pattern instance must be more than 1 day after the previous one (the custom `supports` check requires the gap to be strictly greater than `min_gap`);
- each element of the pattern instance must be at most 1 week after the previous one.

In [70]:
%time custom_freq_patterns = gsp(baskets_sequences, min_sup=0.07, max_span=30, min_gap=1, max_gap=7, verbose=False)
list(filter(lambda freq_pattern: sequence_length(freq_pattern[0]) >= 2, custom_freq_patterns))

CPU times: user 46min 25s, sys: 80 ms, total: 46min 25s
Wall time: 46min 25s


[([['GREEN REGENCY TEACUP AND SAUCER', 'ROSES REGENCY TEACUP AND SAUCER']],
  254),
 ([['PAPER CHAIN KIT CHRISTMAS', 'PAPER CHAIN KIT VINTAGE CHRISTMAS']], 247),
 ([['GREEN REGENCY TEACUP AND SAUCER', 'PINK REGENCY TEACUP AND SAUCER']],
  239),
 ([['LUNCH BAG PINK POLKADOT', 'LUNCH BAG RED RETROSPOT']], 231),
 ([['LUNCH BAG BLACK SKULL', 'LUNCH BAG RED RETROSPOT']], 228),
 ([['LUNCH BAG CARS BLUE', 'LUNCH BAG RED RETROSPOT']], 225),
 ([['PARTY BUNTING', 'SPOTTY BUNTING']], 223),
 ([['GARDENERS KNEELING PAD CUP OF TEA', 'GARDENERS KNEELING PAD KEEP CALM']],
  222),
 ([['HEART OF WICKER LARGE', 'HEART OF WICKER SMALL']], 222),
 ([['LUNCH BAG RED RETROSPOT', 'LUNCH BAG SUKI DESIGN']], 222),
 ([['JUMBO BAG PINK POLKADOT', 'JUMBO BAG RED RETROSPOT']], 221),
 ([['PINK REGENCY TEACUP AND SAUCER', 'ROSES REGENCY TEACUP AND SAUCER']],
  220),
 ([['LUNCH BAG CARS BLUE', 'LUNCH BAG PINK POLKADOT']], 218),
 ([['LUNCH BAG CARS BLUE', 'LUNCH BAG SUKI DESIGN']], 215),
 ([['LUNCH BAG RED RETROSPOT', '

SPMF implementation with the same time constraints:

In [72]:
spmf_baskets_sequences = spmf_encode_with_timestamp(baskets_sequences)
spmf = Spmf('HirateYamana', input_direct=spmf_baskets_sequences,  # http://www.philippe-fournier-viger.com/spmf/hirateyamana.pdf
            arguments=[0.05, 1, 7, 0, 30])  # min_sup, min_gap, max_gap, min_span, max_span
spmf.run()
spmf_freq_patterns = spmf.to_pandas_dataframe()
# strip the '<t> ' time marker of each event before splitting it into items and
# restoring the spaces in the item names; `\d+` (in a raw string) is needed
# because a marker may have more than one digit, e.g. '<12> '
spmf_freq_patterns = [([[item.replace('_', ' ') for item in re.sub(r'<\d+> ', '', event).split()] for event in sequence], sup) 
                      for sequence, sup in zip(spmf_freq_patterns.pattern, spmf_freq_patterns.sup)]
# same ordering as gsp(): decreasing support, then increasing pattern length
spmf_freq_patterns.sort(key=lambda tup: (tup[1], neg(sequence_length(tup[0]))), reverse=True)
list(filter(lambda freq_pattern: sequence_length(freq_pattern[0]) >= 2, spmf_freq_patterns))

>/home/donato/donato.meoli.95@gmail.com/DataMiningUniPi/DM_Group18_TASK4/spmf.jar
 Total time ~ 2607 ms
 Frequent sequences count : 527

Post-processing to show result in terms of string values.
Post-processing completed.



[([['GREEN REGENCY TEACUP AND SAUCER', 'PINK REGENCY TEACUP AND SAUCER']],
  167),
 ([['LUNCH BAG PINK POLKADOT', 'LUNCH BAG RED RETROSPOT']], 160),
 ([['GREEN REGENCY TEACUP AND SAUCER', 'ROSES REGENCY TEACUP AND SAUCER']],
  159),
 ([['JUMBO BAG PINK POLKADOT', 'JUMBO BAG RED RETROSPOT']], 158),
 ([['LUNCH BAG RED RETROSPOT', 'LUNCH BAG BLACK SKULL']], 153),
 ([['PAPER CHAIN KIT CHRISTMAS', 'PAPER CHAIN KIT VINTAGE CHRISTMAS']], 151),
 ([['ROSES REGENCY TEACUP AND SAUCER', 'GREEN REGENCY TEACUP AND SAUCER']],
  150),
 ([['LUNCH BAG CARS BLUE', 'LUNCH BAG RED RETROSPOT']], 150),
 ([['LUNCH BAG RED RETROSPOT', 'LUNCH BAG PINK POLKADOT']], 150),
 ([['LUNCH BAG CARS BLUE', 'LUNCH BAG PINK POLKADOT']], 149),
 ([['ALARM CLOCK BAKELIKE RED', 'ALARM CLOCK BAKELIKE GREEN']], 147),
 ([['LUNCH BAG SUKI DESIGN', 'LUNCH BAG SPACEBOY DESIGN']], 147),
 ([['GARDENERS KNEELING PAD KEEP CALM', 'GARDENERS KNEELING PAD CUP OF TEA']],
  146),
 ([['ALARM CLOCK BAKELIKE GREEN', 'ALARM CLOCK BAKELIKE RED']]

Please note that SPMF probably uses a different method to count the support of sequences with more than one item in the same event. As a consequence, some frequent patterns found by the custom implementation appear in the SPMF results with a lower support, and only after reducing the *min_sup* value.