# PrefixSpan

In [11]:
import copy
from collections import defaultdict

In [1]:
def projectSequence(sequence, prefix, newEvent):
    """Project one sequence onto `prefix`.

    Scans `sequence` for the first itemset that contains every item of
    `prefix` and returns that itemset together with everything after it.

    Args:
        sequence: iterable of itemsets (each itemset supports `in` and iteration).
        prefix: items that must all be present in the matching itemset.
        newEvent: when truthy, the itemset at index 0 is never accepted as
            the match, so the match has to start at a later position.

    Returns:
        A new list whose first element is a list copy of the matching itemset
        and whose remaining elements are shallow copies of the itemsets that
        follow it, or None when no itemset qualifies.
    """
    projected = None
    for position, event in enumerate(sequence):
        if projected is not None:
            # Match already found: keep every later itemset.
            projected.append(copy.copy(event))
            continue
        if newEvent and position == 0:
            # A new event may not reuse the first itemset.
            continue
        if all(member in event for member in prefix):
            projected = [list(event)]
    return projected

In [2]:
def projectDatabase(dataset, prefix, newEvent):
    """Project every sequence of `dataset` onto `prefix`.

    Each sequence is passed to `projectSequence` with the same `prefix` and
    `newEvent` arguments; sequences whose projection is None are dropped.

    Returns:
        A list with the non-None projections, in dataset order.
    """
    projections = (projectSequence(sequence, prefix, newEvent) for sequence in dataset)
    return [projection for projection in projections if projection is not None]

In [4]:
def generateItems(dataset):
    """Return the distinct items occurring anywhere in `dataset`, sorted ascending.

    `dataset` is iterated three levels deep: sequences, then itemsets, then items.
    """
    distinctItems = {
        item
        for sequence in dataset
        for itemset in sequence
        for item in itemset
    }
    return sorted(distinctItems)

def generateItemSupports(dataset, ignoreFirstEvent=False, prefix=[]):
    """Count in how many sequences each item occurs.

    An item is counted for a sequence when it appears in at least one itemset
    that also contains every item of `prefix`; items that are themselves in
    `prefix` are not counted. Each sequence contributes at most 1 per item.

    Args:
        dataset: iterable of sequences (each a sliceable list of itemsets).
        ignoreFirstEvent: when truthy, the first itemset of every sequence is
            skipped before counting.
        prefix: items that must all be present in an itemset for that itemset
            to be considered. The default list is only read, never mutated.

    Returns:
        A list of (item, count) tuples sorted ascending.
    """
    supports = defaultdict(int)
    for sequence in dataset:
        events = sequence[1:] if ignoreFirstEvent else sequence
        # Collect into a set so that repeated occurrences inside one sequence
        # still count only once.
        seenInSequence = {
            item
            for event in events
            if all(required in event for required in prefix)
            for item in event
            if item not in prefix
        }
        for item in seenInSequence:
            supports[item] += 1
    return sorted(supports.items())

In [5]:
def prefixSpan(dataset, minSupport):
    """Mine sequential patterns from `dataset`.

    Every item whose support (as reported by `generateItemSupports`) is at
    least `minSupport` seeds a one-item pattern; the pattern is then grown
    recursively by `prefixSpanInternal` on the database projected onto it.

    Returns:
        A list of (pattern, support) tuples, where a pattern is a list of
        itemsets (lists of items). Each seed is immediately followed by the
        patterns grown from it.
    """
    patterns = []
    for item, support in generateItemSupports(dataset):
        if support >= minSupport:
            seed = [[item]]
            patterns.append((seed, support))
            projected = projectDatabase(dataset, [item], False)
            patterns.extend(prefixSpanInternal(projected, minSupport, seed))
    return patterns

def prefixSpanInternal(dataset, minSupport, prevPrefixes=[]):
    """Recursively grow the pattern `prevPrefixes` on a projected database.

    Two kinds of growth are tried, in this order:
      1. same event  - append an item to the last itemset of the pattern;
      2. new event   - append a new single-item itemset to the pattern.

    Args:
        dataset: database already projected onto `prevPrefixes`.
        minSupport: minimum count an extension needs to be kept.
        prevPrefixes: current pattern as a list of itemsets. It is never
            mutated (extensions work on deep copies). It must be non-empty:
            the last itemset is read immediately, so the default `[]`
            raises IndexError.

    Returns:
        A list of (pattern, support) tuples for every frequent extension,
        each followed by the patterns grown from it.
    """
    patterns = []
    lastEvent = prevPrefixes[-1]

    # Same-event growth: only items that sort after the current last item are
    # accepted, so each itemset is built in ascending order exactly once.
    for item, support in generateItemSupports(dataset, False, prefix=lastEvent):
        if (support >= minSupport) and item > lastEvent[-1]:
            grown = copy.deepcopy(prevPrefixes)
            grown[-1].append(item)
            patterns.append((grown, support))
            narrowed = projectDatabase(dataset, grown[-1], False)
            patterns.extend(prefixSpanInternal(narrowed, minSupport, grown))

    # New-event growth: the first itemset of each projected sequence belongs
    # to the current event, so it is ignored when counting and projecting.
    for item, support in generateItemSupports(dataset, True):
        if support >= minSupport:
            grown = copy.deepcopy(prevPrefixes)
            grown.append([item])
            patterns.append((grown, support))
            narrowed = projectDatabase(dataset, [item], True)
            patterns.extend(prefixSpanInternal(narrowed, minSupport, grown))
    return patterns

In [7]:
# Toy sequence database used to exercise prefixSpan by hand.
# Nesting: dataset -> sequences -> events (itemsets) -> items.
dataset =  [
    [["a"], ["a", "b", "c"], ["a", "c"], ["c"]],
    [["a"], ["c"], ["b", "c"]],
    [["a", "b"], ["d"], ["c"], ["b"], ["c"]],
    [["a"], ["c"], ["b"], ["c"]]
]

# Load Data

In [2]:
import numpy as np
import pandas as pd
# Read the trade records and keep only the columns used in this notebook.
# NOTE(review): column meanings are not shown here - presumably vipno is the
# customer id, sldatime the sale timestamp and pluno the product id; confirm.
with open('./trade_new.csv', 'r') as resourse_data:
    data = pd.read_csv(resourse_data)[['vipno', 'sldatime', 'pluno', 'dptno', 'bndno']]
# Parse the timestamp column so that sorting and grouping use datetimes.
data['sldatime'] = pd.to_datetime(data['sldatime'])
# One group per (vipno, sldatime) pair.
sort_data = data.sort_values(['sldatime'],ascending=True).groupby(['vipno', 'sldatime'])
# Iterating the groupby yields ((vipno, sldatime), sub-frame) tuples.
# NOTE(review): this prints every group of the dataset; consider showing only
# the first few groups to keep the notebook output readable.
print([x for x in sort_data])

[((781924, Timestamp('2016-04-04 19:12:24')),         vipno            sldatime     pluno  dptno    bndno
16474  781924 2016-04-04 19:12:24  14402009  14402  14350.0
18227  781924 2016-04-04 19:12:24  11533012  11533      NaN
11173  781924 2016-04-04 19:12:24  11532011  11532  11129.0
17419  781924 2016-04-04 19:12:24  15200007  15200  15094.0
3100   781924 2016-04-04 19:12:24  11531020  11531  11149.0
21734  781924 2016-04-04 19:12:24  10130009  10130  10106.0
6822   781924 2016-04-04 19:12:24  14050019  14050  14082.0
3491   781924 2016-04-04 19:12:24  14014034  14014  14759.0), ((781924, Timestamp('2016-05-01 13:48:21')),         vipno            sldatime     pluno  dptno    bndno
18229  781924 2016-05-01 13:48:21  11533012  11533      NaN
18230  781924 2016-05-01 13:48:21  11533012  11533      NaN
17456  781924 2016-05-01 13:48:21  15200007  15200  15094.0
22051  781924 2016-05-01 13:48:21  14403083  14403      NaN
3174   781924 2016-05-01 13:48:21  10113009  10113  10706.0), ((781

In [3]:
# Build one sequence per vipno: a list of itemsets, where each itemset holds
# the pluno values that share the same sldatime.
sort_data2 = data.sort_values(['sldatime'], ascending=True).groupby('vipno')
pluno_sequence = [
    [list(same_time_rows['pluno']) for _, same_time_rows in vip_rows.groupby('sldatime')]
    for _, vip_rows in sort_data2
]
print(pluno_sequence)

[[[14402009, 11533012, 11532011, 15200007, 11531020, 10130009, 14050019, 14014034], [11533012, 11533012, 15200007, 14403083, 10113009], [15200007, 15202012, 11532036, 15113000], [11302032], [15200001, 14101028, 34023002, 14082002, 15120000, 23113024]], [[22712001], [15232004, 27000007, 15120003], [22712001], [14815007, 15120003], [15120003, 24401008], [10300021, 10300042], [15120003, 15114000], [10201017, 10201017, 15114000], [27000582, 10150038], [23110001], [15120006, 23132061], [10450048]], [[23112032], [14701035, 14405032, 14617001], [14802054, 22010034], [14300086, 22007027], [27100542], [22190002, 14860025, 14750043], [22172004, 23110009, 22103001, 22172000], [22190000, 22170003, 22130010], [22170003, 22800001, 22008012, 14750042], [22103001, 27100542, 22001006, 23110009, 22013007, 22701014, 22601000], [14701015], [22100010, 22002240, 22103010], [24010812, 22103005, 22601002, 22103006], [22103011, 14750062, 22102005, 22103005, 22036000, 25120014, 14750062, 25405948], [27030020, 2

In [67]:
# Mine the pluno sequences; 32 is the minimum number of sequences that must
# contain a pattern for it to be reported.
prefixSpan(pluno_sequence, 32)

[([[15110001]], 32),
 ([[15110071]], 37),
 ([[15119001]], 33),
 ([[15130027]], 55),
 ([[15200007]], 40),
 ([[21801125]], 34),
 ([[22008019]], 38),
 ([[22008020]], 46),
 ([[22008021]], 45),
 ([[22034000]], 36),
 ([[22035000]], 37),
 ([[22036000]], 127),
 ([[22036000, 30380003]], 34),
 ([[22036000], [22036000]], 47),
 ([[22036000], [30380002]], 35),
 ([[22036000], [30380003]], 47),
 ([[22101001]], 47),
 ([[22102000]], 32),
 ([[22102004]], 34),
 ([[22102005]], 77),
 ([[22102005], [30380003]], 35),
 ([[22102014]], 98),
 ([[22102014], [30380002]], 33),
 ([[22102014], [30380003]], 43),
 ([[22111004]], 56),
 ([[22130000]], 32),
 ([[22170001]], 40),
 ([[22500022]], 40),
 ([[22601000]], 46),
 ([[22701014]], 32),
 ([[23110001]], 88),
 ([[23110001], [30380003]], 40),
 ([[23110007]], 65),
 ([[23110009]], 153),
 ([[23110009, 30380002]], 38),
 ([[23110009, 30380003]], 40),
 ([[23110009], [23110009]], 63),
 ([[23110009], [30380002]], 40),
 ([[23110009], [30380003]], 62),
 ([[23120001]], 58),
 ([[2313

In [12]:
# Run the miner on the four-sequence toy `dataset`, keeping patterns found in
# at least 2 sequences.
prefixSpan(dataset, 2)

[([['a']], 4),
 ([['a', 'b']], 2),
 ([['a', 'b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2),
 ([['a'], ['b']], 4),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['c']], 4),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a'], ['c'], ['c']], 4),
 ([['b']], 4),
 ([['b', 'c']], 2),
 ([['b'], ['c']], 3),
 ([['b'], ['c'], ['c']], 2),
 ([['c']], 4),
 ([['c'], ['b']], 3),
 ([['c'], ['b'], ['c']], 2),
 ([['c'], ['c']], 4)]