In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('dataset_apriori.csv')

In [3]:
dataset

Unnamed: 0,tid,items
0,1,"Handphone,Laptop"
1,2,"Handphone,Charger,Laptop"
2,3,"Powerbank,Laptop,Charger,Handphone"
3,4,"Tablet,Laptop,Handphone"
4,5,"Handphone,Charger,Tablet"
5,6,"Tablet,Powerbank"
6,7,"Handphone,Laptop,Tablet,Charger"
7,8,"Charger,Handphone"
8,9,"Handphone,Powerbank"
9,10,"Laptop,Charger,Powerbank"


In [4]:
df_items = dataset['items']
df_tid = dataset['tid']

In [5]:
df_items

0                      Handphone,Laptop
1              Handphone,Charger,Laptop
2    Powerbank,Laptop,Charger,Handphone
3               Tablet,Laptop,Handphone
4              Handphone,Charger,Tablet
5                      Tablet,Powerbank
6       Handphone,Laptop,Tablet,Charger
7                     Charger,Handphone
8                   Handphone,Powerbank
9              Laptop,Charger,Powerbank
Name: items, dtype: object

In [6]:
df_tid

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: tid, dtype: int64

<h2>Convert Items to Number</h2>

In [7]:
dictionaries = {'Handphone': 1, 'Laptop': 2, 'Charger': 3, 'Powerbank': 4, 'Tablet': 5 }

In [8]:
type(df_items)

pandas.core.series.Series

In [9]:
comma_splitted_df = df_items.apply(lambda x: x.split(','))

In [10]:
comma_splitted_df

0                        [Handphone, Laptop]
1               [Handphone, Charger, Laptop]
2    [Powerbank, Laptop, Charger, Handphone]
3                [Tablet, Laptop, Handphone]
4               [Handphone, Charger, Tablet]
5                        [Tablet, Powerbank]
6       [Handphone, Laptop, Tablet, Charger]
7                       [Charger, Handphone]
8                     [Handphone, Powerbank]
9               [Laptop, Charger, Powerbank]
Name: items, dtype: object

In [11]:
numbered_col = []
for i in range(len(comma_splitted_df)):
    list_numbered = list(map(lambda x: dictionaries[x], comma_splitted_df[i]))
    sort_numbered = sorted(list_numbered)
    numbered_col.append(sort_numbered)

numbered_col

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 5],
 [1, 3, 5],
 [4, 5],
 [1, 2, 3, 5],
 [1, 3],
 [1, 4],
 [2, 3, 4]]

<h2>Create Dataframe from Numbered Items</h2>

In [12]:
dict_data = {'items': numbered_col}
df = pd.DataFrame.from_dict(dict_data)

In [13]:
df

Unnamed: 0,items
0,"[1, 2]"
1,"[1, 2, 3]"
2,"[1, 2, 3, 4]"
3,"[1, 2, 5]"
4,"[1, 3, 5]"
5,"[4, 5]"
6,"[1, 2, 3, 5]"
7,"[1, 3]"
8,"[1, 4]"
9,"[2, 3, 4]"


In [14]:
pd.concat([df_tid, df], axis=1)

Unnamed: 0,tid,items
0,1,"[1, 2]"
1,2,"[1, 2, 3]"
2,3,"[1, 2, 3, 4]"
3,4,"[1, 2, 5]"
4,5,"[1, 3, 5]"
5,6,"[4, 5]"
6,7,"[1, 2, 3, 5]"
7,8,"[1, 3]"
8,9,"[1, 4]"
9,10,"[2, 3, 4]"


In [15]:
items = []
for i in range(len(df)):
    for j in range(len(df['items'][i])):
        items.append(df['items'][i][j])
items

[1,
 2,
 1,
 2,
 3,
 1,
 2,
 3,
 4,
 1,
 2,
 5,
 1,
 3,
 5,
 4,
 5,
 1,
 2,
 3,
 5,
 1,
 3,
 1,
 4,
 2,
 3,
 4]

<h2>Creating First Candidate (C1)</h2>

In [16]:
#Get unique element from list/array
unique_item = set(items)
unique_item

{1, 2, 3, 4, 5}

In [17]:
#Convert it to list
list_unique_item = list(unique_item)
list_unique_item

[1, 2, 3, 4, 5]

In [18]:
count_unique = []
for value in (list_unique_item):
    count_unique.append((value, items.count(value)))
count_unique

[(1, 8), (2, 6), (3, 6), (4, 4), (5, 4)]

In [19]:
candidate1_df = pd.DataFrame(count_unique, columns=["itemset", "sup"])

In [20]:
candidate1_df

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4


<h2>Creating first Frequent Itemset (L1)</h2>

In [535]:
minimum_sup = 2

In [536]:
filter_sup = candidate1_df['sup'] > minimum_sup
freq_itemset1 = candidate1_df[filter_sup]

In [538]:
freq_itemset1

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4


## Create the Second Candidate (C2)

In [539]:
import numpy
def self_join(prev_freq_itemset):
    self_join_candidate = []
    for i in range(len(prev_freq_itemset['itemset'])):
        for j in range((i+1), len(prev_freq_itemset['itemset'])):
            itemset_i = prev_freq_itemset['itemset'][i]
            itemset_j = prev_freq_itemset['itemset'][j]
            if(type(itemset_i) == numpy.int64 and type(itemset_j) == numpy.int64):
                itemset_i = {itemset_i}
                itemset_j = {itemset_j}
            union_candidate = itemset_i.union(itemset_j)

            if union_candidate not in self_join_candidate:
                self_join_candidate.append(union_candidate)
    return self_join_candidate

In [540]:
candidate2_list = self_join(freq_itemset1)

In [541]:
candidate2_list

[{1, 2},
 {1, 3},
 {1, 4},
 {1, 5},
 {2, 3},
 {2, 4},
 {2, 5},
 {3, 4},
 {3, 5},
 {4, 5}]

In [543]:
count_candidate2 = []

#Set the Initial value of Second Count Candidate (C2)
for i in range(len(candidate2_list)):
    count_candidate2.append((candidate2_list[i], 0))

count_candidate2

[({1, 2}, 0),
 ({1, 3}, 0),
 ({1, 4}, 0),
 ({1, 5}, 0),
 ({2, 3}, 0),
 ({2, 4}, 0),
 ({2, 5}, 0),
 ({3, 4}, 0),
 ({3, 5}, 0),
 ({4, 5}, 0)]

In [544]:
initial_df_candidate = pd.DataFrame(count_candidate2, columns=['itemset', 'sup'])

In [503]:
initial_df_candidate

Unnamed: 0,itemset,sup
0,"{1, 2}",0
1,"{1, 3}",0
2,"{1, 4}",0
3,"{1, 5}",0
4,"{2, 3}",0
5,"{2, 4}",0
6,"{2, 5}",0
7,"{3, 4}",0
8,"{3, 5}",0
9,"{4, 5}",0


In [504]:
#Let's see what's inside df
df

Unnamed: 0,items
0,"[1, 2]"
1,"[1, 2, 3]"
2,"[1, 2, 3, 4]"
3,"[1, 2, 5]"
4,"[1, 3, 5]"
5,"[4, 5]"
6,"[1, 2, 3, 5]"
7,"[1, 3]"
8,"[1, 4]"
9,"[2, 3, 4]"


In [550]:
#Let's add it with 1 whenever we found every candidate is a subset from Database D


def count_support(database_dataframe, prev_candidate_list):
    initial_df_candidate['sup'] = 0 #set All value into 0 only for initial value for consistency value when running this cell everytime.
    count_prev_candidate = []

    #Set the Initial value of Previous Candidate
    for i in range(len(prev_candidate_list)):
        count_prev_candidate.append((prev_candidate_list[i], 0))
    
    df_candidate = pd.DataFrame(count_prev_candidate, columns=['itemset', 'sup'])
    print('Database D dataframe\n', database_dataframe)
    print('(Initial) Dataframe from Candidate with All zeros sup\n', df_candidate)
    
    for i in range(len(database_dataframe)):
        for j in range(len(count_prev_candidate)):
            #using issubset() function to check whether every itemset is a subset of Database or not
            if (df_candidate['itemset'][j]).issubset(set(database_dataframe['items'][i])): 
                df_candidate.loc[j, 'sup'] += 1
            
    return df_candidate

In [506]:
count_candidate2_df = count_support(df, candidate2_list)

Database D dataframe
           items
0        [1, 2]
1     [1, 2, 3]
2  [1, 2, 3, 4]
3     [1, 2, 5]
4     [1, 3, 5]
5        [4, 5]
6  [1, 2, 3, 5]
7        [1, 3]
8        [1, 4]
9     [2, 3, 4]
Dataframe from Candidate with All zeros sup
   itemset  sup
0  {1, 2}    0
1  {1, 3}    0
2  {1, 4}    0
3  {1, 5}    0
4  {2, 3}    0
5  {2, 4}    0
6  {2, 5}    0
7  {3, 4}    0
8  {3, 5}    0
9  {4, 5}    0


# Creating Second Frequent Itemset (L2)

In [507]:
#Filter the itemset based on minimum support (occurences of items)
freq_itemset2 = count_candidate2_df[ count_candidate2_df['sup'] > minimum_sup ]

In [508]:
freq_itemset2

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
4,"{2, 3}",4


In [509]:
freq_itemset2_reset = freq_itemset2.reset_index(drop=True)

In [510]:
#We need to reset the index, because need to access the index later.
freq_itemset2_reset

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
2,"{2, 3}",4


## Creating the Third Candidate (C3) - Using the Candidate Forming Technique
* Self join
* Pruning

## Self Join

In [511]:
print(freq_itemset2_reset)
self_join_result = self_join(freq_itemset2_reset)

  itemset  sup
0  {1, 2}    5
1  {1, 3}    5
2  {2, 3}    4


## Pruning

In [512]:
def get_subset(candidate):
    temp = []
    final = []
    for i in range(len(candidate)):
        for j in range(len(candidate)):
            if i != j:
                temp.append(candidate[j])
        temp_set = set(temp)
        final.append(temp_set)
        temp.clear()
    print('Subset from {} : {}'.format(candidate, final))
    return final

def pruning(candidate_set, prev_freq_itemset):
    temp_candidate = []
    print('Candidate set', candidate_set)
    
    for idx, value in enumerate(candidate_set):
        list_candidate = list(value)
        temp_candidate.append(get_subset(list_candidate))
        
        for temp_item in temp_candidate:
            check = temp_item == prev_freq_itemset['itemset']
            print('\nCheck candidate from Previous Frequent Itemset\n', check)
            
            if any(check) == False:
                print(any(check))
                print('Val', value)
                candidate_set.remove(value)
                print(candidate_set)
            else:
                print('\nAll of {} subset contained in \n{}'.format(candidate_set, prev_freq_itemset))
                
    return candidate_set

In [513]:
freq_itemset2_reset

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
2,"{2, 3}",4


In [551]:
candidate3_list = pruning(self_join_result, freq_itemset2_reset)

Candidate set [{1, 2, 3}]
Subset from [1, 2, 3] : [{2, 3}, {1, 3}, {1, 2}]

Check candidate from Previous Frequent Itemset
 0    False
1     True
2    False
Name: itemset, dtype: bool

All of [{1, 2, 3}] subset contained in 
  itemset  sup
0  {1, 2}    5
1  {1, 3}    5
2  {2, 3}    4


In [521]:
candidate3_list

[{1, 2, 3}]

<h2>Creating the Third Frequent Itemset (L3)</h2>

In [516]:
#Let's see the database again
df

Unnamed: 0,items
0,"[1, 2]"
1,"[1, 2, 3]"
2,"[1, 2, 3, 4]"
3,"[1, 2, 5]"
4,"[1, 3, 5]"
5,"[4, 5]"
6,"[1, 2, 3, 5]"
7,"[1, 3]"
8,"[1, 4]"
9,"[2, 3, 4]"


In [517]:
#Then check the newest candidate value
candidate3_list

[{1, 2, 3}]

In [554]:
count_candidate3_df = count_support(df, candidate3_list)

Database D dataframe
           items
0        [1, 2]
1     [1, 2, 3]
2  [1, 2, 3, 4]
3     [1, 2, 5]
4     [1, 3, 5]
5        [4, 5]
6  [1, 2, 3, 5]
7        [1, 3]
8        [1, 4]
9     [2, 3, 4]
(Initial) Dataframe from Candidate with All zeros sup
      itemset  sup
0  {1, 2, 3}    0


In [555]:
count_candidate3_df

Unnamed: 0,itemset,sup
0,"{1, 2, 3}",3


In [552]:
freq_itemset3 = count_candidate3_df[count_candidate3_df['sup'] > minimum_sup]

In [553]:
freq_itemset3

Unnamed: 0,itemset,sup
0,"{1, 2, 3}",3


## All Frequent Itemset

In [556]:
#Let'see each frequent itemset (L)
freq_itemset1

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4


In [557]:
freq_itemset2

Unnamed: 0,itemset,sup
0,"{1, 2}",5
1,"{1, 3}",5
4,"{2, 3}",4


In [558]:
freq_itemset3

Unnamed: 0,itemset,sup
0,"{1, 2, 3}",3


In [563]:
frequent_itemset = pd.concat([freq_itemset1, freq_itemset2, freq_itemset3])

In [564]:
frequent_itemset

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4
0,"{1, 2}",5
1,"{1, 3}",5
4,"{2, 3}",4
0,"{1, 2, 3}",3


In [565]:
#Reset the index
frequent_itemset_final = frequent_itemset.reset_index(drop=True)

## Final Output of Freq. Itemset (L1-L3)

In [567]:
frequent_itemset_final

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4
5,"{1, 2}",5
6,"{1, 3}",5
7,"{2, 3}",4
8,"{1, 2, 3}",3


# MY Playground

In [36]:
a = [1,3,4]
b = [2, 3, 5]

In [37]:
setA = set(a)

In [38]:
set([1,3]).issubset(setA)

True

In [39]:
set([2,3]).issubset(set(b))

True

In [40]:
a = set([5, 6])
b = set([6, 7])
c = a & b # get the itersection
if len(c) == len(a) - 1:
    print (a | b) # their union

{5, 6, 7}


In [41]:
alist = []
alist.append(filter_df['itemset'][0])
alist

NameError: name 'filter_df' is not defined

In [None]:
lista = [5, 6, 7]
idx = lista.index(5)
lista[idx] = 7
lista

### Update Tuple Value using little Manipulation

In [None]:
setA = {1, 2}
setB = {1, 3}
tempList = []
tempList.append((setA, 1))
tempList.append((setB, 2))
tempList

In [None]:
for idx, value in enumerate(tempList):
    idx_tmp = tempList.index((value))
    ch = list(tempList[idx_tmp])
    print('Loop ke-', idx, ch)

In [None]:
ch[1]+=1

In [None]:
ch

In [None]:
tempList[idx_tmp] = tuple(ch)

In [None]:
tempList

In [None]:
tuple_df = pd.DataFrame(tempList, columns=['items', 'sup.'])

In [None]:
tuple_df

In [None]:
#Sample Test
s = ({1,0},0,0,0),(2,3,0,0,),(4,5,6,0,),(7,8,9,10,)
print(list(s))
try_df = pd.DataFrame(list(s), columns=['a', 'b', 'c', 'd'])
print (try_df)

In [None]:
%pastebin 1-3

In [None]:
try_df['a'][0]

In [None]:
type(try_df['a'][0])

In [135]:
input_1 = [[1,1,1],[0,0,0]]
input_1

[[1, 1, 1], [0, 0, 0]]

In [136]:
new_list = [ input_1[i][:2] for i in range(len(input_1)) ]
new_list

[[1, 1], [0, 0]]

In [200]:
freq_itemset2_reset['itemset']

0    {1, 2}
1    {1, 3}
2    {2, 3}
Name: itemset, dtype: object

In [327]:
#Check pruning
a = {3,4}
res = a == freq_itemset2_reset['itemset']
print(any(res))
if any(res) == False:
    print(res)

False
0    False
1    False
2    False
Name: itemset, dtype: bool


In [299]:
a = [{3,4,5}]
a.remove({3,4,5})

In [300]:
a

[]

In [464]:
{1}.union({2})

{1, 2}