In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from itertools import combinations
import copy

In [2]:
def n_star_cmp(n):
    """Return the hash id paired with schema id ``n``, computed as ``n + 1``."""
    hash_id = n + 1
    return hash_id

In [3]:
def list_is_subset(parent_list, child_list):
    """Return True when every element of ``child_list`` also occurs in ``parent_list``.

    Both arguments are converted to sets, so duplicates and ordering are ignored.
    """
    child_items = set(child_list)
    parent_items = set(parent_list)
    return child_items <= parent_items

In [4]:
## summary: 3-D matrix (locations, pollutants, quartiles)
## samples: 2-D matrix (locations, pollutants)
## pollutant_names: 1-D array
def preprocessing(samples, summary, pollutant_names):
    """Turn raw readings into level codes such as ``"NO21"`` / ``"NO22"`` / ``"NO23"``.

    Parameters
    ----------
    samples : 2-D sequence indexed as ``samples[i][j]``.
    summary : 3-D sequence indexed as ``summary[i][j][k]``; positions 0, 1 and 3
        of the last axis are read as the minimum, first quantile and third
        quantile for reading ``(i, j)``.
    pollutant_names : 1-D sequence of strings, one name per column ``j``.

    Returns
    -------
    list[list[str]]
        Same shape as ``samples``. Each entry is the pollutant name followed by
        ``"1"`` (minimum <= value < first quantile), ``"2"`` (first quantile <=
        value < third quantile) or ``"3"`` (everything else, which includes
        values below the minimum).
    """
    database_codes = []

    for i, segment in enumerate(samples):
        segment_codes = []
        for j, attribute_value in enumerate(segment):
            minimum = summary[i][j][0]
            first_quantile = summary[i][j][1]
            third_quantile = summary[i][j][3]

            if minimum <= attribute_value < first_quantile:
                level = "1"
            elif first_quantile <= attribute_value < third_quantile:
                level = "2"
            else:
                level = "3"

            segment_codes.append(pollutant_names[j] + level)
        # Rows are built and appended whole; index-assigning into an empty
        # list (as the previous version did) raises IndexError.
        database_codes.append(segment_codes)

    return database_codes

In [5]:
## locations: 1-D array containing names of locations
## pollutants: 1-D array containing names of pollutants
## times: 1-D array containing names of time measures (January, February etc.)
def dict_generator(locations, pollutants, times):
    """Map every location, time and pollutant name to its position in its list.

    The mappings are stored in the module-level dictionaries ``location_dict``,
    ``time_dict`` and ``pollutant_dict``. Dictionaries that already exist are
    updated in place; missing ones are created, so the function also works on a
    fresh kernel where nothing has defined them yet.

    Parameters
    ----------
    locations, pollutants, times : sequences of hashable names.

    Returns
    -------
    tuple[dict, dict, dict]
        ``(location_dict, time_dict, pollutant_dict)``, the same objects that
        are stored at module level.
    """
    module_scope = globals()
    location_index = module_scope.setdefault('location_dict', {})
    time_index = module_scope.setdefault('time_dict', {})
    pollutant_index = module_scope.setdefault('pollutant_dict', {})

    # A repeated name keeps the position of its last occurrence.
    location_index.update((name, position) for position, name in enumerate(locations))
    time_index.update((name, position) for position, name in enumerate(times))
    pollutant_index.update((name, position) for position, name in enumerate(pollutants))

    return location_index, time_index, pollutant_index

In [6]:
def get_hash_id_from_cms(cms, location, time, no_of_stars):
    """Look up the hash id of the most specific schema row allowed for a cell.

    Patterns are tried in this order, and the first one that has a row in
    ``cms`` and needs no more stars than ``no_of_stars`` wins:

    1. ``(location, time)``  - needs ``no_of_stars >= 0``
    2. ``(location, '*')``   - needs ``no_of_stars >= 1``
    3. ``('*', time)``       - needs ``no_of_stars >= 1``
    4. ``('*', '*')``        - needs ``no_of_stars >= 2``

    ``cms`` is a DataFrame with ``location``, ``time`` and ``hash_id`` columns.
    Returns the ``hash_id`` of the first matching row, or ``None`` when no
    pattern qualifies.
    """
    # (location pattern, time pattern, stars the pattern uses), most specific first
    candidate_patterns = (
        (location, time, 0),
        (location, '*', 1),
        ('*', time, 1),
        ('*', '*', 2),
    )

    for loc_pattern, time_pattern, stars_needed in candidate_patterns:
        matches = cms.loc[(cms['location'] == loc_pattern) & (cms['time'] == time_pattern)]
        if matches.empty or no_of_stars < stars_needed:
            continue
        first_label = matches.index.values[0]
        return matches['hash_id'][first_label]

    return None

In [7]:
# Step 0
def get_transactions_from_dataset(records=None):
    """Build the transactions table used by the rest of the pipeline.

    Parameters
    ----------
    records : iterable of ``[transaction, location, time]`` rows, optional
        ``transaction`` is a list of item names. When omitted, a small
        built-in sample data set is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``transaction``, ``location``, ``time``; one row per record,
        indexed 0..n-1 in the order given.
    """
    if records is None:
        records = [
            [['I2', 'I3'], 'S1', 'T1'],
            [['I1', 'I3'], 'S1', 'T2'],
            [['I1', 'I3', 'I4'], 'S1', 'T2'],
            [['I1', 'I2'], 'S2', 'T2'],
            [['I2', 'I4'], 'S2', 'T2'],
            [['I2'], 'S1', 'T1'],
        ]

    # Build the frame in one call rather than appending row by row.
    return pd.DataFrame(list(records), columns=['transaction', 'location', 'time'])

In [8]:
# Step 1
def calendar_map_schema_generator(transactions):
    """Enumerate every (location, time) pattern and give each an id and a hash id.

    Parameters
    ----------
    transactions : pandas.DataFrame with ``location`` and ``time`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``location``, ``time``, ``sid``, ``hash_id``. Rows appear in
        this order, with ``sid`` counting up from 0 and
        ``hash_id = n_star_cmp(sid)``:

        1. every (location, time) pair of the distinct locations and times;
        2. ``(location, '*')`` for each location seen at more than one time;
        3. ``('*', time)`` for each time seen at more than one location;
        4. a single ``('*', '*')`` row.
    """
    locations = transactions['location'].unique()
    times = transactions['time'].unique()

    # Star keys are kept as lists in first-appearance order. Iterating a set
    # of strings follows hash order, which changes between interpreter runs,
    # so the sid/hash_id given to star rows would not be reproducible.
    location_star_keys = [
        location for location in locations
        if len(transactions.loc[transactions['location'] == location, 'time'].unique()) > 1
    ]
    time_star_keys = [
        time for time in times
        if len(transactions.loc[transactions['time'] == time, 'location'].unique()) > 1
    ]

    patterns = [(location, time) for location in locations for time in times]
    patterns.extend((location, '*') for location in location_star_keys)
    patterns.extend(('*', time) for time in time_star_keys)
    patterns.append(('*', '*'))

    records = [
        [location, time, sid, n_star_cmp(sid)]
        for sid, (location, time) in enumerate(patterns)
    ]
    return pd.DataFrame(records, columns=['location', 'time', 'sid', 'hash_id'])

In [9]:
# Step 2
def get_freq_2_itemset(transactions, locations, times, calendar_map_schema):
    """Count 2-item combinations inside every (location, time) partition.

    Parameters
    ----------
    transactions : DataFrame with ``transaction`` (list of items), ``location``
        and ``time`` columns.
    locations, times : iterables of the location / time values to scan.
    calendar_map_schema : DataFrame with ``location``, ``time`` and ``hash_id``
        columns; it must contain a row for every non-empty partition.

    Returns
    -------
    pandas.DataFrame
        One row per partition that has at least one transaction of two or more
        items, with columns:

        * ``2-itemset`` - list of sorted ``[a, b]`` pairs drawn from the items
          of those transactions, in lexicographic order;
        * ``count``     - for each pair, how many of those transactions contain
          both items (may be 0);
        * ``location``, ``time``, ``hash_id`` - the partition and its schema hash.
    """
    columns = ['2-itemset', 'count', 'location', 'time', 'hash_id']
    records = []

    for location in locations:
        for time in times:
            in_partition = (transactions['location'] == location) & (transactions['time'] == time)
            # Only transactions with at least two items can contribute a pair.
            baskets = [
                set(items)
                for items in transactions.loc[in_partition, 'transaction']
                if len(items) >= 2
            ]
            if not baskets:
                continue

            # Sorting the distinct items makes pair order reproducible; raw
            # set iteration order varies between interpreter runs.
            distinct_items = sorted(set().union(*baskets))
            pairs = [list(pair) for pair in combinations(distinct_items, 2)]
            counts = [
                sum(1 for basket in baskets if basket.issuperset(pair))
                for pair in pairs
            ]

            schema_match = calendar_map_schema.loc[
                (calendar_map_schema['location'] == location)
                & (calendar_map_schema['time'] == time),
                'hash_id',
            ]
            records.append([pairs, counts, location, time, schema_match.iloc[0]])

    return pd.DataFrame(records, columns=columns)

In [10]:
# Step 3
def get_1_star_cmp(freq_2_itemset, calendar_map_schema):
    """Attach the matching 1-star schema hashes to every 2-itemset.

    Parameters
    ----------
    freq_2_itemset : DataFrame with ``2-itemset``, ``count``, ``location`` and
        ``time`` columns; ``2-itemset`` and ``count`` hold parallel lists.
    calendar_map_schema : DataFrame with ``location``, ``time``, ``hash_id``.

    Returns
    -------
    pandas.DataFrame
        One row per 2-itemset, with columns:

        * ``partition``   - index label of the source row in ``freq_2_itemset``;
        * ``2-itemset``   - the pair;
        * ``supp``        - the pair's count within its partition;
        * ``hash_1_star`` - hash ids of the ``(location, '*')`` and
          ``('*', time)`` schema rows that exist, in that order; empty when
          ``supp`` is 0;
        * ``count``       - a ``1`` per entry of ``hash_1_star``, or ``[0]``
          when ``supp`` is 0.
    """
    columns = ['partition', '2-itemset', 'supp', 'hash_1_star', 'count']
    schema = calendar_map_schema
    records = []

    for index, row in freq_2_itemset.iterrows():
        star_patterns = ((row['location'], '*'), ('*', row['time']))

        partition_hashes = []
        for loc_pattern, time_pattern in star_patterns:
            matches = schema.loc[
                (schema['location'] == loc_pattern) & (schema['time'] == time_pattern),
                'hash_id',
            ]
            if not matches.empty:
                partition_hashes.append(matches.iloc[0])

        for itemset, supp in zip(row['2-itemset'], row['count']):
            if supp == 0:
                hash_1_star, count = [], [0]
            else:
                # Each row gets its own list objects, so editing one cell
                # later cannot change the other rows of the partition.
                hash_1_star = list(partition_hashes)
                count = [1] * len(partition_hashes)
            records.append([index, itemset, supp, hash_1_star, count])

    return pd.DataFrame(records, columns=columns)

In [11]:
# Step 4
def get_2_star_cmp(n1_star_cmp, cms):
    """Replace each row's 1-star hashes with the ``('*', '*')`` schema hash.

    Parameters
    ----------
    n1_star_cmp : DataFrame with ``partition``, ``2-itemset``, ``supp``,
        ``hash_1_star`` and ``count`` columns.
    cms : DataFrame with ``location``, ``time`` and ``hash_id`` columns; it
        must contain a ``('*', '*')`` row unless ``n1_star_cmp`` is empty.

    Returns
    -------
    pandas.DataFrame
        Columns ``partition``, ``2-itemset``, ``supp``, ``hash_2_star``,
        ``count``. ``hash_2_star`` repeats the ``('*', '*')`` hash id once per
        entry of the row's ``hash_1_star``; the other cells are copies of the
        input row's values.
    """
    columns = ['partition', '2-itemset', 'supp', 'hash_2_star', 'count']
    if n1_star_cmp.empty:
        return pd.DataFrame(columns=columns)

    # The ('*', '*') hash is the same for every row, so look it up once.
    two_star_matches = cms.loc[(cms['location'] == '*') & (cms['time'] == '*'), 'hash_id']
    two_star_hash = two_star_matches.iloc[0]

    records = [
        [
            row['partition'],
            copy.deepcopy(row['2-itemset']),
            row['supp'],
            [two_star_hash] * len(row['hash_1_star']),
            copy.deepcopy(row['count']),
        ]
        for _, row in n1_star_cmp.iterrows()
    ]
    return pd.DataFrame(records, columns=columns)

In [12]:
# Step 5
def get_freq_3_itemset(freq_2_itemset):
    """Generate candidate 3-itemsets from the items of each partition's 2-itemsets.

    Parameters
    ----------
    freq_2_itemset : DataFrame with ``2-itemset`` (list of pairs), ``location``,
        ``time`` and ``hash_id`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``partition``, ``3-itemset``, ``location``, ``time``,
        ``hash_id``. For every source row there is one output row per
        3-combination of the distinct items found in its pairs. ``partition``
        is the source row's index label; ``3-itemset`` is a sorted list;
        combinations appear in lexicographic order.
    """
    columns = ['partition', '3-itemset', 'location', 'time', 'hash_id']
    records = []

    for index, row in freq_2_itemset.iterrows():
        # Sort before combining: iterating the raw set would make the row
        # order depend on string hashing, which differs between runs.
        distinct_items = sorted({item for pair in row['2-itemset'] for item in pair})
        for triple in combinations(distinct_items, 3):
            records.append([index, list(triple), row['location'], row['time'], row['hash_id']])

    return pd.DataFrame(records, columns=columns)

In [13]:
# Step 6
def get_freq_3_itemset_1_star(freq_3_itemset, n1_star_cmp):
    """Give each 3-itemset the 1-star data of its partition's lowest-count 2-itemset.

    For every row of ``freq_3_itemset`` the rows of ``n1_star_cmp`` with the
    same ``partition`` are scanned, and the first one whose smallest ``count``
    entry is minimal is chosen. Its ``supp``, ``hash_1_star`` and ``count`` are
    copied onto the 3-itemset.

    Parameters
    ----------
    freq_3_itemset : DataFrame with ``partition`` and ``3-itemset`` columns.
    n1_star_cmp : DataFrame with ``partition``, ``supp``, ``hash_1_star`` and
        ``count`` (list) columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``partition``, ``3-itemset``, ``supp``, ``hash_1_star``, ``count``.

    Raises
    ------
    ValueError
        If a 3-itemset's partition has no rows in ``n1_star_cmp``.
    """
    columns = ['partition', '3-itemset', 'supp', 'hash_1_star', 'count']
    chosen_by_partition = {}  # the choice depends only on the partition
    records = []

    for _, row in freq_3_itemset.iterrows():
        partition = row['partition']
        if partition not in chosen_by_partition:
            candidates = n1_star_cmp.loc[n1_star_cmp['partition'] == partition]
            if candidates.empty:
                raise ValueError(
                    'no 1-star rows found for partition {!r}'.format(partition)
                )
            # min() keeps the first of several equally small candidates. An
            # empty count list (no 1-star pattern applies) is ranked as 0
            # instead of raising.
            _, chosen_by_partition[partition] = min(
                candidates.iterrows(),
                key=lambda labelled: min(labelled[1]['count'], default=0),
            )
        chosen = chosen_by_partition[partition]
        records.append([
            partition,
            row['3-itemset'],
            chosen['supp'],
            chosen['hash_1_star'],
            chosen['count'],
        ])

    return pd.DataFrame(records, columns=columns)

In [14]:
# Step 7
def get_freq_3_itemset_2_star(freq_3_itemset_1_star, n2_star_cmp):
    """Give each 3-itemset the 2-star data of its partition's lowest-count 2-itemset.

    For every row of ``freq_3_itemset_1_star`` the rows of ``n2_star_cmp`` with
    the same ``partition`` are scanned, and the first one whose smallest
    ``count`` entry is minimal is chosen. Its ``hash_2_star`` and ``supp`` are
    used; ``partition``, ``3-itemset`` and ``count`` are carried over from the
    input row.

    Parameters
    ----------
    freq_3_itemset_1_star : DataFrame with ``partition``, ``3-itemset`` and
        ``count`` columns.
    n2_star_cmp : DataFrame with ``partition``, ``supp``, ``hash_2_star`` and
        ``count`` (list) columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``partition``, ``3-itemset``, ``supp``, ``hash_2_star``, ``count``.

    Raises
    ------
    ValueError
        If a 3-itemset's partition has no rows in ``n2_star_cmp``.
    """
    columns = ['partition', '3-itemset', 'supp', 'hash_2_star', 'count']
    chosen_by_partition = {}  # the choice depends only on the partition
    records = []

    for _, row in freq_3_itemset_1_star.iterrows():
        partition = row['partition']
        if partition not in chosen_by_partition:
            candidates = n2_star_cmp.loc[n2_star_cmp['partition'] == partition]
            if candidates.empty:
                raise ValueError(
                    'no 2-star rows found for partition {!r}'.format(partition)
                )
            # min() keeps the first of several equally small candidates. An
            # empty count list is ranked as 0 instead of raising.
            _, chosen_by_partition[partition] = min(
                candidates.iterrows(),
                key=lambda labelled: min(labelled[1]['count'], default=0),
            )
        chosen = chosen_by_partition[partition]
        records.append([
            partition,
            copy.deepcopy(row['3-itemset']),
            chosen['supp'],
            chosen['hash_2_star'],
            copy.deepcopy(row['count']),
        ])

    return pd.DataFrame(records, columns=columns)

In [15]:
def main():
    """Run the whole pipeline on the built-in sample data.

    Steps: load transactions, build the calendar map schema, count 2-itemsets
    per partition, derive the 1-star and 2-star tables, generate 3-itemsets,
    then attach the 1-star and 2-star data to them.

    Returns
    -------
    pandas.DataFrame
        The table produced by ``get_freq_3_itemset_2_star``.
    """
    transactions = get_transactions_from_dataset()
    calendar_map_schema = calendar_map_schema_generator(transactions)

    # Take the partitions from the data itself so they always agree with the
    # schema, which is built from the same transactions.
    locations = transactions['location'].unique()
    times = transactions['time'].unique()

    #replace calendarmapschema with just hashes
    freq_2_itemset = get_freq_2_itemset(transactions, locations, times, calendar_map_schema)

    n1_star_cmp = get_1_star_cmp(freq_2_itemset, calendar_map_schema)
    n2_star_cmp = get_2_star_cmp(n1_star_cmp, calendar_map_schema)

    freq_3_itemset = get_freq_3_itemset(freq_2_itemset)
    freq_3_itemset_1_star = get_freq_3_itemset_1_star(freq_3_itemset, n1_star_cmp)
    freq_3_itemset_2_star = get_freq_3_itemset_2_star(freq_3_itemset_1_star, n2_star_cmp)

    return freq_3_itemset_2_star

# Last expression of the cell, so the notebook displays the resulting table.
main()