### Task 5

In [1]:
import copy
import numpy as np
import pandas as pd
from collections import Counter

#### Apriori from lecture

In [2]:
def frequentItems(transactions, support):
    """Tally single items and select the frequent ones.

    Each item of each transaction is wrapped in a one-element frozenset and
    counted once per occurrence.

    Returns a tuple ``(frequent, counts)``: ``frequent`` is the set of
    1-itemsets whose count divided by the number of transactions is at least
    ``support``; ``counts`` is the Counter of every 1-itemset seen.
    """
    singleton_counts = Counter()
    for basket in transactions:
        for element in basket:
            singleton_counts[frozenset([element])] += 1

    frequent = set()
    for singleton, occurrences in singleton_counts.items():
        if occurrences / len(transactions) >= support:
            frequent.add(singleton)
    return frequent, singleton_counts

def generateCandidates(L, k):
    """Join itemsets of ``L`` pairwise into candidate itemsets of size ``k``.

    Every ordered pair of distinct itemsets is unioned; the union is kept only
    when it contains exactly ``k`` items. Returns the set of such unions.
    """
    return {
        first | second
        for first in L
        for second in L
        if first != second and len(first | second) == k
    }

def filterCandidates(transactions, itemsets, support):
    """Count candidate itemsets and keep those meeting the support threshold.

    A candidate is counted once for every transaction that contains all of
    its items.

    Returns a tuple ``(frequent, counts)``: ``frequent`` is the set of counted
    candidates whose count divided by the number of transactions is at least
    ``support``; ``counts`` is the Counter of all candidates that occurred in
    at least one transaction.
    """
    hits = Counter()
    for basket in transactions:
        for candidate in itemsets:
            if candidate.issubset(basket):
                hits[candidate] += 1

    surviving = {
        candidate for candidate in hits
        if hits[candidate] / len(transactions) >= support
    }
    return surviving, hits

def apriori(transactions, support):
    """Mine frequent itemsets level by level.

    Starts from the frequent single items, then repeatedly generates
    candidates one item larger and filters them by ``support`` until a level
    yields no frequent itemsets.

    Returns a tuple ``(itemsets, supports)``: ``itemsets`` is a list of all
    frequent itemsets found; ``supports`` maps every counted itemset
    (including counted-but-infrequent ones) to its count divided by the
    number of transactions.
    """
    frequent_itemsets = []
    occurrence_counts = Counter()
    level, level_counts = frequentItems(transactions, support)
    size = 2
    while True:
        # Record the current level before deciding whether to go deeper.
        frequent_itemsets.extend(level)
        occurrence_counts += level_counts
        if not level:
            break
        proposals = generateCandidates(level, size)
        level, level_counts = filterCandidates(transactions, proposals, support)
        size += 1

    relative_supports = {}
    for itemset in occurrence_counts:
        relative_supports[itemset] = occurrence_counts[itemset] / len(transactions)
    return frequent_itemsets, relative_supports

#### Generate possibilities: always all-but-one items -> one item

In [3]:
def generate_posibility(itemset):
    """List every split of ``itemset`` into (remaining items, single item).

    For each item in turn, returns a tuple ``(rest, [item])`` where ``rest``
    is a deep copy of the item list with the first occurrence of that item
    removed. The result has one tuple per item, in iteration order.
    """
    members = list(itemset)
    splits = []
    for singled_out in members:
        remainder = copy.deepcopy(members)
        remainder.remove(singled_out)
        splits.append((remainder, [singled_out]))
    return splits

#### Generate association rules

In [4]:
def generate_rules(itemsets, supports, min_confidence, metric):
    """Build single-consequent association rules and rank them.

    For every itemset with at least two items, each item in turn becomes the
    consequent and the remaining items the antecedent. The rule is scored with
    the chosen ``metric`` and kept when the score is at least
    ``min_confidence``.

    Parameters:
        itemsets: iterable of itemsets (keys of ``supports``).
        supports: mapping from frozenset to relative support; must contain the
            itemset, every antecedent and every single-item consequent.
        min_confidence: minimum score (in the chosen metric) for a rule.
        metric: "confidence", "lift" or "conviction".

    Returns:
        A list of tuples ``(antecedent, consequent, score, support)`` with
        score and support rounded to 3 decimals, sorted by support and then
        score, both descending.

    Raises:
        ValueError: if ``metric`` is not one of the supported names. This is
            checked up front, so it is raised even when no rule is generated.
    """
    if metric not in ("confidence", "lift", "conviction"):
        raise ValueError("Metric must be confidence or lift or conviction.")

    generated_rules = []

    for itemset in itemsets:
        if len(itemset) < 2:
            continue

        items = list(itemset)
        itemset_support = supports[itemset]

        for position, item in enumerate(items):
            left_side = items[:position] + items[position + 1:]
            right_side = [item]
            left_support = supports[frozenset(left_side)]
            right_support = supports[frozenset(right_side)]

            if metric == "confidence":
                score = itemset_support / left_support

            elif metric == "lift":
                score = itemset_support / (left_support * right_support)

            else:
                # conviction = (1 - supp(consequent)) / (1 - confidence).
                # A rule that never fails (confidence == 1) has unbounded
                # conviction, so report +inf rather than 0, which would rank
                # the strongest rules last or filter them out.
                failure_rate = 1 - itemset_support / left_support
                if failure_rate != 0:
                    score = (1 - right_support) / failure_rate
                else:
                    score = float("inf")

            if score >= min_confidence:
                generated_rules.append(
                    (left_side, right_side, round(score, 3), round(itemset_support, 3))
                )
    return sorted(generated_rules, key=lambda rule: (rule[3], rule[2]), reverse=True)

#### Read the files

In [5]:
# Load the three input tables; the order of the paths matches the order of
# the names on the left-hand side.
search_engine_map, visitors, clicks = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/search_engine_map.csv',
        '../data/visitors.csv',
        '../data/clicks.csv',
    )
)

#### General statistics

In [6]:
# Distinct-value counts of the identifying column of each table.
for template, column in (
    ("Total number of visitors: {}", visitors["VisitID"]),
    ("Total number of referrers {}", search_engine_map["Referrer"]),
    ("Total number of clicks {}", clicks["LocalID"]),
    ("Total number of pages {}", clicks["PageName"]),
):
    print(template.format(column.nunique()))

# Summary statistics (describe) for selected numeric columns, each preceded by
# a blank line and a caption.
for caption, frame, columns in (
    ('Some statistic about times on the page and the page score.',
     clicks, ['TimeOnPage', 'PageScore']),
    ('Some statistic about lenght of visit and number of visit pages for visitor.',
     visitors, ['Length_seconds', 'Length_pagecount']),
):
    print()
    print(caption)
    display(frame[columns].describe())

Total number of visitors: 15559
Total number of referrers 140
Total number of clicks 38451
Total number of pages 826

Some statistic about times on the page and the page score.


Unnamed: 0,TimeOnPage,PageScore
count,38451.0,38451.0
mean,72.412421,143.092975
std,114.640528,260.595877
min,30.0,30.0
25%,30.0,30.0
50%,30.0,62.0
75%,60.0,125.0
max,2640.0,5753.0



Some statistic about lenght of visit and number of visit pages for visitor.


Unnamed: 0,Length_seconds,Length_pagecount
count,15559.0,15559.0
mean,128.908028,2.471239
std,328.777507,2.998959
min,0.0,1.0
25%,0.0,1.0
50%,0.0,1.0
75%,120.0,3.0
max,5280.0,50.0


In [7]:
# Join clicks to their visit on VisitID, then to the search-engine map on
# Referrer; merge's default join keeps only rows with a match in every table.
data = (
    clicks
    .merge(visitors, on='VisitID')
    .merge(search_engine_map, on='Referrer')
)
print("Data size:", len(data))

Data size: 38450


#### Remove short visits and visits with low number of pages

In [8]:
# Remove visits that are too short (7 seconds or less).
data = data[data.Length_seconds > 7]
print("Data size after remove short (if add some hight number it is same number for example 30):", len(data))
print("This short visits can be the bad meansure of it.")

# Keep only visits that touched more than one page.
data = data[data['Length_pagecount'] > 1]
print("Kepp only users visit two or more pages. Size after remove:", len(data))

# Drop rows whose page name is one of the placeholder values. A single boolean
# filter replaces three copy-pasted in-place drops, so `data` is rebuilt
# rather than mutated while it is still a filtered slice of the merged frame.
data = data[~data['PageName'].isin(['ww', 'wwww', 'www'])]
print("Remove the page with the definitly bad name. Size after remove:", len(data))

# Independent snapshot taken before later cells modify `data`; the chain
# analysis reads VisitID, SequenceNumber and PageName from it.
data_chain = data.copy()

Data size after remove short (if add some hight number it is same number for example 30): 27041
This short visits can be the bad meansure of it.
Kepp only users visit two or more pages. Size after remove: 27016
Remove the page with the definitly bad name. Size after remove: 27011


In [9]:
# Print column names, non-null counts and dtypes of the filtered frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27011 entries, 6 to 38449
Data columns (total 19 columns):
LocalID             27011 non-null int64
PageID              27011 non-null int64
VisitID             27011 non-null int64
PageName            27011 non-null object
CatName             27011 non-null object
CatID               27011 non-null int64
ExtCatName          27011 non-null object
ExtCatID            27011 non-null int64
TopicName           27011 non-null object
TopicID             27011 non-null int64
TimeOnPage          27011 non-null int64
PageScore           27011 non-null int64
SequenceNumber      27011 non-null int64
Referrer            27011 non-null object
Day                 27011 non-null object
Hour                27011 non-null int64
Length_seconds      27011 non-null int64
Length_pagecount    27011 non-null int64
Type                13742 non-null object
dtypes: int64(12), object(7)
memory usage: 4.1+ MB


#### Drop columns that are not useful

In [10]:
# Remove the identifier columns in one non-mutating call instead of six
# in-place drops; `data` is rebound to the reduced frame.
data = data.drop(
    ['LocalID', 'CatID', 'ExtCatID', 'PageID', 'VisitID', 'TopicID'],
    axis=1,
)

In [11]:
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27011 entries, 6 to 38449
Data columns (total 13 columns):
PageName            27011 non-null object
CatName             27011 non-null object
ExtCatName          27011 non-null object
TopicName           27011 non-null object
TimeOnPage          27011 non-null int64
PageScore           27011 non-null int64
SequenceNumber      27011 non-null int64
Referrer            27011 non-null object
Day                 27011 non-null object
Hour                27011 non-null int64
Length_seconds      27011 non-null int64
Length_pagecount    27011 non-null int64
Type                13742 non-null object
dtypes: int64(6), object(7)
memory usage: 2.9+ MB


None

#### Discretize the data, because association rules can't work with continuous values; convert them to ordinal bins (intervals)

In [12]:
# Replace each numeric column by equal-width interval bins: 24 bins for Hour,
# 20 for every other column.
for column_name, n_bins in (
    ("TimeOnPage", 20),
    ("PageScore", 20),
    ("SequenceNumber", 20),
    ("Hour", 24),
    ("Length_seconds", 20),
    ("Length_pagecount", 20),
):
    data[column_name] = pd.cut(data[column_name], n_bins)

In [13]:
# Number of distinct values per column after binning.
display(data.nunique())

PageName            637
CatName               5
ExtCatName           22
TopicName            27
TimeOnPage           15
PageScore            17
SequenceNumber       20
Referrer            122
Day                   7
Hour                 24
Length_seconds       18
Length_pagecount     18
Type                  5
dtype: int64

In [14]:
# Preview the first rows of the discretized frame.
display(data.head())

Unnamed: 0,PageName,CatName,ExtCatName,TopicName,TimeOnPage,PageScore,SequenceNumber,Referrer,Day,Hour,Length_seconds,Length_pagecount,Type
6,TravelAgency,Info,homepage,In general,"(27.39, 160.5]","(24.277, 316.15]","(0.951, 3.45]",URI_9,Monday,"(0.958, 1.917]","(54.78, 321.0]","(6.8, 9.2]",Catalogue
7,TravelAgency,Info,homepage,In general,"(27.39, 160.5]","(24.277, 316.15]","(0.951, 3.45]",URI_9,Monday,"(0.958, 1.917]","(54.78, 321.0]","(6.8, 9.2]",Catalogue
8,TravelAgency,Info,homepage,In general,"(27.39, 160.5]","(24.277, 316.15]","(5.9, 8.35]",URI_9,Monday,"(0.958, 1.917]","(54.78, 321.0]","(6.8, 9.2]",Catalogue
9,lastminute,Search,Catalog,Lastminute,"(27.39, 160.5]","(24.277, 316.15]","(0.951, 3.45]",URI_9,Monday,"(0.958, 1.917]","(54.78, 321.0]","(6.8, 9.2]",Catalogue
10,Aeolian Islands,Search,Catalog,Lipari,"(27.39, 160.5]","(24.277, 316.15]","(5.9, 8.35]",URI_9,Monday,"(0.958, 1.917]","(54.78, 321.0]","(6.8, 9.2]",Catalogue


In [15]:
def convert_pandas_to_list_of_lists(data_frame):
    """Turn each row of ``data_frame`` into a list of "column=value" strings.

    Values are converted with ``str``, so a missing value becomes an item such
    as "Type=nan". Returns one list per row, in row order, with items in
    column order.
    """
    column_names = list(data_frame)
    transactions = []
    for _, record in data_frame.iterrows():
        transactions.append(
            [name + "=" + str(record[name]) for name in column_names]
        )
    return transactions

In [16]:
def print_topn_rules(rules, n, contain):
    """Print the rules among the first ``n`` whose antecedent mentions ``contain``.

    Only ``rules[:n]`` is inspected. A rule is printed (once) when any item of
    its antecedent (``rule[0]``) contains the substring ``contain``, so fewer
    than ``n`` lines may be printed.
    """
    for rule in rules[:n]:
        if any(contain in item for item in rule[0]):
            print("{} -> {} - SUPPORT: {} - CONFIDENCE: {}".format(rule[0], rule[1], rule[3], rule[2]))

In [17]:
# One transaction per row of `data`, each item a "column=value" string.
data_for_asociation_rules = convert_pandas_to_list_of_lists(data)

In [18]:
# How many of the top-ranked rules the printing cells inspect.
n_print = 10
# Mine frequent itemsets with a minimum relative support of 0.05.
result, resultc = apriori(data_for_asociation_rules, 0.05)

In [19]:
# Rules with confidence >= 0.05, ranked by support then confidence; of the
# first n_print, print those whose antecedent has an item containing '='.
print_topn_rules(generate_rules(result, resultc, 0.05, metric="confidence"), n_print, '=')

['PageScore=(24.277, 316.15]'] -> ['TimeOnPage=(27.39, 160.5]'] - SUPPORT: 0.849 - CONFIDENCE: 0.974
['TimeOnPage=(27.39, 160.5]'] -> ['PageScore=(24.277, 316.15]'] - SUPPORT: 0.849 - CONFIDENCE: 0.969
['SequenceNumber=(0.951, 3.45]'] -> ['PageScore=(24.277, 316.15]'] - SUPPORT: 0.519 - CONFIDENCE: 0.914
['PageScore=(24.277, 316.15]'] -> ['SequenceNumber=(0.951, 3.45]'] - SUPPORT: 0.519 - CONFIDENCE: 0.595
['SequenceNumber=(0.951, 3.45]', 'TimeOnPage=(27.39, 160.5]'] -> ['PageScore=(24.277, 316.15]'] - SUPPORT: 0.496 - CONFIDENCE: 1.0
['PageScore=(24.277, 316.15]', 'SequenceNumber=(0.951, 3.45]'] -> ['TimeOnPage=(27.39, 160.5]'] - SUPPORT: 0.496 - CONFIDENCE: 0.956
['SequenceNumber=(0.951, 3.45]'] -> ['TimeOnPage=(27.39, 160.5]'] - SUPPORT: 0.496 - CONFIDENCE: 0.874
['PageScore=(24.277, 316.15]', 'TimeOnPage=(27.39, 160.5]'] -> ['SequenceNumber=(0.951, 3.45]'] - SUPPORT: 0.496 - CONFIDENCE: 0.584
['TimeOnPage=(27.39, 160.5]'] -> ['SequenceNumber=(0.951, 3.45]'] - SUPPORT: 0.496 - CONFI

In [20]:
# Same ranking as above; of the first n_print rules, print only those whose
# antecedent has an item containing 'TimeOnPage'.
print_topn_rules(generate_rules(result, resultc, 0.05, metric="confidence"), n_print, 'TimeOnPage')

['TimeOnPage=(27.39, 160.5]'] -> ['PageScore=(24.277, 316.15]'] - SUPPORT: 0.849 - CONFIDENCE: 0.969
['SequenceNumber=(0.951, 3.45]', 'TimeOnPage=(27.39, 160.5]'] -> ['PageScore=(24.277, 316.15]'] - SUPPORT: 0.496 - CONFIDENCE: 1.0
['PageScore=(24.277, 316.15]', 'TimeOnPage=(27.39, 160.5]'] -> ['SequenceNumber=(0.951, 3.45]'] - SUPPORT: 0.496 - CONFIDENCE: 0.584
['TimeOnPage=(27.39, 160.5]'] -> ['SequenceNumber=(0.951, 3.45]'] - SUPPORT: 0.496 - CONFIDENCE: 0.566


#### Chains

In [21]:
# Map each VisitID to its pages as [SequenceNumber, PageName] pairs ordered by
# sequence number. Grouping once avoids rescanning the whole frame for every
# visit, and reading the two columns by name avoids positional integer
# indexing into a label-indexed row. sort=False keeps visits in order of first
# appearance.
chained = {
    visit_id: sorted(
        ([sequence_number, page_name]
         for sequence_number, page_name
         in zip(visit_rows['SequenceNumber'], visit_rows['PageName'])),
        key=lambda step: step[0],
    )
    for visit_id, visit_rows in data_chain.groupby('VisitID', sort=False)
}

In [22]:
# Print the five longest and the five shortest page chains. Each chain is
# joined in one step so that every page, including the last one, appears
# exactly once and single-page chains are handled without a leftover loop
# index.
for heading, longest_first in (
    ("The longest chain of pages:", True),
    ("The shortest chain of pages:", False),
):
    print(heading)
    print()
    for k in sorted(chained, key=lambda k: len(chained[k]), reverse=longest_first)[:5]:
        print(" -> ".join(step[1] for step in chained[k]))
        print()

The longest chain of pages:

TravelAgency -> light hiking -> sightseeing tours -> England France Ireland capital Dublin Paris London sightseeing tour -> Swiss Alps Valais -> Ireland &#39;green island&#39; -> England and London residence of English kings -> Baltics small circle pobaltíím a tour -> Lofoten Norway Sweden United Arctic Circle -> Norway Norwegian fjords -> francie bretaň daughter Ocean -> Russia Saint Petersburg jewel of Russia and the Republic pobaltíské -> Lofoten Norway Sweden United Arctic Circle -> kiev Kiev treasures and Ruthenia -> Swiss beauty of Switzerland and the Alpine giants -> Ireland &#39;green island&#39; -> Norway Norwegian fjords -> Poland Great Circle Poland -> Scotland United circuit Scotland -> large briánie Ireland Nature sights History -> Germany Denmark Hanseatic cities of the Baltic and Danish Kingdom -> Russia Moscow Novgorod St. Petersburg -> Ireland &#39;green island&#39; -> Turkey&#39;s west coast -> Serbia Montenegro Bosnia and Herzegovina trea