In [84]:
import pandas as pd
search_engine_map = pd.read_csv("./wum_dataset_hw/search_engine_map.csv")
clicks = pd.read_csv("./wum_dataset_hw/clicks.csv")
visitors = pd.read_csv("./wum_dataset_hw/visitors.csv")

In [39]:
'''
Referrer - anonymized referrer
Type - type of the referrer domain
'''
search_engine_map.head(3)

Unnamed: 0,Referrer,Type
0,URI_0,
1,URI_1,Other
2,URI_2,Other


In [83]:
'''
LocalID - local identifier of an event
PageID - identifier of a visited page
VisitID - session identifier
PageName - page label
CatName, CatID - page type (Navigation), general information
ExtCat,ExtCatID - page type (Content), more specific information
TopicName, TopicID - topic
TimeOnPage - time spent on the page. Last page of the session = 30s.
PageScore - weight of the page using the following heuristic: (ln(o)+1)*t
SequenceNumber - page order within a session
'''
clicks.head(3)

Unnamed: 0,LocalID,PageID,VisitID,PageName,CatName,CatID,ExtCatName,ExtCatID,TopicName,TopicID,TimeOnPage,PageScore,SequenceNumber
0,648460,3044,1185,TravelAgency,Info,1,homepage,1,In general,1000,30,30,1
1,648461,3045,1185,tours with tents,Search,2,Catalog,5,Tourism,1,30,50,2
2,648462,3046,1185,hiking,Search,2,Catalog,5,Hiking,3,30,62,3


In [9]:
'''
VisitID - session identifier
Referrer - anonymized referrer
Day - day of the visit
Hour - hour of the visit
Length_seconds - visit length in seconds
Length_pagecount - visit length as number of visited pages
'''
visitors.head(1)

Unnamed: 0,VisitID,Referrer,Day,Hour,Length_seconds,Length_pagecount
0,1185,URI_9,Sunday,23,0,6
1,1186,URI_38,Sunday,23,300,2


In [85]:
# filtering
mean_lenght_visit = visitors["Length_seconds"].mean()
mean_lenght_visit = mean_lenght_visit / 2 # to have more data

filtered_out = 0

for i in range(0, clicks.__len__()):
    row_visitors = visitors.loc[visitors["VisitID"] == clicks.get_value(i,"VisitID")]
    if len(row_visitors) >= 1:
        if visitors.loc[visitors["VisitID"] == clicks.get_value(i,"VisitID")]["Length_seconds"].mean() < 10:
            clicks.drop(i, inplace=True)
            filtered_out = filtered_out + 1
    else:
        clicks.drop(i, inplace=True)
        filtered_out = filtered_out + 1

In [86]:
filtered_out

11410

In [147]:
len(clicks)

27041

In [100]:
main_corversion = clicks.loc[(clicks["PageName"] == "APPLICATION") 
                            | (clicks["PageName"] == "CATALOG")
                            | (clicks["CatName"] == "APPLICATION")
                            | (clicks["CatName"] == "CATALOG")]

In [101]:
micro_corversion = clicks.loc[(clicks["PageName"] == "DISCOUNT") 
                            | (clicks["PageName"] == "HOWTOJOIN")
                            | (clicks["PageName"] == "INSURANCE")
                            | (clicks["PageName"] == "WHOWEARE")]

In [102]:
len(micro_corversion)

442

In [103]:
len(main_corversion)

497

In [10]:
from collections import Counter
 
def frequentItems(transactions, support):
    counter = Counter()
    for trans in transactions:
        counter.update(frozenset([t]) for t in trans)
    return set(item for item in counter if counter[item]/len(transactions) >= support), counter
 
def generateCandidates(L, k):
    candidates = set()
    for a in L:
        for b in L:
            union = a | b
            if len(union) == k and a != b:
                candidates.add(union)
    return candidates
 
def filterCandidates(transactions, itemsets, support):
    counter = Counter()
    for trans in transactions:
        subsets = [itemset for itemset in itemsets if itemset.issubset(trans)]
        counter.update(subsets)
    return set(item for item in counter if counter[item]/len(transactions) >= support), counter
 
def apriori(transactions, support):
    result = list()
    resultc = Counter()
    candidates, counter = frequentItems(transactions, support)
    result += candidates
    resultc += counter
    k = 2
    while candidates:
        candidates = generateCandidates(candidates, k)
        candidates,counter = filterCandidates(transactions, candidates, support)
        result += candidates
        resultc += counter
        k += 1
    resultc = {item:(resultc[item]/len(transactions)) for item in resultc}
    return result, resultc


In [148]:
class Rule():
    def __init__(self, left, right, conf, conv, supp):
        self.left = left
        self.right = right
        self.confidence = round(conf, 3)
        self.conviction = round(conv, 3)
        self.support = round(supp, 3)
        
    def __str__(self):
        res = "Rule: ("
        res = "(" + ",".join(self.left) + ") ---> (" + ",".join(self.right) + ")"
        res = res + "), confidence: " + str(self.confidence) + ", conviction: "
        res = res + str(self.conviction) + ", support: " + str(self.support)
        return res

In [133]:
def generateRules(frequentItemsets, supports, minConfidence):
    rules = []
    for s in frequentItemsets:
        for subset in frozenset(map(frozenset,list_powerset(list(s)))):
            if not subset:
                continue
            tmpSet = set(s)
            tmpSub = set(subset)
            for it in tmpSub:
                tmpSet.remove(it)
            if not tmpSet:
                continue
            tmpSetF = frozenset(tmpSet)
            confidence = supports[s]/supports[tmpSetF]
            if confidence == 1:
                conviction = INF
            else:
                conviction = (1-supports[subset])/(1-confidence)
            if confidence >= minConfidence:
                rule = Rule(tmpSet, tmpSub, confidence, conviction, supports[s])
                rules.append(rule)
    return rules

In [12]:
def list_powerset(lst):
    result = [[]]
    for x in lst:
        result.extend([subset + [x] for subset in result])
    return result

In [115]:
def getDatasetByVisitId(rowName):
    old_visit_id = int(clicks.head(1)["VisitID"])
    dataset = []
    itemset = []
    for index, row in clicks.iterrows():
        new_visit_id = int(row["VisitID"])
        if new_visit_id == old_visit_id:
            itemset.append(str(row[str(rowName)]))
        else:
            dataset.append(list(set(itemset)))
            itemset = []
            itemset.append(str(row[str(rowName)]))
            old_visit_id = new_visit_id

    return dataset

In [116]:
dataset_category_name = getDatasetByVisitId("CatName")
dataset_page_name = getDatasetByVisitId("PageName")
dataset_ext_category_name = getDatasetByVisitId("ExtCatName")
dataset_topic = getDatasetByVisitId("TopicName")

In [153]:
frequentItemsets, supports = apriori(dataset_category_name, 0.2)
for f in frequentItemsets:
    print("{} - {}".format(f,supports[f]))
print("---------------")
    
for rule in generateRules(frequentItemsets, supports, 0.2):
    print (rule.__str__())

frozenset({'Search'}) - 0.8984908372260151
frozenset({'Trip'}) - 0.7024793388429752
frozenset({'Info'}) - 0.6582824290334172
frozenset({'Info', 'Search'}) - 0.5955803090190442
frozenset({'Trip', 'Search'}) - 0.6446280991735537
frozenset({'Trip', 'Info'}) - 0.43226733740567735
frozenset({'Trip', 'Info', 'Search'}) - 0.41268415379087314
---------------
(Info) ---> (Search)), confidence: 0.905, conviction: 1.066, support: 0.596
(Search) ---> (Info)), confidence: 0.663, conviction: 1.014, support: 0.596
(Trip) ---> (Search)), confidence: 0.918, conviction: 1.233, support: 0.645
(Search) ---> (Trip)), confidence: 0.717, conviction: 1.053, support: 0.645
(Info) ---> (Trip)), confidence: 0.657, conviction: 0.867, support: 0.432
(Trip) ---> (Info)), confidence: 0.615, conviction: 0.888, support: 0.432
(Trip,Search) ---> (Info)), confidence: 0.64, conviction: 0.95, support: 0.413
(Trip) ---> (Info,Search)), confidence: 0.587, conviction: 0.98, support: 0.413
(Trip,Info) ---> (Search)), confiden

In [154]:
frequentItemsets, supports = apriori(dataset_page_name, 0.1)
for f in frequentItemsets:
    print("{} - {}".format(f,supports[f]))
print("---------------")
    
for rule in generateRules(frequentItemsets, supports, 0.2):
    print (rule.__str__())

frozenset({'TravelAgency'}) - 0.6196550485088035
frozenset({'sightseeing tours'}) - 0.10168882500898312
frozenset({'lastminute'}) - 0.19331656485806684
frozenset({'hiking'}) - 0.14678404599353215
frozenset({'tours and holiday comes into hotel'}) - 0.12378727991376212
frozenset({'tours with tents'}) - 0.15001796622349983
frozenset({'Far tours'}) - 0.11013295005389867
frozenset({'tours with tents', 'TravelAgency'}) - 0.1063600431189364
frozenset({'hiking', 'TravelAgency'}) - 0.10366510959396334
frozenset({'lastminute', 'TravelAgency'}) - 0.12881782249371182
---------------
(tours with tents) ---> (TravelAgency)), confidence: 0.709, conviction: 1.307, support: 0.106
(hiking) ---> (TravelAgency)), confidence: 0.706, conviction: 1.295, support: 0.104
(lastminute) ---> (TravelAgency)), confidence: 0.666, conviction: 1.14, support: 0.129
(TravelAgency) ---> (lastminute)), confidence: 0.208, conviction: 1.018, support: 0.129


In [155]:
frequentItemsets, supports = apriori(dataset_ext_category_name, 0.1)
for f in frequentItemsets:
    print("{} - {}".format(f,supports[f]))
print("---------------")

for rule in generateRules(frequentItemsets, supports, 0.2):
    print (rule.__str__())

frozenset({'undetected'}) - 0.7024793388429752
frozenset({'Catalog'}) - 0.826805605461732
frozenset({'homepage'}) - 0.6196550485088035
frozenset({'extension'}) - 0.12414660438375853
frozenset({'Catalog', 'undetected'}) - 0.5896514552641035
frozenset({'homepage', 'undetected'}) - 0.40567732662594325
frozenset({'Catalog', 'homepage'}) - 0.5228171038447719
frozenset({'Catalog', 'homepage', 'undetected'}) - 0.3659719726913403
---------------
(Catalog) ---> (undetected)), confidence: 0.713, conviction: 1.037, support: 0.59
(undetected) ---> (Catalog)), confidence: 0.839, conviction: 1.078, support: 0.59
(homepage) ---> (undetected)), confidence: 0.655, conviction: 0.862, support: 0.406
(undetected) ---> (homepage)), confidence: 0.577, conviction: 0.9, support: 0.406
(homepage) ---> (Catalog)), confidence: 0.844, conviction: 1.108, support: 0.523
(Catalog) ---> (homepage)), confidence: 0.632, conviction: 1.034, support: 0.523
(homepage,undetected) ---> (Catalog)), confidence: 0.902, convicti

In [156]:
frequentItemsets, supports = apriori(dataset_topic, 0.1)
for f in frequentItemsets:
    print("{} - {}".format(f,supports[f]))
print("---------------")
for rule in generateRules(frequentItemsets, supports, 0.2):
    print (rule.__str__())

frozenset({'Tourism'}) - 0.23014732303269853
frozenset({'Tours with accommodation'}) - 0.17750628817822495
frozenset({'Neni'}) - 0.1223499820337765
frozenset({'Hiking'}) - 0.1697808120733022
frozenset({'Lastminute'}) - 0.19331656485806684
frozenset({'exotica'}) - 0.1428314768235717
frozenset({'In general'}) - 0.7326625943226733
frozenset({'In general', 'Tours with accommodation'}) - 0.12576356449874238
frozenset({'exotica', 'In general'}) - 0.10564139417894358
frozenset({'Lastminute', 'In general'}) - 0.14732303269852676
frozenset({'In general', 'Hiking'}) - 0.12558390226374416
frozenset({'In general', 'Tourism'}) - 0.1703197987782968
---------------
(Tours with accommodation) ---> (In general)), confidence: 0.709, conviction: 0.917, support: 0.126
(exotica) ---> (In general)), confidence: 0.74, conviction: 1.027, support: 0.106
(In general) ---> (Lastminute)), confidence: 0.201, conviction: 1.01, support: 0.147
(Lastminute) ---> (In general)), confidence: 0.762, conviction: 1.124, sup