## Apriori algorithm and Association rules

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from pandas.plotting import parallel_coordinates

In [2]:
# Load data
pageviews = pd.read_csv('./../databases/csv-data/cleaned.csv', 
                        parse_dates=['datetime'])[['uid', 'page', 'pageURL', 'datetime']]
# Create a column ranking pageview sequence
pageviews['pageview_sequence'] = pageviews.groupby(['uid'])['datetime'].rank(method='first')

In [3]:
def get_pageviews_list(df):
    pageviews_list = []
    for uid in df.uid.unique():
        temp = df[df['uid'] == uid].sort_values(by='pageview_sequence')
        pageviews_list.append(list(temp.page.values))
    return pageviews_list

In [4]:
pageviews_list = get_pageviews_list(pageviews)

In [5]:
pageviews_list[0:5]

[['Taking someone to small claims court - The Mix',
  'Being taken to small claims court - The Mix'],
 ['Fingering a girl - TheMix.org.uk',
  'Why does she bleed when I finger her? - TheMix.org.uk'],
 ['General chit chat (OP GreenTea) - Page 542 — The Mix Support Community',
  'The Mix Community',
  'Speak to Our Team - The Mix',
  'The Mix Counselling Service - The Mix',
  'Thank you - The Mix'],
 ['The Mix Counselling Service - The Mix',
  'Get Support - The Mix',
  'Launching: The Kindness Diary - The Mix'],
 ['Jobs - The Mix',
  'Job interview tips - The Mix',
  'What are the symptoms of a coke comedown? - The Mix']]

In [6]:
# Instantiate encoder and identify unique items
encoder = TransactionEncoder().fit(pageviews_list)

# One-hot encode transactions
onehot = encoder.transform(pageviews_list)

# Convert one-hot encoded data to DataFrame
onehot = pd.DataFrame(onehot, columns=encoder.columns_)

TypeError: '<' not supported between instances of 'str' and 'float'

In [None]:
frequent_itemsets = apriori(onehot, min_support=0.02, use_colnames=True, max_len=2)
rules = association_rules(frequent_itemsets, metric='support', min_threshold=0.00)

In [None]:
rules.head()

In [None]:
# Replace frozen sets with strings
rules['antecedents'] = rules['antecedents'].apply(lambda a: ','.join(list(a)))
rules['consequents'] = rules['consequents'].apply(lambda a: ','.join(list(a)))
# Transform data to matrix format and generate heatmap
pivot = rules.pivot_table(index='consequents', columns='antecedents', values='lift')

sns.set(font_scale=0.8)
sns.heatmap(pivot, cmap='YlGnBu', annot=True, annot_kws={"fontsize":6})
# Format and display plot
plt.yticks(rotation=0)
save_fig('heatmap')
plt.show()

In [None]:
frequent_itemsets = apriori(onehot, min_support=0.02, use_colnames=True, max_len=2)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=2.5)

In [None]:
# Convert rules to coordinates. 
rules['antecedent'] = rules['antecedents'].apply(lambda antecedent: list(antecedent)[0]) 
rules['consequent'] = rules['consequents'].apply(lambda consequent: list(consequent)[0]) 
rules['rule'] = rules.index 

In [None]:
# Define coordinates and label
coords = rules[['antecedent','consequent','rule']] 
coords.head()

In [None]:
plt.figure(figsize=(4,6))
parallel_coordinates(coords, 'rule', colormap='YlGnBu',)
plt.legend([])
save_fig('parallel')
plt.show()

In [None]:
import pandas as pd
pageviews = pd.read_csv("./cleaned.csv", parse_dates=['datetime'])[['uid', 'page', 'pageURL', 'datetime']]
pageviews.uid.nunique()

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates

PROJECT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_name, tight_layout=False, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_name + "." + fig_extension)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution, bbox_inches="tight")
    print("Figure saved,", fig_name)

In [None]:
# Load data
pageviews = pd.read_csv("./cleaned.csv", parse_dates=['datetime'])[['uid', 'page', 'pageURL', 'datetime']]

In [None]:
pageviews