# Reglas de asociación sobre dominios

In [1]:
#Import modules
import gzip
import json
import gc
import math
import random
import numpy as np
import pandas as pd 

from collections import Counter, defaultdict
from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split
from mlxtend.preprocessing import TransactionEncoder

## Carga de datos

In [7]:
#Auxiliary function
def jl_to_list2(fname):
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:     
            output.append(list(np.unique(np.array([event['event_info'] for event in json.loads(line)['user_history'] if event['event_type'] != 'search']))))
    return output

#Auxiliary function
def jl_to_list(fname):
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [3]:
path = Path("data")

In [4]:
train_dataset = jl_to_list2(path/'train_dataset.jl.gz')

In [None]:
item_data = jl_to_list(path/'item_data.jl.gz')
metadata = {x['item_id']:x for x in item_data} #We create a dictionary to access easily access the item metadata
all_items = list(metadata.keys())

In [52]:
a = [x['domain_id'] for x in item_data]

In [56]:
aa  = list(set(a))

Mapeo dominio a ID

In [57]:
domain2id = {aa[i]: i for i in range(len(aa))}

In [85]:
id2domain = {i: aa[i] for i in range(len(aa))}

In [68]:
domains_dataset = []
for row in train_dataset:
    aux = list(set([domain2id[metadata[x]['domain_id']] for x in row]))
    if len(aux) > 1:
        domains_dataset.append(aux)

In [84]:
len(train_dataset)

413163

## Preprocesamiento dataset para utilizar mlxtend

In [98]:
te = TransactionEncoder()
te_ary = te.fit(domains_dataset).transform(domains_dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)


In [99]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7882,7883,7884,7885,7888,7889,7890,7891,7892,7893
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253370,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
253371,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
253372,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
253373,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [100]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

## FPGrowth

Se observa que la cantidad de soporte mínimo para lograr obtener itemsets frecuentes es muy baja

In [101]:
frequent_itemsets = fpgrowth(df, min_support=0.001)

In [102]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.003363,(692)
1,0.013664,(241)
2,0.001318,(1667)
3,0.037202,(2234)
4,0.036783,(2230)
...,...,...
1703,0.001121,"(5190, 1543)"
1704,0.001066,"(2663, 7279)"
1705,0.001160,"(2599, 2983)"
1706,0.001145,"(2979, 2599)"


## Reglas de asociación

In [131]:
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.01).sort_values(by=['lift'], ascending=False)

Se aprecia que el soporte de las reglas es extremadamente bajo. No es útil seguir este camino ya que otras opciones más simples como utilizar los dominios ya vistos por el usuario sabemos que funcionan bastante bien dado los scores obtenidos con los baselines de dominio más visitado.

Muchas reglas son redundantes/circulares

In [132]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1,(5991),(5586),0.02934,0.050636,0.011051,0.376648,7.438281,0.009565,1.522997
0,(5586),(5991),0.050636,0.02934,0.011051,0.218239,7.438281,0.009565,1.241632
6,(6963),(2599),0.033014,0.052854,0.010664,0.323013,6.111358,0.008919,1.399059
7,(2599),(6963),0.052854,0.033014,0.010664,0.201762,6.111358,0.008919,1.211401
14,(5586),(589),0.050636,0.037904,0.011114,0.219486,5.790521,0.009195,1.232643
15,(589),(5586),0.037904,0.050636,0.011114,0.293211,5.790521,0.009195,1.343207
17,(2290),(3962),0.078804,0.024493,0.010143,0.128712,5.254995,0.008213,1.119615
16,(3962),(2290),0.024493,0.078804,0.010143,0.414115,5.254995,0.008213,1.572316
11,(1541),(3555),0.024884,0.111396,0.013719,0.551308,4.94908,0.010947,1.980434
10,(3555),(1541),0.111396,0.024884,0.013719,0.123153,4.94908,0.010947,1.112071


In [133]:
def show_rules(antecedent, consequent):
    print(id2domain[antecedent], '--->', id2domain[consequent])

Ejemplos de reglas obtenidas

In [134]:
for index, row in rules.iterrows():
    show_rules(list(row['antecedents'])[0], list(row['consequents'])[0])

MLB-CATS_AND_DOGS_FOODS ---> MLM-RADIO_FREQUENCY_MACHINES
MLM-RADIO_FREQUENCY_MACHINES ---> MLB-CATS_AND_DOGS_FOODS
MLB-MOTORCYCLE_HANDLEBAR_YOKES ---> MLM-TELESCOPIC_POST_SHORES
MLM-TELESCOPIC_POST_SHORES ---> MLB-MOTORCYCLE_HANDLEBAR_YOKES
MLM-RADIO_FREQUENCY_MACHINES ---> MLM-PH_METERS
MLM-PH_METERS ---> MLM-RADIO_FREQUENCY_MACHINES
MLB-BUBBLE_LEVELS ---> MLB-PHOTOGRAPHS
MLB-PHOTOGRAPHS ---> MLB-BUBBLE_LEVELS
MLB-SEWING_MACHINE_PRESSER_FEET ---> MLB-SIDE_TABLES
MLB-SIDE_TABLES ---> MLB-SEWING_MACHINE_PRESSER_FEET
MLB-BUBBLE_LEVELS ---> MLB-FISHING_REELS
MLB-FISHING_REELS ---> MLB-BUBBLE_LEVELS
MLB-BUBBLE_LEVELS ---> MLM-PH_METERS
MLM-PH_METERS ---> MLB-BUBBLE_LEVELS
MLB-BUBBLE_LEVELS ---> MLM-RADIO_FREQUENCY_MACHINES
MLM-RADIO_FREQUENCY_MACHINES ---> MLB-BUBBLE_LEVELS
MLB-BUBBLE_LEVELS ---> MLB-SIDE_TABLES
MLB-SIDE_TABLES ---> MLB-BUBBLE_LEVELS
