# Association Rules


In [1]:
import json
import os

import dateparser
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import one_hot
from sklearn import preprocessing
from sklearn.cluster import KMeans
# Use seaborn's "whitegrid" style and "muted" palette for any plots drawn below.
sns.set(style="whitegrid", palette="muted")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Pymongo Setup

In [3]:
# Connection string for the MongoDB cluster. Set the MONGODB_URI environment
# variable to point the notebook at another deployment (and to keep private
# credentials out of the notebook); when it is unset or empty, the URI that was
# originally hard-coded in this notebook is used.
DEFAULT_CLUSTER_URI = "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
cluster_uri = os.environ.get("MONGODB_URI") or DEFAULT_CLUSTER_URI

client = pymongo.MongoClient(cluster_uri)
# Handle to the `orders` collection in the `coursera-agg` database; every
# aggregation in this notebook runs against it.
orders = client['coursera-agg']['orders']

# Recolhendo dados do MongoDB

Vamos precisar construir um dataframe "one-hot encoded". Ou seja, para cada documento, vamos converter a informação do array de compras, disto:

```python
{
    ...,
    "purchases": [
        {
          "description": "WHITE WIRE EGG HOLDER",
          "quantity": 36,
          "stock_code": "84880",
          "unit_price": 4.95
        },
        {
          "description": "JUMBO  BAG BAROQUE BLACK WHITE",
          "quantity": 100,
          "stock_code": "85099C",
          "unit_price": 1.65
        },
        {
          "description": "JUMBO BAG RED RETROSPOT",
          "quantity": 100,
          "stock_code": "85099B",
          "unit_price": 1.65
        }
      ],
  }
  ```
para isso:
  ```python
{
    "84880": 1,
    "85099C": 1,
    "85099B": 1,
}
```

## Pipeline

In [4]:
# Map every element of the order's `purchases` array to a {"k", "v"} pair:
# the key is the item's stock code and the value is the constant 1.
stock_code_pairs = {
    "$map": {
        "input": "$purchases",
        "in": {"k": "$$this.stock_code", "v": 1},
    }
}

# Collapse those pairs into one object and promote it to be the whole output
# document, so each order becomes {<stock_code>: 1, ...}.
order_projection = {
    "$replaceRoot": {
        "newRoot": {"$arrayToObject": stock_code_pairs},
    }
}

## Construindo o Pipeline

É isso! Vamos utilizar o nosso stage único:

In [5]:
# The aggregation consists of the single $replaceRoot stage defined above.
pipeline = [order_projection]

## Construindo o pd.DataFrame a partir do MongoDB

In [6]:
# Run the aggregation and load its documents into a DataFrame: one row per
# order, one column per stock code. Codes absent from an order are NaN.
cursor = orders.aggregate(pipeline)
df = pd.DataFrame(list(cursor))
df.head()

Unnamed: 0,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# A missing stock code means the item was not part of the order, so encode
# it as 0 to complete the one-hot matrix.
df = df.fillna(0)
df.head()

Unnamed: 0,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Associação

### Apriori

Vamos utilizar o algoritmo `apriori`, disponível no pacote `mlxtend`, para extrair conjuntos de itens frequentes.

In [8]:
# Mine the itemsets whose support is at least 0.02, labelling them with the
# DataFrame's column names (the stock codes) instead of column indices.
assocs = apriori(
    df,
    min_support=0.02,
    use_colnames=True,
)

In [9]:
# Order the frequent itemsets from highest to lowest support, then show all
# rows (the row limit is lifted only inside the context manager).
assocs = assocs.sort_values(by='support', ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    display(assocs)

Unnamed: 0,support,itemsets
200,0.11358,[85123A]
197,0.086912,[85099B]
79,0.08469,[22423]
189,0.078083,[84879]
166,0.077542,[47566]
7,0.067271,[20725]
112,0.060484,[22720]
9,0.059823,[20727]
143,0.058983,[23203]
74,0.057601,[22383]


## Regras de Associação


In [10]:
# Derive association rules from the frequent itemsets, keeping only those
# whose lift is at least 3.
rules = association_rules(
    assocs,
    metric="lift",
    min_threshold=3,
)

In [11]:
# Show every rule, strongest lift first; `rules` itself is left unsorted.
rules_by_lift = rules.sort_values(by='lift', ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', 8):
    display(rules_by_lift)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,...,confidence,lift,leverage,conviction
78,"(22698, 22699)",(22697),0.023004,0.036759,...,0.890339,24.221015,0.019636,8.783841
79,(22697),"(22698, 22699)",0.036759,0.023004,...,0.55719,24.221015,0.019636,2.206352
77,"(22697, 22699)",(22698),0.02859,0.029611,...,0.716387,24.192941,0.019635,3.421518
80,(22698),"(22697, 22699)",0.029611,0.02859,...,0.691684,24.192941,0.019635,3.150691
36,(22697),(22698),0.036759,0.029611,...,0.660131,22.293137,0.023177,2.855182
37,(22698),(22697),0.029611,0.036759,...,0.819473,22.293137,0.023177,5.335706
76,"(22697, 22698)",(22699),0.024266,0.040723,...,0.844059,20.726763,0.019494,6.151553
81,(22699),"(22697, 22698)",0.040723,0.024266,...,0.50295,20.726763,0.019494,1.96305
5,(22699),(22697),0.040723,0.036759,...,0.702065,19.099148,0.027093,3.233057
4,(22697),(22699),0.036759,0.040723,...,0.777778,19.099148,0.027093,4.316746


In [12]:
# Look up the descriptions of the stock codes that appear in the
# highest-lift rules shown above.
stock_codes = ["22697", "22698", "22699"]

# Filter on the purchase's stock code. The same stage works before $unwind
# (keeps orders containing at least one of the codes) and after it (keeps only
# the purchase lines for those codes).
query = {
    "$match": {
        "purchases.stock_code": {"$in": stock_codes}
    }
}

# Keep only the two purchase fields the lookup needs.
project = {
    "$project": {"_id": 0, "purchases.stock_code": 1, "purchases.description": 1}
}

# NOTE: this rebinds `pipeline`, which earlier held the one-hot projection
# pipeline; re-run that earlier cell before rebuilding `df`.
# Filtering happens before $group so that only the purchases of interest are
# unwound and grouped, instead of every purchase in the collection.
pipeline = [
    query,
    project,
    {
        "$unwind": "$purchases"
    },
    query,
    {
        # One output document per distinct (stock_code, description) pair.
        "$group": {
            "_id": {
                "stock_code": "$purchases.stock_code",
                "description": "$purchases.description"
            }
        }
    },
]
display(list(orders.aggregate(pipeline)))

[{'_id': {'description': 'ROSES REGENCY TEACUP AND SAUCER',
   'stock_code': '22699'}},
 {'_id': {'description': 'GREEN REGENCY TEACUP AND SAUCER',
   'stock_code': '22697'}},
 {'_id': {'description': 'PINK REGENCY TEACUP AND SAUCER',
   'stock_code': '22698'}}]