In [None]:
!pip install mlxtend

In [None]:
import pandas as pd
import numpy as np
import mlxtend
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the retail transactions; a comma is already pandas' default separator.
df = pd.read_csv("retail_dataset.csv")
# Preview the first five rows (the default for head()).
df.head()

In [None]:
# Collect the item catalogue from every column, not only column '0':
# an item that never appears in the first position of a transaction would
# otherwise be missing from the encoding. Empty (NaN) cells are excluded.
# Order is first appearance when scanning the table row by row.
all_values = df.to_numpy().ravel()
items = pd.unique(all_values[pd.notna(all_values)])
items

To make use of the __apriori__ module provided by the `mlxtend` library, we need to convert the dataset into the format it expects. The __apriori__ module requires a dataframe whose values are either 0 and 1 or True and False (this is called One Hot Encoding - [read here](https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/)). The data we have consists entirely of strings (names of items), so we need to One Hot Encode it.

### Custom One Hot Encoding

In [None]:
# Build the one-hot table: one row per transaction, one 0/1 column per item.
# Each row dict is filled by walking `items` in order, so the column layout is
# the same on every run (iterating over sets of strings is not order-stable
# between kernel sessions).
encoded_vals = []
for basket in df.itertuples(index=False, name=None):
    # Set membership makes the per-item lookup O(1); empty cells never
    # match an item name, so they are ignored.
    present = set(basket)
    encoded_vals.append({item: int(item in present) for item in items})
ohe_df = pd.DataFrame(encoded_vals)
ohe_df.head(5)

## Documentation of apriori module
`apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0, low_memory=False)`

* `df` : One-Hot-Encoded DataFrame or DataFrame that has 0 and 1 or True and False as values
* `min_support` : Floating point value between 0 and 1 that indicates the minimum support required for an itemset to be selected. (number of observations with item / total observations)
* `use_colnames` : If True, itemsets are reported using the DataFrame's column names instead of column indices, which makes them more readable.
* `max_len` : Max length of itemset generated. If not set, all possible lengths are evaluated.
* `verbose` : Shows the number of iterations if >= 1 and low_memory is True. If >= 1 and low_memory is False, shows the number of combinations.
* `low_memory` : If True, uses an iterator to search for combinations above min_support. Note that low_memory=True should only be used for large datasets when memory resources are limited, because this implementation is approx. 3–6x slower than the default.

In [None]:
# Mine itemsets with support >= 0.2. The 0/1 table is cast to bool first:
# mlxtend emits a deprecation warning for non-boolean input, and the cast
# does not change which itemsets are found.
freq_items = apriori(ohe_df.astype(bool), min_support=0.2, use_colnames=True, verbose=1)
freq_items.head(7)

In [None]:
# Turn the frequent itemsets into association rules, filtering on the
# confidence metric with a threshold of 0.6.
rules = association_rules(
    freq_items,
    metric="confidence",
    min_threshold=0.6,
)
rules.head()

## Visualizing results
### Support vs Confidence

In [None]:
# One point per rule: support on the x-axis, confidence on the y-axis.
fig, ax = plt.subplots()
ax.scatter(rules['support'], rules['confidence'], alpha=0.5)
ax.set_xlabel('support')
ax.set_ylabel('confidence')
ax.set_title('Support vs Confidence')
plt.show()

### Support vs Lift

In [None]:
# One point per rule: support on the x-axis, lift on the y-axis.
fig, ax = plt.subplots()
ax.scatter(rules['support'], rules['lift'], alpha=0.5)
ax.set(xlabel='support', ylabel='lift', title='Support vs Lift')
plt.show()

### Lift vs Confidence

In [None]:
# Fit a straight line (degree-1 polynomial) to confidence as a function of
# lift and draw it over the individual rules.
lift = rules['lift']
confidence = rules['confidence']
fit = np.polyfit(lift, confidence, 1)
fit_fn = np.poly1d(fit)

plt.plot(lift, confidence, 'yo', label='rules')
plt.plot(lift, fit_fn(lift), label='linear fit')
# Label the figure like the other plots so it can be read on its own.
plt.xlabel('lift')
plt.ylabel('confidence')
plt.title('Lift vs Confidence')
plt.legend()
# show() renders the figure and keeps the Line2D repr out of the cell output.
plt.show()

## Real groceries dataset
The dataset is the result of preprocessing of a real dataset on grocery transactions from the arules R library. It contains actual transactions at a grocery outlet over 30 days. The dataset can be found [here](https://drive.google.com/file/d/1SAM7xAO5ZTuw5CNWmU1C5S3Oikq0rjdO/view?usp=sharing).

In [None]:
# Relative path to the groceries transaction matrix used in the cells below.
data_file = '../data_sets/groceries_matrix.csv'

In [None]:
# Read the groceries matrix; a comma is already pandas' default separator.
# NOTE: this rebinds `df`, which held the retail data until this cell runs.
df = pd.read_csv(data_file)
# Preview the first five rows (the default for head()).
df.head()

In [None]:
# Mine itemsets with support >= 0.05 from the groceries matrix.
freq_items = apriori(df, min_support=0.05, use_colnames=True, verbose=1)
# Number of frequent itemsets found.
print(len(freq_items))
# Use the notebook's rich display rather than print() so the frame renders
# as a table, matching the tail shown as the cell's last expression.
display(freq_items.head(10))
freq_items.tail(10)

In [None]:
# Derive association rules from the groceries itemsets, filtering on the
# confidence metric with a threshold of 0.02.
rules = association_rules(
    freq_items,
    metric="confidence",
    min_threshold=0.02,
)
# Number of rules produced, followed by a preview of the first ten.
print(len(rules))
rules.head(10)

The network graph below shows associations between selected items. Larger circles imply higher support, while red circles imply higher lift:

<figure>
    <img src="images/association-rules-network-graph2.png" title="Associations between selected items. Visualized using the arulesViz R library.">
    <figcaption>Fig.1 - Associations between selected items. Visualized using the arulesViz R library.</figcaption>
</figure>

The figure is from this [post](https://www.kdnuggets.com/2016/04/association-rules-apriori-algorithm-tutorial.html).

View this [post](https://www.kdnuggets.com/2016/04/association-rules-apriori-algorithm-tutorial.html)

https://rpubs.com/aru0511/GroceriesDatasetAssociationAnalysis