In [None]:
# MARKET BASKET ANALYSIS FOR LOOKING AT ASSOCIATION RULES IN PYTHON

## BASED ON CODE FROM MOFFITT (2017) - Practical Business Python


import pandas as pd

# THE FOLLOWING PACKAGE (mlxtend) IS BEST INSTALLED FROM THE CONDA PROMPT BEFORE IMPORTING IT
# conda install -c conda-forge mlxtend

from mlxtend.frequent_patterns import apriori            # EASY ASSOCIATION RULES PACKAGE FROM RASBT (SEBASTIAN RASCHKA)
from mlxtend.frequent_patterns import association_rules

import seaborn as sns  # PROVIDES TRELLIS AND SMALL MULTIPLE PLOTTING
import matplotlib.pyplot as plt  # 2D plotting


In [None]:
# OBTAIN: download the Online Retail workbook from the UCI machine learning archive
df = pd.read_excel(
    io='http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
)

# Preview the first rows (last expression of the cell, so it is displayed)
df.head()

In [None]:
# SCRUB: tidy the raw frame in a single chain
#   1. strip surrounding whitespace from the item descriptions
#   2. discard rows that have no invoice number
#   3. store invoice numbers as strings
#   4. discard invoices whose number contains the letter 'C'
df = (
    df.assign(Description=df['Description'].str.strip())
      .dropna(axis=0, subset=['InvoiceNo'])
      .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
      .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)

In [None]:
# EXPLORE data with some visuals

# BARPLOT SHOWING NUMBER OF ROWS PER COUNTRY
# (countplot tallies rows of df, so each bar is a count of rows, not of distinct invoices)
# An explicit, taller Axes keeps every country label legible; ordering the bars
# by frequency makes the ranking readable at a glance.
fig, ax = plt.subplots(figsize=(8, 10))
sns.countplot(y="Country", data=df, palette="Blues_d",
              order=df["Country"].value_counts().index, ax=ax)
ax.set(title="Rows per country", xlabel="Number of rows", ylabel="Country")
plt.show()

In [None]:
## BARPLOT MINUS UNITED KINGDOM
# Leaving out the United Kingdom lets the remaining countries be compared on their own scale.
dfukless = df[df.Country != "United Kingdom"]

# Explicit Axes with a title and axis labels so the figure stands alone;
# bars are ordered by frequency (countplot tallies rows of the frame).
fig, ax = plt.subplots(figsize=(8, 10))
sns.countplot(y="Country", data=dfukless, palette="Blues_d",
              order=dfukless["Country"].value_counts().index, ax=ax)
ax.set(title="Rows per country (United Kingdom excluded)",
       xlabel="Number of rows", ylabel="Country")
plt.show()

In [None]:
## BREAKING OUT FOR JUST FRANCE

# Invoice-by-item table for the French rows: each cell holds the summed
# quantity of that item on that invoice, with 0 where the pair never occurs.
basket = (
    df.loc[df['Country'].eq("France")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

In [None]:
def encode_units(x):
    """Convert an item quantity into a 0/1 purchase flag.

    Parameters
    ----------
    x : number
        Quantity of one item on one invoice.

    Returns
    -------
    int
        1 when ``x`` is positive, otherwise 0.  Every input maps to 0 or 1:
        a fractional quantity between 0 and 1 counts as purchased and NaN
        counts as not purchased, rather than failing both comparisons and
        implicitly returning ``None``.
    """
    # A single comparison covers every case; NaN > 0 is False, so NaN -> 0.
    return 1 if x > 0 else 0

# Binarise the basket: True where the item appears on the invoice (quantity > 0).
# The vectorised comparison replaces DataFrame.applymap (deprecated since
# pandas 2.1) and produces the boolean frame that mlxtend's apriori prefers.
basket_sets = basket > 0

# Exclude the POSTAGE column from the analysis.  errors='ignore' keeps the cell
# from raising KeyError when the column is absent, and dropping without
# inplace=True makes the cell safe to re-run.
basket_sets = basket_sets.drop(columns='POSTAGE', errors='ignore')

In [None]:
# Mine the itemsets whose support in basket_sets is at least 0.07
frequent_itemsets = apriori(
    basket_sets,
    min_support=0.07,
    use_colnames=True,  # report itemsets by column name instead of column index
)

In [None]:
# Turn the frequent itemsets into association rules, keeping rules with lift >= 1
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

# Show the first few rules
rules.head()

In [None]:
# Keep only the strong rules: lift of at least 8 AND confidence of at least 0.9
rules.loc[rules['lift'].ge(8) & rules['confidence'].ge(0.9)]

In [None]:
## NOW LOOK AT GERMANY

# Invoice-by-item table of summed quantities for the German rows (0 where an
# item is absent from an invoice).  unstack() already leaves InvoiceNo as the
# index, so no reset_index()/set_index() round-trip is needed.
basket2 = (df[df['Country'] == "Germany"]
           .groupby(['InvoiceNo', 'Description'])['Quantity']
           .sum().unstack().fillna(0))

# Binarise with a vectorised comparison (DataFrame.applymap is deprecated since
# pandas 2.1); the boolean result is the input type mlxtend's apriori prefers.
# errors='ignore' avoids a KeyError when there is no POSTAGE column.
basket_sets2 = (basket2 > 0).drop(columns='POSTAGE', errors='ignore')

frequent_itemsets2 = apriori(basket_sets2, min_support=0.05, use_colnames=True)
rules4 = association_rules(frequent_itemsets2, metric="lift", min_threshold=1)

# Rules with lift >= 4 and confidence >= 0.5
rules4[(rules4['lift'] >= 4) &
       (rules4['confidence'] >= 0.5)]

In [None]:
## NOW LOOK AT PORTUGAL

# Invoice-by-item table of summed quantities for the Portuguese rows (0 where
# an item is absent from an invoice).  unstack() already leaves InvoiceNo as
# the index, so no reset_index()/set_index() round-trip is needed.
basket2 = (df[df['Country'] == "Portugal"]
           .groupby(['InvoiceNo', 'Description'])['Quantity']
           .sum().unstack().fillna(0))

# Binarise with a vectorised comparison (DataFrame.applymap is deprecated since
# pandas 2.1); the boolean result is the input type mlxtend's apriori prefers.
# errors='ignore' avoids a KeyError when there is no POSTAGE column.
basket_sets2 = (basket2 > 0).drop(columns='POSTAGE', errors='ignore')

frequent_itemsets2 = apriori(basket_sets2, min_support=0.05, use_colnames=True)
rules3 = association_rules(frequent_itemsets2, metric="lift", min_threshold=1)

# Rules with lift >= 8 and confidence >= 0.8
rules3[(rules3['lift'] >= 8) &
       (rules3['confidence'] >= 0.8)]


In [None]:
## Playing with the DATA

## NOW LOOK AT GERMANY
# NOTE: the settings below are the same as in the earlier Germany cell; change
# min_support or the lift / confidence cut-offs here to experiment.

# Invoice-by-item table of summed quantities for the German rows (0 where an
# item is absent from an invoice).  unstack() already leaves InvoiceNo as the
# index, so no reset_index()/set_index() round-trip is needed.
basket2 = (df[df['Country'] == "Germany"]
           .groupby(['InvoiceNo', 'Description'])['Quantity']
           .sum().unstack().fillna(0))

# Binarise with a vectorised comparison (DataFrame.applymap is deprecated since
# pandas 2.1); the boolean result is the input type mlxtend's apriori prefers.
# errors='ignore' avoids a KeyError when there is no POSTAGE column.
basket_sets2 = (basket2 > 0).drop(columns='POSTAGE', errors='ignore')

frequent_itemsets2 = apriori(basket_sets2, min_support=0.05, use_colnames=True)
rules4 = association_rules(frequent_itemsets2, metric="lift", min_threshold=1)

# Rules with lift >= 4 and confidence >= 0.5
rules4[(rules4['lift'] >= 4) &
       (rules4['confidence'] >= 0.5)]

