In [None]:
# https://www.kaggle.com/code/ozlemilgun/market-basket-analysis-with-apriori-algorithm
# https://www.facebook.com/tautologyai/videos/303900072700235
# https://www.kaggle.com/code/ekrembayar/apriori-association-rules-grocery-store
# https://www.youtube.com/watch?v=RDQplhHYUr0

In [None]:
import warnings
warnings.simplefilter("ignore")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import re

In [None]:
from itertools import combinations

# 1. Load data

In [None]:
# Load the 'Year 2009-2010' sheet of the workbook into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_excel("online_retail_II.xlsx", sheet_name='Year 2009-2010')

In [None]:
df.shape

# 2. Check data quality

In [None]:
# data type

In [None]:
df.info()

In [None]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [None]:
# column names

In [None]:
df.columns

In [None]:
# select only useful columns

In [None]:
df = df[['Invoice','StockCode','Description','InvoiceDate']]

In [None]:
# data type

In [None]:
df.info()

In [None]:
# check missing value

In [None]:
df.isna().sum()

In [None]:
df = df.dropna()

In [None]:
# check for empty / whitespace-only strings

In [None]:
def empty_or_whitespace_strings(df):
    string_columns = df.select_dtypes(include=['object']).columns
    empty_whitespace_check = df[string_columns].applymap(lambda x: isinstance(x, str) and x.isspace())

    count_empty_whitespace = empty_whitespace_check.sum()
    rows_with_whitespace_indices = empty_whitespace_check.any(axis=1)
    list_row_with_whilespace_indices = df.index[rows_with_whitespace_indices].tolist()

    return count_empty_whitespace, list_row_with_whilespace_indices

In [None]:
show_whilespace, indices_whilesapce = empty_or_whitespace_strings(df)

In [None]:
show_whilespace

In [None]:
# check unique value

In [None]:
# Per-column overview: the distinct values and how many of them there are.
separator = '-' * 10
for column_name in df.columns:
    column = df[column_name]
    print('Columns name: ', column_name)
    print('Unique value: ', column.unique())
    print('Count unique value: ', column.nunique())
    print(separator)

In [None]:
# check Description

In [None]:
# Number of distinct descriptions recorded for each stock code.
description_counts = (
    df.groupby('StockCode')['Description']
    .nunique()
    .reset_index(name='number_description')
)

In [None]:
multiple_descriptions = description_counts[description_counts['number_description'] > 1]

In [None]:
multiple_descriptions = multiple_descriptions.sort_values("number_description", ascending=False)

In [None]:
multiple_descriptions

In [None]:
df[df['StockCode'] == 'DCGSSGIRL']

In [None]:
# Drop every row whose StockCode was found above to carry more than one
# distinct Description.
df = df[~df['StockCode'].isin(multiple_descriptions['StockCode'])]

In [None]:
# check StockCode

In [None]:
# Number of distinct stock codes that share each description.
stockcode_counts = (
    df.groupby('Description')['StockCode']
    .nunique()
    .reset_index(name='number_stockcode')
)

In [None]:
multiple_stockcode = stockcode_counts[stockcode_counts['number_stockcode'] > 1]

In [None]:
multiple_stockcode = multiple_stockcode.sort_values("number_stockcode", ascending=False)

In [None]:
multiple_stockcode

In [None]:
df[df['Description'] == 'damages']

In [None]:
# Drop every row whose Description was found above to be shared by more than
# one distinct StockCode.
df = df[~df['Description'].isin(multiple_stockcode['Description'])]

In [None]:
# check unique value (again)

In [None]:
# Same per-column overview as before, now on the filtered frame.
rule_line = '-' * 10
for col in df.columns:
    values = df[col]
    print('Columns name: ', col)
    print('Unique value: ', values.unique())
    print('Count unique value: ', values.nunique())
    print(rule_line)

In [None]:
# clean Description

In [None]:
# Remove commas from the descriptions. regex=False states the literal match
# explicitly, so the result does not depend on the pandas version's default
# for the `regex` argument.
df['Description'] = df['Description'].str.replace(',', '', regex=False)

In [None]:
# Remove asterisks from the descriptions. '*' is a regex metacharacter, so the
# literal match is requested explicitly instead of relying on the pandas
# version's default for the `regex` argument.
df['Description'] = df['Description'].str.replace('*', '', regex=False)

In [None]:
df['Description'] = df['Description'].str.strip()

In [None]:
# Collapse every run of two or more spaces into a single space.
# Replacing such runs with '' (as a loop over run lengths would) glues the
# neighbouring words together, e.g. 'RED  BAG' -> 'REDBAG', and treats odd and
# even run lengths differently.
df['Description'] = df['Description'].str.replace(r' {2,}', ' ', regex=True)

In [None]:
# manual clean Description

In [None]:
df_check_description = df[['Description']].drop_duplicates()

In [None]:
def char_count(row):
    """Return how many distinct characters the text form of ``row`` contains.

    ``row`` is converted with ``str()`` first, so non-string values are
    accepted. Repeated characters are counted once.
    """
    text = str(row)
    distinct_chars = {ch for ch in text}
    return len(distinct_chars)

In [None]:
def lowercase_count(row):
    """Return the number of distinct lowercase characters in ``row``.

    ``row`` is converted with ``str()`` first (as ``char_count`` does), so a
    non-string value no longer raises ``AttributeError``. Each lowercase
    character is counted once, however often it occurs.
    """
    # Whitespace is never lowercase, so splitting into words first is unnecessary.
    return len({char for char in str(row) if char.islower()})

In [None]:
df_check_description['Description_char_count'] = df_check_description['Description'].apply(char_count)

In [None]:
df_check_description['Description_lower_char_count'] = df_check_description['Description'].apply(lowercase_count)

In [None]:
df_check_description.to_excel('check_description.xlsx', index = False)

In [None]:
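# Descriptions picked out in the manual review; rows with these values are removed in the next cell:
# given away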
# Adjust bad debt
# Discount
# damaged
# rex use
# lost
# ?
# SAMPLES
# faulty
# Dotcom sales

In [None]:
# Descriptions singled out for removal; every row carrying one of them is dropped.
descriptions_to_drop = [
    'given away',
    'Adjust bad debt',
    'Discount',
    'damaged',
    'rex use',
    'lost',
    '?',
    'SAMPLES',
    'faulty',
    'Dotcom sales',
]
is_flagged = df['Description'].isin(descriptions_to_drop)
df = df[~is_flagged]

In [None]:
# export unique item name

In [None]:
# Export the distinct item names. index=False keeps the row index out of the
# sheet, matching the notebook's other Excel exports.
df[['Description']].drop_duplicates().to_excel('item_name_final.xlsx', index=False)

# 3. Exploratory data analysis (EDA)

In [None]:
df.sample(5)

In [None]:
# Top 100 Count of Transactions by Description

In [None]:
# Row counts of the 100 most frequent descriptions, largest first.
description_counts = (
    df['Description']
    .value_counts()
    .head(100)
    .sort_values(ascending=False)
)

In [None]:
# Bar chart of the top-100 description counts, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    x=description_counts.index,
    y=description_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 100 Count of Transactions by Description')
ax.set_xlabel('Description')
ax.set_ylabel('Count')
plt.xticks(rotation=90)  # long item names: rotate so labels do not overlap
plt.show()

In [None]:
# Top 100 Count of Transactions by Invoice

In [None]:
# Row counts of the 100 invoices with the most rows, largest first.
invoice_counts = (
    df['Invoice']
    .value_counts()
    .head(100)
    .sort_values(ascending=False)
)

In [None]:
# Bar chart of the top-100 invoice row counts, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    x=invoice_counts.index,
    y=invoice_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 100 Count of Transactions by Invoice')
ax.set_xlabel('Invoice')
ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count of Transactions by Day of Week and Month of Year

In [None]:
# Derive the weekday name and the month number of every invoice date.
# assign() builds a new frame instead of writing columns into a frame that was
# produced by filtering, which avoids pandas' SettingWithCopyWarning and keeps
# the cell safe to re-run.
df = df.assign(
    DayOfWeek=df['InvoiceDate'].dt.day_name(),
    MonthOfYear=df['InvoiceDate'].dt.month,
)

In [None]:
transaction_counts = df.groupby(['MonthOfYear', 'DayOfWeek']).size().unstack(fill_value=0)

In [None]:
# Put the weekday columns in calendar order (Monday first).
# Weekdays that do not occur in the data are added as all-zero columns.
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
transaction_counts = transaction_counts.reindex(columns=days_order, fill_value=0)

In [None]:
# Replace the numeric month index (1-12) with month names for display.
# NOTE(review): rows are keyed by month number only, so if the data covers the
# same month in two different years those counts are summed together -- confirm
# that this is intended.
month_names = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 
               7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
transaction_counts.index = transaction_counts.index.map(month_names)

In [None]:
# Heatmap of row counts per (month, weekday) cell, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(transaction_counts, cmap="YlGnBu", annot=True, fmt="d", ax=ax)
ax.set_title('Count of Transactions by Day of Week and Month of Year')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Month of Year')
plt.show()

# 4. Model

In [None]:
# set list

In [None]:
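# Small toy data set (presumably for sanity-checking the algorithm; uncomment to use it instead of the real data):
# txn_list = [['I1','I2','I5'],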
#             ['I2','I4'],
#             ['I2','I3'],
#             ['I1','I2','I4'],
#             ['I1','I3'],
#             ['I2','I3'],
#             ['I1','I3'],
#             ['I1','I2','I3','I5'],
#             ['I1','I2','I3']]
# item_list = ['I1','I2','I3','I4','I5']
# max_no_item_invoice = 4

In [None]:
# One group of descriptions per invoice, shared by the two aggregations below.
invoice_descriptions = df.groupby('Invoice')['Description']

# Every invoice as a list of its item descriptions (duplicates kept).
txn_list = invoice_descriptions.apply(list).tolist()
# All distinct item descriptions, in order of first appearance.
item_list = df['Description'].unique().tolist()
# Largest number of distinct items found on a single invoice.
max_no_item_invoice = invoice_descriptions.nunique().max()

In [None]:
# first step (support)

In [None]:
min_support = 0.005

In [None]:
support_result_list = []

In [None]:
# Level-wise (Apriori-style) search for frequent itemsets.
# For each itemset size, candidate itemsets are generated, their support (the
# share of transactions containing every item of the candidate) is measured,
# and candidates with support >= min_support are recorded in
# support_result_list as {"support": ..., "itemsets": ...}.

# Build each transaction's item set once instead of once per candidate.
txn_sets = [set(txn) for txn in txn_list]
n_txn = len(txn_sets)

# Start from an empty list so re-running this cell does not duplicate results.
support_result_list = []

candidate_items = item_list  # item_list itself is left untouched
item_set_list = []           # frequent itemsets of the current size
frequent_prev = set()        # frequent itemsets of the previous size

# `size` replaces the former loop variable `round`, which shadowed the builtin.
for size in range(1, max_no_item_invoice + 1):
    print('size of list:', size)
    if size == 1:
        candidates = combinations(candidate_items, 1)
    else:
        # Only items that belong to a frequent itemset of the previous size
        # can be part of a frequent itemset of this size.
        candidate_items = sorted({item for item_set in item_set_list for item in item_set})
        # Apriori pruning: a candidate can only be frequent if each of its
        # (size - 1)-item subsets is frequent, so all others are skipped
        # without scanning the transactions. This does not change the result.
        candidates = (
            candidate
            for candidate in combinations(candidate_items, size)
            if all(subset in frequent_prev for subset in combinations(candidate, size - 1))
        )

    item_set_list = []
    support_item_set_list = []
    for candidate in candidates:
        candidate_set = set(candidate)
        hits = sum(1 for txn in txn_sets if candidate_set.issubset(txn))
        # Guard against an empty transaction list (would divide by zero).
        support = hits / n_txn if n_txn else 0.0
        if support >= min_support:
            item_set_list.append(candidate)
            support_item_set_list.append(support)
            support_result_list.append({"support": support, "itemsets": candidate})

    # No frequent itemset of this size means none of any larger size either.
    if not item_set_list:
        break
    frequent_prev = set(item_set_list)

In [None]:
support_result = pd.DataFrame(support_result_list)

In [None]:
support_result

In [None]:
# second step (confidence)

In [None]:
min_confidence = 0.5

In [None]:
rules_list = []
confidence_result_list = []

In [None]:
# Turn every frequent itemset with at least two items into candidate rules:
# each non-empty proper subset becomes an antecedent, and the remaining items
# of the itemset become the consequent.
for item in support_result['itemsets']:
    if len(item) < 2:
        continue  # a single item cannot be split into antecedent and consequent
    for antecedent_size in range(1, len(item)):
        for antecedent in combinations(item, antecedent_size):
            consequent = tuple(member for member in item if member not in antecedent)
            rules_list.append((antecedent, consequent))

In [None]:
# Confidence of a rule (antecedent -> consequent):
#   (# transactions containing antecedent and consequent) / (# containing antecedent)
# Rules whose confidence reaches min_confidence are kept.

# Build each transaction's item set once instead of twice per rule.
txn_sets = [set(txn) for txn in txn_list]

# Start from an empty list so re-running this cell does not duplicate results.
confidence_result_list = []

for antecedent, consequent in rules_list:
    antecedent_set = set(antecedent)
    union_set = antecedent_set | set(consequent)

    antecedent_hits = 0
    union_hits = 0
    for txn in txn_sets:
        if antecedent_set.issubset(txn):
            antecedent_hits += 1
            # The union contains the antecedent, so it only needs checking here.
            if union_set.issubset(txn):
                union_hits += 1

    if antecedent_hits == 0:
        continue  # antecedent never occurs: confidence is undefined

    confidence = union_hits / antecedent_hits
    if confidence >= min_confidence:
        confidence_result_list.append({
            "antecedents": antecedent,
            "consequents": consequent,
            "confidence": confidence
        })

In [None]:
confidence_result = pd.DataFrame(confidence_result_list)

In [None]:
confidence_result

# 5. Export

In [None]:
confidence_result.to_excel('final_result.xlsx', index=False)