In [14]:
import pandas as pd

df = pd.read_csv('fashion_retail_sales_cleaned.csv', parse_dates=['purchase_date'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3400 entries, 0 to 3399
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   customer_id      3400 non-null   int64         
 1   item_purchased   3400 non-null   object        
 2   purchase_amount  3400 non-null   float64       
 3   purchase_date    3400 non-null   datetime64[ns]
 4   review_rating    3400 non-null   float64       
 5   payment_method   3400 non-null   object        
 6   month            3400 non-null   int64         
 7   year             3400 non-null   int64         
 8   day_of_week      3400 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 239.2+ KB


In [15]:
# group data into baskets
# one can assume that items bought by the same customer_id on the same purchase_date form one basket.

basket_df = df.groupby(['customer_id', 'purchase_date'])['item_purchased'].apply(list).reset_index()

basket_df['basket_size'] = basket_df['item_purchased'].apply(len)
print(basket_df['basket_size'].describe())

count    3301.000000
mean        1.029991
std         0.172355
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: basket_size, dtype: float64


In [16]:
# One-Hot Encode the data. You'll need to transform your list of baskets into a matrix where each row is a transaction, each column is an item, and the values are 1 (if the item is in the transaction) or 0.
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth

te = TransactionEncoder()
te_ary = te.fit_transform(
    basket_df['item_purchased'])#.transform(df['item_purchased'])
df_basket = pd.DataFrame(te_ary, columns=te.columns_)

In [17]:
# Use the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df_basket, min_support=0.005, use_colnames=True)

# Use the FP-Growth algorithm to find frequent itemsets
frequent_itemsets_fp = fpgrowth(
    df_basket, min_support=0.005, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Generate association rules using FP-Growth
rules_fp = association_rules(frequent_itemsets_fp, metric="lift", min_threshold=1)

rules = rules.sort_values(by='confidence', ascending=False)
rules_fp = rules_fp.sort_values(by='confidence', ascending=False)

rules = rules[rules['antecedents'].apply(lambda x: len(x)) > 0]
rules = rules[rules['consequents'].apply(lambda x: len(x)) > 0]
rules_fp = rules_fp[rules_fp['antecedents'].apply(lambda x: len(x)) > 0]
rules_fp = rules_fp[rules_fp['consequents'].apply(lambda x: len(x)) > 0]

# Display the rules
print("Apriori Rules:")
print(rules)
print("\nFP-Growth Rules:")
print(rules_fp)

Apriori Rules:
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []

FP-Growth Rules:
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []


In [18]:
from collections import Counter
from itertools import chain

item_counts = Counter(chain.from_iterable(basket_df['item_purchased']))
print(item_counts.most_common(10))

[('Belt', 90), ('Skirt', 88), ('Shorts', 87), ('Pants', 86), ('Tank Top', 82), ('T-shirt', 82), ('Pajamas', 81), ('Camisole', 76), ('Loafers', 76), ('Hoodie', 75)]


In [None]:
# Analizza se lo stesso cliente ha acquistato diversi prodotti in giorni diversi. È meno forte di una regola "nello stesso carrello", ma può darti insight:

from itertools import combinations, chain
from collections import Counter
customer_items = df.groupby('customer_id')['item_purchased'].apply(list)
customer_items = customer_items[customer_items.apply(len) > 1]


pairs = chain.from_iterable([combinations(set(items), 2)
                            for items in customer_items])
pair_counts = Counter(pairs)
print(pair_counts.most_common(10))

[(('Belt', 'Skirt'), 33), (('Camisole', 'Skirt'), 31), (('Belt', 'Trench Coat'), 30), (('Socks', 'Tank Top'), 29), (('Sneakers', 'Skirt'), 29), (('Camisole', 'Loafers'), 29), (('Tunic', 'Pants'), 28), (('Pajamas', 'Skirt'), 28), (('Camisole', 'Sweater'), 28), (('Belt', 'Poncho'), 28)]
