In [1]:
# import packages
import pandas as pd
import numpy as np
import psycopg2
from psycopg2 import Error
import matplotlib.pyplot as plt

In [2]:
# Global Variables
trnsact_file_name = 'trnsact.csv'
clusters3_file = 'clustering_result3.csv'
clusters10_file = 'clustering_result10.csv'

In [3]:
TRNSACT = pd.read_csv(trnsact_file_name, sep=',', header= None,
                       dtype={'SKU':'int', 'STORE':'int', 'REGISTER':'int', 'TRANNUM':'int', 'SEQ':'str', 
                           'SALEDATE':'str', 'STYPE':'str', 'QUANTITY':'int', 'ORGPRICE':'float',
                           'SPRICE':'float', 'AMT':'float', 'INTERID':'int', 'MIC':'int', 'Unknown':'int'},
                       names=['SKU', 'STORE', 'REGISTER', 'TRANNUM', 'INTERID', 
                           'SALEDATE', 'STYPE', 'QUANTITY', 'ORGPRICE',
                           'SPRICE', 'AMT', 'SEQ', 'MIC', 'Unknown']).drop('Unknown',axis=1)

In [4]:
clusters3 = pd.read_csv(clusters3_file)

In [5]:
TRNSACT.head()

Unnamed: 0,SKU,STORE,REGISTER,TRANNUM,INTERID,SALEDATE,STYPE,QUANTITY,ORGPRICE,SPRICE,AMT,SEQ,MIC
0,3,202,290,1100,326708721,2005-01-18,P,1,0.0,30.0,30.0,3500000,818
1,3,202,540,2700,326708721,2005-01-29,R,1,0.0,30.0,30.0,15200000,818
2,3,303,500,2100,23702074,2004-08-18,P,1,0.0,12.0,12.0,4600000,48
3,3,709,360,500,0,2005-08-14,P,1,0.0,30.0,30.0,6500000,818
4,3,802,660,400,0,2005-08-09,P,1,440.0,30.0,30.0,4700000,599


In [6]:
merged = pd.merge(TRNSACT, clusters3[['SKU','cluster']], on='SKU', how='inner')
merged.head()

Unnamed: 0,SKU,STORE,REGISTER,TRANNUM,INTERID,SALEDATE,STYPE,QUANTITY,ORGPRICE,SPRICE,AMT,SEQ,MIC,cluster
0,69,8002,330,1900,279608907,2004-09-30,P,1,12.25,7.38,7.38,60500000,2,2
1,69,8002,330,3800,821309037,2005-03-03,P,1,12.25,14.75,14.75,969200003,50,2
2,69,8002,350,2200,786008907,2004-09-07,P,1,12.25,12.25,12.25,24500000,804,2
3,69,8002,390,4400,214600533,2004-09-03,P,1,12.25,14.75,14.75,35800000,862,2
4,73,5402,120,1000,0,2005-05-13,P,1,12.0,3.0,3.0,707500004,443,2


In [7]:
!pip install mlxtend



In [17]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Assuming 'df' is your pandas DataFrame with 'ordernumber' and 'itemnumber' columns
# Transforming the DataFrame to have one transaction per row
cluster1_trnsact = merged.loc[merged['cluster'] == 0,['TRANNUM','SKU']].groupby('TRANNUM')['SKU'].apply(list).tolist()

# Encoding the transactions
te = TransactionEncoder()
te_ary = te.fit(cluster1_trnsact).transform(cluster1_trnsact)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Running Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.089, use_colnames=True)

# Generating association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Displaying the rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(8592054),(8912054),0.100408,0.105306,0.089796,0.894309,8.492469,0.079222,8.465181,0.980721
1,(8912054),(8592054),0.105306,0.100408,0.089796,0.852713,8.492469,0.079222,6.107755,0.98609
2,(8912054),(8832054),0.105306,0.103673,0.090612,0.860465,8.299762,0.079695,6.423673,0.983034
3,(8832054),(8912054),0.103673,0.105306,0.090612,0.874016,8.299762,0.079695,7.101633,0.981244
4,(9162054),(8832054),0.103673,0.103673,0.089796,0.866142,8.354517,0.079048,6.696086,0.982125
5,(8832054),(9162054),0.103673,0.103673,0.089796,0.866142,8.354517,0.079048,6.696086,0.982125


In [19]:
cluster2_trnsact = merged.loc[merged['cluster'] == 1,['TRANNUM','SKU']].groupby('TRANNUM')['SKU'].apply(list).tolist()

# Encoding the transactions
te = TransactionEncoder()
te_ary = te.fit(cluster2_trnsact).transform(cluster2_trnsact)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Running Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.089, use_colnames=True)

# Generating association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Displaying the rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(3029412),(2509639),0.162524,0.133285,0.090156,0.554723,4.161941,0.068494,1.946462,0.907164
1,(2509639),(3029412),0.133285,0.162524,0.090156,0.676417,4.161941,0.068494,2.588131,0.876559
2,(4258931),(3029412),0.134259,0.162524,0.089669,0.667877,4.109394,0.067848,2.52158,0.873997
3,(3029412),(4258931),0.162524,0.134259,0.089669,0.551724,4.109394,0.067848,1.931268,0.903495
4,(6032521),(3029412),0.140838,0.162524,0.095517,0.678201,4.172917,0.072627,2.602478,0.885002
5,(3029412),(6032521),0.162524,0.140838,0.095517,0.587706,4.172917,0.072627,2.083858,0.907918
6,(6062521),(3029412),0.135721,0.162524,0.093324,0.687612,4.230825,0.071266,2.680885,0.883557
7,(3029412),(6062521),0.162524,0.135721,0.093324,0.574213,4.230825,0.071266,2.029838,0.911835
8,(6200129),(3029412),0.141082,0.162524,0.098197,0.696028,4.282605,0.075268,2.755104,0.892399
9,(3029412),(6200129),0.162524,0.141082,0.098197,0.604198,4.282605,0.075268,2.17007,0.915247


In [20]:
cluster3_trnsact = merged.loc[merged['cluster'] == 2,['TRANNUM','SKU']].groupby('TRANNUM')['SKU'].apply(list).tolist()

# Encoding the transactions
te = TransactionEncoder()
te_ary = te.fit(cluster3_trnsact).transform(cluster3_trnsact)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Running Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.089, use_colnames=True)

# Generating association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Displaying the rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
