In [1]:
# import packages
import pandas as pd
import numpy as np
import psycopg2
from psycopg2 import Error
import matplotlib.pyplot as plt

In [2]:
# Global Variables
trnsact_file_name = 'trnsact.csv'
clusters3_file = 'clustering_result3.csv'
clusters10_file = 'clustering_result10.csv'

In [3]:
TRNSACT = pd.read_csv(trnsact_file_name, sep=',', header= None,
                       dtype={'SKU':'int', 'STORE':'int', 'REGISTER':'int', 'TRANNUM':'int', 'SEQ':'str', 
                           'SALEDATE':'str', 'STYPE':'str', 'QUANTITY':'int', 'ORGPRICE':'float',
                           'SPRICE':'float', 'AMT':'float', 'INTERID':'int', 'MIC':'int', 'Unknown':'int'},
                       names=['SKU', 'STORE', 'REGISTER', 'TRANNUM', 'INTERID', 
                           'SALEDATE', 'STYPE', 'QUANTITY', 'ORGPRICE',
                           'SPRICE', 'AMT', 'SEQ', 'MIC', 'Unknown']).drop('Unknown',axis=1)

In [4]:
clusters3 = pd.read_csv(clusters3_file)

In [5]:
TRNSACT.head()

Unnamed: 0,SKU,STORE,REGISTER,TRANNUM,INTERID,SALEDATE,STYPE,QUANTITY,ORGPRICE,SPRICE,AMT,SEQ,MIC
0,3,202,290,1100,326708721,2005-01-18,P,1,0.0,30.0,30.0,3500000,818
1,3,202,540,2700,326708721,2005-01-29,R,1,0.0,30.0,30.0,15200000,818
2,3,303,500,2100,23702074,2004-08-18,P,1,0.0,12.0,12.0,4600000,48
3,3,709,360,500,0,2005-08-14,P,1,0.0,30.0,30.0,6500000,818
4,3,802,660,400,0,2005-08-09,P,1,440.0,30.0,30.0,4700000,599


In [6]:
merged = pd.merge(TRNSACT, clusters3[['SKU','cluster']], on='SKU', how='inner')
merged.head()

Unnamed: 0,SKU,STORE,REGISTER,TRANNUM,INTERID,SALEDATE,STYPE,QUANTITY,ORGPRICE,SPRICE,AMT,SEQ,MIC,cluster
0,69,8002,330,1900,279608907,2004-09-30,P,1,12.25,7.38,7.38,60500000,2,0
1,69,8002,330,3800,821309037,2005-03-03,P,1,12.25,14.75,14.75,969200003,50,0
2,69,8002,350,2200,786008907,2004-09-07,P,1,12.25,12.25,12.25,24500000,804,0
3,69,8002,390,4400,214600533,2004-09-03,P,1,12.25,14.75,14.75,35800000,862,0
4,73,5402,120,1000,0,2005-05-13,P,1,12.0,3.0,3.0,707500004,443,0


In [7]:
!pip install mlxtend



In [8]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Assuming 'df' is your pandas DataFrame with 'ordernumber' and 'itemnumber' columns
# Transforming the DataFrame to have one transaction per row
cluster1_trnsact = merged.loc[merged['cluster'] == 0,['TRANNUM','SKU']].groupby('TRANNUM')['SKU'].apply(list).tolist()

# Encoding the transactions
te = TransactionEncoder()
te_ary = te.fit(cluster1_trnsact).transform(cluster1_trnsact)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Running Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True)

# Generating association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Displaying the rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(6200129),(3029412),0.146101,0.168307,0.101691,0.696028,4.135469,0.077101,2.736081,0.887915
1,(3029412),(6200129),0.168307,0.146101,0.101691,0.604198,4.135469,0.077101,2.157388,0.911622
2,(6032521),(6062521),0.145849,0.14055,0.112036,0.768166,5.465426,0.091537,3.707179,0.956543
3,(6062521),(6032521),0.14055,0.145849,0.112036,0.797127,5.465426,0.091537,4.210284,0.950645


In [9]:
cluster2_trnsact = merged.loc[merged['cluster'] == 1,['TRANNUM','SKU']].groupby('TRANNUM')['SKU'].apply(list).tolist()

# Encoding the transactions
te = TransactionEncoder()
te_ary = te.fit(cluster2_trnsact).transform(cluster2_trnsact)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Running Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True)

# Generating association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Displaying the rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(1297499),(858116),0.160393,0.14689,0.105155,0.655612,4.463277,0.081595,2.477178,0.924181
1,(858116),(1297499),0.14689,0.160393,0.105155,0.715877,4.463277,0.081595,2.955088,0.909554
2,(858116),(7977157),0.14689,0.131751,0.100245,0.682451,5.179847,0.080893,2.734222,0.945886
3,(7977157),(858116),0.131751,0.14689,0.100245,0.76087,5.179847,0.080893,3.567549,0.929393
4,(1297499),(7977157),0.160393,0.131751,0.109247,0.681122,5.169762,0.088115,2.722828,0.960649
5,(7977157),(1297499),0.131751,0.160393,0.109247,0.829193,5.169762,0.088115,4.915519,0.928959


In [10]:
cluster3_trnsact = merged.loc[merged['cluster'] == 2,['TRANNUM','SKU']].groupby('TRANNUM')['SKU'].apply(list).tolist()

# Encoding the transactions
te = TransactionEncoder()
te_ary = te.fit(cluster3_trnsact).transform(cluster3_trnsact)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Running Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True)

# Generating association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Displaying the rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
