In [1]:
# Implementing the Apriori Algorithm 


# Step 1
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
import numpy as np


In [7]:
# Step 2: Load the Online Retail workbook into a DataFrame
RETAIL_WORKBOOK = 'D:/Edu/Internships/Remarkskill AI/Apriori Algorithm project/Online_Retail.xlsx'
data = pd.read_excel(RETAIL_WORKBOOK)

# Preview the first rows of the raw data
print(data.head())

# List the column labels available in the data set
print(data.columns)


  InvoiceNo StockCode                                lower  \
0    536365    85123A   white hanging heart t-light holder   
1    536365     71053                  white metal lantern   
2    536365    84406B       cream cupid hearts coat hanger   
3    536365    84029G  knitted union flag hot water bottle   
4    536365    84029E       red woolly hottie white heart.   

                           Description  Quantity         InvoiceDate  \
0   WHITE HANGING HEART T-LIGHT HOLDER         6 2010-12-01 08:26:00   
1                  WHITE METAL LANTERN         6 2010-12-01 08:26:00   
2       CREAM CUPID HEARTS COAT HANGER         8 2010-12-01 08:26:00   
3  KNITTED UNION FLAG HOT WATER BOTTLE         6 2010-12-01 08:26:00   
4       RED WOOLLY HOTTIE WHITE HEART.         6 2010-12-01 08:26:00   

   UnitPrice  CustomerID         Country  
0       2.55     17850.0  United Kingdom  
1       3.39     17850.0  United Kingdom  
2       2.75     17850.0  United Kingdom  
3       3.39     17850

In [8]:
# Step 3: Cleaning the Data
# Each statement below builds a new frame instead of mutating `data` in place.

# Remove leading/trailing whitespace from the item descriptions
trimmed_descriptions = data['Description'].str.strip()
data = data.assign(Description=trimmed_descriptions)

# Discard rows that have no invoice number, then store the number as text
data = data.dropna(axis=0, subset=['InvoiceNo'])
data = data.assign(InvoiceNo=data['InvoiceNo'].astype('str'))

# Discard every invoice whose number contains the letter 'C'
# (described in this notebook as transactions done on credit).
# NOTE(review): the UCI Online Retail description reportedly uses a 'C' prefix
# for cancellations rather than credit sales - confirm the wording.
has_c_marker = data['InvoiceNo'].str.contains('C')
data = data[~has_c_marker]


In [9]:
# Step 4: Splitting the Data According to the region of Transaction

def build_basket(transactions, country):
    """Build the invoice-by-item quantity matrix for one country.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Transaction rows; the 'Country', 'InvoiceNo', 'Description' and
        'Quantity' columns are read.
    country : str
        Value of the 'Country' column to keep.

    Returns
    -------
    pandas.DataFrame
        One row per 'InvoiceNo' (the index) and one column per 'Description'.
        Each cell holds the summed 'Quantity' of that item on that invoice,
        with 0 where the item does not appear on the invoice.
    """
    in_country = transactions[transactions['Country'] == country]
    # Total quantity per (invoice, item) pair
    quantities = in_country.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
    # Pivot items into columns; pairs that never occurred become 0
    return quantities.unstack().reset_index().fillna(0).set_index('InvoiceNo')


# Transactions done in France
basket_France = build_basket(data, "France")

# Transactions done in United Kingdom
basket_UK = build_basket(data, "United Kingdom")

# Transactions done in Portugal
basket_Por = build_basket(data, "Portugal")

# Transactions done in Sweden
basket_Sweden = build_basket(data, "Sweden")


In [10]:
# Step 5: Encoding the Data
# Define the one-hot encoding function that turns each summed quantity into a 0/1 flag, the input format expected by the apriori implementation

def hot_encode(x):
    """Map a basket quantity to a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Summed quantity of one item on one invoice.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.

    Notes
    -----
    The previous implementation returned ``None`` for values strictly between
    0 and 1 and for NaN, because neither branch matched. Those inputs now map
    to 0, so the function always returns 0 or 1. Results for ``x <= 0`` and
    ``x >= 1`` are unchanged.
    """
    # Any comparison with NaN is False, so NaN also takes the 0 branch.
    return 1 if x >= 1 else 0
    
# Encoding the Datasets
# Apply hot_encode cell-by-cell to each country's basket and rebind the
# original names to the encoded frames.
basket_France, basket_UK, basket_Por, basket_Sweden = (
    quantities.applymap(hot_encode)
    for quantities in (basket_France, basket_UK, basket_Por, basket_Sweden)
)

# `basket_encoded` ends up referring to the last encoded frame (Sweden)
basket_encoded = basket_Sweden


In [11]:
# Step 6: Building a model and analyzing the results

def mine_rules(basket, min_support=0.05):
    """Mine frequent itemsets and association rules from an encoded basket.

    Parameters
    ----------
    basket : pandas.DataFrame
        Invoice-by-item matrix of 0/1 flags.
    min_support : float, default 0.05
        Minimum support passed to ``apriori``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The frequent itemsets, and the rules with lift >= 1 sorted by
        confidence and then lift, both descending.
    """
    # Cast the 0/1 flags to bool before mining.
    # NOTE(review): mlxtend's documentation recommends boolean input and warns
    # about non-bool frames - confirm against the installed version.
    frequent = apriori(basket.astype(bool), min_support=min_support, use_colnames=True)
    # Collecting the inferred rules in a DataFrame
    found = association_rules(frequent, metric='lift', min_threshold=1)
    found = found.sort_values(['confidence', 'lift'], ascending=[False, False])
    return frequent, found


# Keep every country's rules instead of overwriting one shared variable
rules_by_country = {}
for country, basket in [
    ('France', basket_France),
    ('United Kingdom', basket_UK),
    ('Portugal', basket_Por),
    ('Sweden', basket_Sweden),
]:
    # `frq_items` and `rules` still end up holding the last country's results
    frq_items, rules = mine_rules(basket)
    rules_by_country[country] = rules
    # Label each printout so the four result tables can be told apart
    print(f'Top rules for {country}:')
    print(rules.head())




                                           antecedents  \
44                        (JUMBO BAG WOODLAND ANIMALS)   
258  (RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...   
270  (PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...   
302  (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...   
301  (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...   

                         consequents  antecedent support  consequent support  \
44                         (POSTAGE)            0.076531            0.765306   
258                        (POSTAGE)            0.051020            0.765306   
270                        (POSTAGE)            0.053571            0.765306   
302  (SET/6 RED SPOTTY PAPER PLATES)            0.102041            0.127551   
301    (SET/6 RED SPOTTY PAPER CUPS)            0.102041            0.137755   

      support  confidence      lift  leverage  conviction  
44   0.076531       1.000  1.306667  0.017961         inf  
258  0.051020       1.000  1.306667  0.011974     



                              antecedents                         consequents  \
1170     (SET 12 COLOUR PENCILS SPACEBOY)  (SET 12 COLOUR PENCILS DOLLY GIRL)   
1171   (SET 12 COLOUR PENCILS DOLLY GIRL)    (SET 12 COLOUR PENCILS SPACEBOY)   
1172   (SET OF 4 KNICK KNACK TINS LONDON)  (SET 12 COLOUR PENCILS DOLLY GIRL)   
1173   (SET 12 COLOUR PENCILS DOLLY GIRL)  (SET OF 4 KNICK KNACK TINS LONDON)   
1174  (SET OF 4 KNICK KNACK TINS POPPIES)  (SET 12 COLOUR PENCILS DOLLY GIRL)   

      antecedent support  consequent support   support  confidence       lift  \
1170            0.051724            0.051724  0.051724         1.0  19.333333   
1171            0.051724            0.051724  0.051724         1.0  19.333333   
1172            0.051724            0.051724  0.051724         1.0  19.333333   
1173            0.051724            0.051724  0.051724         1.0  19.333333   
1174            0.051724            0.051724  0.051724         1.0  19.333333   

      leverage  conviction



                           antecedents                     consequents  \
0        (12 PENCILS SMALL TUBE SKULL)   (PACK OF 72 SKULL CAKE CASES)   
1        (PACK OF 72 SKULL CAKE CASES)   (12 PENCILS SMALL TUBE SKULL)   
4       (ASSORTED BOTTLE TOP  MAGNETS)         (36 DOILIES DOLLY GIRL)   
5              (36 DOILIES DOLLY GIRL)  (ASSORTED BOTTLE TOP  MAGNETS)   
180  (CHILDRENS CUTLERY CIRCUS PARADE)  (CHILDRENS CUTLERY DOLLY GIRL)   

     antecedent support  consequent support   support  confidence  lift  \
0              0.055556            0.055556  0.055556         1.0  18.0   
1              0.055556            0.055556  0.055556         1.0  18.0   
4              0.055556            0.055556  0.055556         1.0  18.0   
5              0.055556            0.055556  0.055556         1.0  18.0   
180            0.055556            0.055556  0.055556         1.0  18.0   

     leverage  conviction  
0    0.052469         inf  
1    0.052469         inf  
4    0.052469       

In [None]:
# installation of mlxtend in anaconda prompt
# conda install -c conda-forge mlxtend