### Data Preprocessing:

In [1]:
import pandas as pd

In [2]:
# Load the dataset
df = pd.read_excel('Online_retail.xlsx')
df

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt
...,...
7495,"butter,light mayo,fresh bread"
7496,"burgers,frozen vegetables,eggs,french fries,ma..."
7497,chicken
7498,"escalope,green tea"


In [3]:
# Display basic info about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                           --------------  ----- 
 0   shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil  7500 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [4]:
# Check for missing values
missing_values = df.isnull().sum()
missing_values

shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil    0
dtype: int64

In [5]:
# Remove duplicates
df.drop_duplicates(inplace=True)

In [6]:
# Split the items into separate columns
df['items'] = df['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil'].str.split(',')

In [7]:
df

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil",items
0,"burgers,meatballs,eggs","[burgers, meatballs, eggs]"
1,chutney,[chutney]
2,"turkey,avocado","[turkey, avocado]"
3,"mineral water,milk,energy bar,whole wheat rice...","[mineral water, milk, energy bar, whole wheat ..."
4,low fat yogurt,[low fat yogurt]
...,...,...
7492,"burgers,salmon,pancakes,french fries,frozen sm...","[burgers, salmon, pancakes, french fries, froz..."
7493,"turkey,burgers,dessert wine,shrimp,pasta,tomat...","[turkey, burgers, dessert wine, shrimp, pasta,..."
7495,"butter,light mayo,fresh bread","[butter, light mayo, fresh bread]"
7496,"burgers,frozen vegetables,eggs,french fries,ma...","[burgers, frozen vegetables, eggs, french frie..."


In [8]:
# Convert the list of items into a binary matrix
from mlxtend.preprocessing import TransactionEncoder

In [9]:
te = TransactionEncoder()
te_ary = te.fit(df['items']).transform(df['items'])
df_binary = pd.DataFrame(te_ary, columns=te.columns_)

In [10]:
df_binary.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Association Rule Mining:

In [11]:
from mlxtend.frequent_patterns import apriori, association_rules

In [12]:
# Apply Apriori algorithm
frequent_itemsets = apriori(df_binary, min_support=0.01, use_colnames=True)

  and should_run_async(code)


In [13]:
# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

  and should_run_async(code)


In [14]:
rules.head()

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(almonds),(mineral water),0.029179,0.29971,0.010821,0.370861,1.237399,0.002076,1.113092,0.197619
1,(mineral water),(almonds),0.29971,0.029179,0.010821,0.036106,1.237399,0.002076,1.007186,0.273962
2,(avocado),(chocolate),0.045797,0.205217,0.010242,0.223629,1.089716,0.000843,1.023715,0.086281
3,(chocolate),(avocado),0.205217,0.045797,0.010242,0.049906,1.089716,0.000843,1.004325,0.103588
4,(avocado),(french fries),0.045797,0.192657,0.011594,0.253165,1.314069,0.002771,1.081019,0.250476


In [15]:
# Sort rules by lift
rules.sort_values('lift', ascending=False, inplace=True)

  and should_run_async(code)


In [16]:
# Display the top 10 rules
top_10_rules = rules.head(10)
top_10_rules

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
424,(whole wheat pasta),(olive oil),0.04058,0.087536,0.011014,0.271429,3.100757,0.007462,1.252401,0.706154
425,(olive oil),(whole wheat pasta),0.087536,0.04058,0.011014,0.125828,3.100757,0.007462,1.097519,0.742493
797,(soup),"(milk, mineral water)",0.070918,0.067826,0.012367,0.174387,2.571089,0.007557,1.129069,0.657703
792,"(milk, mineral water)",(soup),0.067826,0.070918,0.012367,0.182336,2.571089,0.007557,1.136264,0.655521
298,(herb & pepper),(ground beef),0.066473,0.135845,0.022802,0.343023,2.5251,0.013772,1.31535,0.646983
299,(ground beef),(herb & pepper),0.135845,0.066473,0.022802,0.167852,2.5251,0.013772,1.121828,0.698921
735,"(mineral water, shrimp)",(frozen vegetables),0.03343,0.129855,0.010435,0.312139,2.403747,0.006094,1.265001,0.604181
738,(frozen vegetables),"(mineral water, shrimp)",0.129855,0.03343,0.010435,0.080357,2.403747,0.006094,1.051028,0.671133
721,(ground beef),"(spaghetti, frozen vegetables)",0.135845,0.039034,0.01256,0.092461,2.368738,0.007258,1.05887,0.66867
716,"(spaghetti, frozen vegetables)",(ground beef),0.039034,0.135845,0.01256,0.321782,2.368738,0.007258,1.274155,0.601306


### Analysis and Interpretation:

1. Association between Whole Wheat Pasta and Olive Oil:

Support: 0.011014

Confidence: 0.271429

Lift: 3.100757

Interpretation: Customers who buy whole wheat pasta are likely to buy olive oil as well. This suggests that these two products are often purchased together, possibly for making pasta dishes or salads. The high lift value indicates a strong association between the two items.

2. Association between Soup and Milk, Mineral Water:

Support: 0.012367

Confidence: 0.174387

Lift: 2.571089

Interpretation: Customers purchasing soup are likely to buy milk and mineral water together. This association could be due to the common consumption pattern of having soup with a beverage. The relatively high lift indicates that this association is stronger than what would be expected by chance.

3. Association between Herb & Pepper and Ground Beef:

Support: 0.022802

Confidence: 0.343023

Lift: 2.525100

Interpretation: Customers who purchase herb & pepper are highly likely to buy ground beef as well. This association suggests that these items are commonly used together in cooking, such as seasoning ground beef with herbs and pepper.

4. Association between Cookies and Eggs:

Support: 0.012754

Confidence: 0.210191

Lift: 1.009971

Interpretation: There's a moderate association between cookies and eggs. While the lift is close to 1, indicating that the items are bought together as often as they are bought separately, the confidence suggests that there is still a tendency for customers who purchase cookies to also buy eggs.

5. Association between Salmon and Eggs:

Support: 0.011981

Confidence: 0.208754

Lift: 1.003067

Interpretation: There's a weak association between salmon and eggs. The lift is very close to 1, indicating that these items are bought together as frequently as they are bought separately. However, there is still a slight tendency for customers who buy salmon to also buy eggs.

Overall, these association rules provide insights into customer behavior and can be used for various purposes such as product placement, marketing strategies, or bundle promotions to encourage customers to purchase related items together.

### Interview Questions

What is lift and why is it important in Association rules?

Lift is a measure of how much more likely two items are to be bought together than if they were bought independently. It is important because it indicates the strength of an association between items. A lift value greater than 1 suggests a positive association, while a value less than 1 indicates a negative association.

What is support and Confidence? How do you calculate them?

Support is the proportion of transactions in the dataset that contain a particular itemset. It is calculated as:
Support(X)= Number of transactions containing X / Total number of transactions

Confidence is the proportion of transactions containing item X that also contain item Y. It is calculated as:
Confidence(X→Y)= Support(X∪Y) / Support(X)


What are some limitations or challenges of Association rules mining?

Scalability with large datasets.
Setting appropriate thresholds for support, confidence, and lift.
Interpreting a large number of generated rules.
Handling the sparsity of data in some cases.