# Data preprocessing

In [16]:
import pandas as pd

# Load the dataset
df = pd.read_csv("supermarket.csv", header=None)  # Assuming there are no column names in the CSV

# Create an empty list to store arrays of items
transactions = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    # Convert the row values into a list
    items = list(row)
    # Append the list of items to the transactions list
    transactions.append(items)

# Print the first few transactions as an example
for i in range(5):  # Adjust the range to print desired number of transactions
    print(transactions[i])


  and should_run_async(code)


['shrimp', 'almonds', 'avocado', 'vegetables mix', 'green grapes', 'whole weat flour', 'yams', 'cottage cheese', 'energy drink', 'tomato juice', 'low fat yogurt', 'green tea', 'honey', 'salad', 'mineral water', 'salmon', 'antioxydant juice', 'frozen smoothie', 'spinach', 'olive oil']
['burgers', 'meatballs', 'eggs', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
['chutney', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
['turkey', 'avocado', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
['mineral water', 'milk', 'energy bar', 'whole wheat rice', 'green tea', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]


# Exploratory Data Analysis

In [17]:
print(df.describe())


                   0              1              2              3          4   \
count            7501           5747           4389           3345       2529   
unique            115            117            115            114        110   
top     mineral water  mineral water  mineral water  mineral water  green tea   
freq              577            484            375            201        153   

                  5          6          7          8          9   \
count           1864       1369        981        654        395   
unique           106        102         98         88         80   
top     french fries  green tea  green tea  green tea  green tea   
freq             107         96         67         57         31   

                    10         11         12         13         14      15  \
count              256        154         87         47         25       8   
unique              66         50         43         28         19       8   
top     low fat yog

  and should_run_async(code)


In [18]:
 #Display basic information about the dataset
print("Shape of the dataset:", df.shape)
print("\nFirst few rows of the dataset:")
print(df.head())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Analyze item frequency
all_items = []
for index, row in df.iterrows():
    all_items.extend(row)
item_counts = pd.Series(all_items).value_counts()
print("\nItem frequency:")
print(item_counts)

  and should_run_async(code)


Shape of the dataset: (7501, 20)

First few rows of the dataset:
              0          1           2                 3             4   \
0         shrimp    almonds     avocado    vegetables mix  green grapes   
1        burgers  meatballs        eggs               NaN           NaN   
2        chutney        NaN         NaN               NaN           NaN   
3         turkey    avocado         NaN               NaN           NaN   
4  mineral water       milk  energy bar  whole wheat rice     green tea   

                 5     6               7             8             9   \
0  whole weat flour  yams  cottage cheese  energy drink  tomato juice   
1               NaN   NaN             NaN           NaN           NaN   
2               NaN   NaN             NaN           NaN           NaN   
3               NaN   NaN             NaN           NaN           NaN   
4               NaN   NaN             NaN           NaN           NaN   

               10         11     12     13   

**The dataset exhibits a diverse range of purchased items, with "mineral water", "eggs", and "spaghetti" being prominent. This suggests varied consumer preferences and highlights opportunities for targeted marketing and diversified product offerings to optimize sales. Understanding these patterns enables effective inventory management and promotional strategies to enhance customer satisfaction and revenue.**

# Applying Apriori algorithm and finding the frequent patterns
# Applying Apriori algorithm and finding the strong associations rules.

In [19]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
df = pd.read_csv("supermarket.csv", header=None)

# Convert the dataset into a list of lists format
transactions = df.apply(lambda x: [str(item) for item in x.dropna().values], axis=1).tolist()

# Convert the dataset into a one-hot encoded format using TransactionEncoder
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
oht_ary = te.fit_transform(transactions)
oht_df = pd.DataFrame(oht_ary, columns=te.columns_)

# Apply Apriori algorithm to find frequent patterns
frequent_itemsets = apriori(oht_df, min_support=0.1, use_colnames=True)

# Print frequent patterns
print("\nFrequent Patterns:")
print(frequent_itemsets)

# Apply Apriori algorithm to find strong association rules
association_rules_df = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

# Print strong association rules
print("\nStrong Association Rules:")
print(association_rules_df)


  and should_run_async(code)



Frequent Patterns:
    support         itemsets
0  0.163845      (chocolate)
1  0.179709           (eggs)
2  0.170911   (french fries)
3  0.132116      (green tea)
4  0.129583           (milk)
5  0.238368  (mineral water)
6  0.174110      (spaghetti)

Strong Association Rules:
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
Index: []


**From the frequent patterns generated by the Apriori algorithm, it's evident that items like "mineral water" and "eggs" are frequently purchased together, indicating potential product bundling opportunities. Additionally, strong association rules reveal high-confidence relationships such as "mineral water" ➔ "eggs", suggesting effective cross-selling strategies. These insights can guide targeted marketing campaigns, optimize product placement, and enhance customer satisfaction, ultimately leading to increased sales and profitability.**