In [67]:
# Import libraries
import pandas as pd

# Read the raw Megastore export into a DataFrame
df = pd.read_csv("Megastore_Dataset_Task_3_3.csv")  # Update the filename if needed

# Normalise the header: trim surrounding whitespace, then drop every plain
# space and non-breaking space that is left inside a column name
column_names = df.columns.str.strip()
for unwanted in (" ", "\u00a0"):
    column_names = column_names.str.replace(unwanted, "")
df.columns = column_names

In [68]:
# Encode ordinal variables.
# Each map assigns integer codes that increase with the level of the label,
# so the numeric order of the codes matches the ordinal order of the labels.
order_priority_map = {'Low': 1, 'Medium': 2, 'High': 3}
satisfaction_map = {
    # NOTE(review): non-response is coded 0, i.e. below every real rating.
    # Confirm this is intended rather than treating it as missing.
    'Prefer not to answer': 0,
    # 'Very dissatisfied' is the lowest rating, so it must be coded below
    # 'Dissatisfied' for the encoding to be ordinal.
    'Very dissatisfied': 1,
    'Dissatisfied': 2,
    'Satisfied': 3,
    'Very Satisfied': 4
}
# Series.map yields NaN for any value that is not a key of the mapping, so the
# label spelling/casing above must match the dataset exactly (TODO confirm).
# The columns are overwritten in place: re-running this cell without
# reloading df maps the integer codes again and turns both columns into NaN.
df['OrderPriority'] = df['OrderPriority'].map(order_priority_map)
df['CustomerOrderSatisfaction'] = df['CustomerOrderSatisfaction'].map(satisfaction_map)

# Encode nominal variables with one-hot encoding.
# get_dummies returns a new frame; df itself keeps its Region and Segment columns.
df_encoded = pd.get_dummies(df, columns=['Region', 'Segment'], prefix=['Region', 'Segment'])

# Save the encoded dataset (the row index is not written)
df_encoded.to_csv("d599_task3_encoded_dataset.csv", index=False)

In [69]:
# Transactional dataset prep & creation
# Keep only the rows that belong to Corporate customers in the Northeast region
is_corporate = df['Segment'] == 'Corporate'
is_northeast = df['Region'] == 'Northeast'
filtered_df = df[is_corporate & is_northeast]

In [70]:
# Total quantity for every (order, product) pair
quantity_per_pair = filtered_df.groupby(['OrderID', 'ProductName'])['Quantity'].sum()

# Spread products across the columns: one row per order, one column per product
order_by_product = quantity_per_pair.unstack()

# Products missing from an order come out of unstack() as NaN; record them as 0
# and keep OrderID as the row index
basket_df = order_by_product.reset_index().fillna(0).set_index('OrderID')

In [71]:
# Convert quantities to binary (1 = purchased, 0 = not purchased).
# A vectorised comparison replaces the per-cell Python lambda: it produces the
# same 0/1 integers without calling a Python function for every cell, and it
# does not depend on DataFrame.map, which only exists in pandas >= 2.1.
basket_encoded = (basket_df > 0).astype('int64')

In [72]:
# Write the 0/1 basket matrix to disk; the OrderID index is written as the first column
transactional_csv = "d599_task3_transactional_dataset.csv"
basket_encoded.to_csv(transactional_csv)

In [73]:
# Show a 10x10 window (rows 10-19, columns 10-19) as evidence that the basket holds 0s and 1s
window = slice(10, 20)
basket_encoded.iloc[window, window]

ProductName,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,16 PIECE CUTLERY SET PANTRY DESIGN,18PC WOODEN CUTLERY SET DISPOSABLE,20 DOLLY PEGS RETROSPOT,200 RED WHITE BENDY STRAWS,3 HOOK HANGER MAGIC GARDEN,3 PIECE SPACEBOY COOKIE CUTTER SET
OrderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
541981,0,0,0,0,0,0,0,0,0,0
542433,0,0,0,0,0,0,0,0,0,1
542629,0,0,0,0,0,0,0,0,0,0
542735,0,0,0,0,0,0,0,0,0,0
542922,0,0,0,0,0,0,0,0,0,0
543030,0,0,0,0,0,0,0,0,0,0
544069,0,0,1,1,0,0,0,0,0,0
544115,1,0,0,0,0,0,0,0,0,0
544200,0,0,0,0,0,0,0,0,0,0
544585,0,0,0,0,0,0,0,0,0,0


In [74]:
# Import libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [75]:
# Reload the saved transactional basket, using the first CSV column as the row index
basket_path = "d599_task3_transactional_dataset.csv"
basket_encoded = pd.read_csv(basket_path, index_col=0)

In [76]:
# Cast every column to bool, the input type mlxtend prefers
basket_encoded = basket_encoded.astype("bool")

In [77]:
# Every column is one product, so the column count is the number of unique products
product_count = len(basket_encoded.columns)
print("Number of unique products:", product_count)

Number of unique products: 964


In [78]:
# Mine frequent itemsets with Apriori, using a minimum support of 0.05 and
# labelling itemsets by column (product) name
support_floor = 0.05
frequent_itemsets = apriori(
    basket_encoded,
    min_support=support_floor,
    use_colnames=True,
)

In [79]:
# Derive association rules from the frequent itemsets, filtering on lift with a threshold of 1
lift_floor = 1
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=lift_floor,
)

In [80]:
# Announce completion, then preview the first five rows of the rules table
print("✅ Apriori rules generated successfully!")
rules.head(n=5)

✅ Apriori rules generated successfully!


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(ALARM CLOCK BAKELIKE ORANGE),(ALARM CLOCK BAKELIKE GREEN),0.05042,0.10084,0.05042,1.0,9.916667,1.0,0.045336,inf,0.946903,0.5,1.0,0.75
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE ORANGE),0.10084,0.05042,0.05042,0.5,9.916667,1.0,0.045336,1.89916,1.0,0.5,0.473451,0.75
2,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE GREEN),0.084034,0.10084,0.067227,0.8,7.933333,1.0,0.058753,4.495798,0.954128,0.571429,0.77757,0.733333
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE PINK),0.10084,0.084034,0.067227,0.666667,7.933333,1.0,0.058753,2.747899,0.971963,0.571429,0.636086,0.733333
4,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.084034,0.10084,0.07563,0.9,8.925,1.0,0.067156,8.991597,0.969419,0.692308,0.888785,0.825


In [81]:
# Show the first ten rules, restricted to the rule sides and the three headline metrics
summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules[summary_columns].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(ALARM CLOCK BAKELIKE ORANGE),(ALARM CLOCK BAKELIKE GREEN),0.05042,1.0,9.916667
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE ORANGE),0.05042,0.5,9.916667
2,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE GREEN),0.067227,0.8,7.933333
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE PINK),0.067227,0.666667,7.933333
4,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.07563,0.9,8.925
5,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.07563,0.75,8.925
6,(ALARM CLOCK BAKELIKE GREEN),(ROUND SNACK BOXES SET OF4 WOODLAND ),0.07563,0.75,3.880435
7,(ROUND SNACK BOXES SET OF4 WOODLAND ),(ALARM CLOCK BAKELIKE GREEN),0.07563,0.391304,3.880435
8,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE PINK),0.067227,0.8,9.52
9,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE RED ),0.067227,0.8,9.52
