In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
import plotly.figure_factory as ff


In [2]:
mma_mart = pd.read_csv('mma_mart.csv')

In [3]:
mma_mart.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods


In [32]:
mma_mart.shape

(987259, 7)

In [4]:
unique_departments = mma_mart['department'].unique()

In [5]:
unique_aisles = mma_mart['aisle'].unique()

In [6]:
unique_departments


array(['dairy eggs', 'produce', 'canned goods', 'pantry', 'meat seafood',
       'bakery', 'personal care', 'snacks', 'breakfast', 'beverages',
       'deli', 'household', 'international', 'dry goods pasta', 'frozen',
       'babies', 'pets', 'alcohol', 'bulk', 'missing', 'other'],
      dtype=object)

In [7]:
unique_aisles

array(['yogurt', 'other creams cheeses', 'fresh vegetables',
       'canned meat seafood', 'fresh fruits', 'packaged cheese', 'eggs',
       'spices seasonings', 'oils vinegars', 'baking ingredients',
       'doughs gelatins bake mixes', 'spreads',
       'packaged vegetables fruits', 'soy lactosefree', 'poultry counter',
       'bread', 'breakfast bakery', 'cold flu allergy',
       'energy granola bars', 'breakfast bars pastries', 'chips pretzels',
       'trail mix snack mix', 'crackers', 'refrigerated',
       'energy sports drinks', 'salad dressing toppings',
       'prepared soups salads', 'milk', 'paper goods',
       'water seltzer sparkling water', 'kosher foods',
       'packaged poultry', 'instant foods', 'packaged produce',
       'cookies cakes', 'candy chocolate', 'body lotions soap',
       'dry pasta', 'laundry', 'air fresheners candles', 'frozen produce',
       'buns rolls', 'canned fruit applesauce', 'juice nectars',
       'granola', 'fresh herbs', 'baby food formul

In [8]:
refrigerated_departments = ['dairy eggs', 'deli', 'meat seafood']
frozen_departments = ['frozen']
other_data = mma_mart[~mma_mart['department'].isin(refrigerated_departments + frozen_departments)]

other_aisles = other_data['aisle'].unique()
other_departments = other_data['department'].unique()
other_departments


array(['produce', 'canned goods', 'pantry', 'bakery', 'personal care',
       'snacks', 'breakfast', 'beverages', 'household', 'international',
       'dry goods pasta', 'babies', 'pets', 'alcohol', 'bulk', 'missing',
       'other'], dtype=object)

In [9]:
# Keywords for refrigerated aisles
refrigerated_aisles_keywords = ['fresh', 'meat', 'seafood', 'coolers', 'soft drinks', 'soft drink', 'beer']
frozen_aisles_keywords = ['frozen']
                                
# Initialize arrays
refrigerated_aisles = []
frozen_aisles = []
normal_aisles = other_aisles
other_aisles=[]

# Iterate through the aisles and categorize them
for aisle in normal_aisles:
    if any(keyword in aisle.lower() for keyword in refrigerated_aisles_keywords):
        refrigerated_aisles.append(aisle)
    elif any(keyword in aisle.lower() for keyword in frozen_aisles_keywords):
        frozen_aisles.append(aisle)
    else:
        other_aisles.append(aisle)

In [10]:
frozen_aisles

[]

In [11]:
refrigerated_aisles

['fresh vegetables',
 'canned meat seafood',
 'fresh fruits',
 'air fresheners candles',
 'fresh herbs',
 'beers coolers',
 'soft drinks',
 'marinades meat preparation',
 'fresh pasta']

In [12]:
other_aisles

['spices seasonings',
 'oils vinegars',
 'baking ingredients',
 'doughs gelatins bake mixes',
 'spreads',
 'packaged vegetables fruits',
 'bread',
 'breakfast bakery',
 'cold flu allergy',
 'energy granola bars',
 'breakfast bars pastries',
 'chips pretzels',
 'trail mix snack mix',
 'crackers',
 'refrigerated',
 'energy sports drinks',
 'salad dressing toppings',
 'paper goods',
 'water seltzer sparkling water',
 'kosher foods',
 'instant foods',
 'packaged produce',
 'cookies cakes',
 'candy chocolate',
 'body lotions soap',
 'dry pasta',
 'laundry',
 'buns rolls',
 'canned fruit applesauce',
 'juice nectars',
 'granola',
 'baby food formula',
 'canned meals beans',
 'soup broth bouillon',
 'dog food care',
 'preserved dips spreads',
 'spirits',
 'coffee',
 'cereal',
 'asian foods',
 'soap',
 'popcorn jerky',
 'bulk dried fruits vegetables',
 'condiments',
 'nuts seeds dried fruit',
 'food storage',
 'oral hygiene',
 'canned jarred vegetables',
 'pasta sauce',
 'tea',
 'grains rice d

In [13]:
# Define the two entries you want to move
entry1 = 'air fresheners candles'
entry2 = 'marinades meat preparation'

# Check if the entries are in refrigerated_aisles
if entry1 in refrigerated_aisles:
    refrigerated_aisles.remove(entry1)
    other_aisles.append(entry1)

if entry2 in refrigerated_aisles:
    refrigerated_aisles.remove(entry2)
    other_aisles.append(entry2)


In [14]:
refrigerated_aisles

['fresh vegetables',
 'canned meat seafood',
 'fresh fruits',
 'fresh herbs',
 'beers coolers',
 'soft drinks',
 'fresh pasta']

In [15]:
other_products = mma_mart[
    ~mma_mart['department'].isin(refrigerated_departments + frozen_departments) &
    ~mma_mart['aisle'].isin(refrigerated_aisles)
]['product_name'].tolist()

In [16]:
other_products

['Garlic Powder',
 'Coconut Butter',
 'Natural Sweetener',
 'Original Unflavored Gelatine Mix',
 'All Natural No Stir Creamy Almond Butter',
 'Classic Blend Cole Slaw',
 'Lemons',
 'Organic Baby Spinach',
 'Organic Ezekiel 49 Bread Cinnamon Raisin',
 'Plain Pre-Sliced Bagels',
 'Honey/Lemon Cough Drops',
 'Chewy 25% Low Sugar Chocolate Chip Granola',
 'Oats & Chocolate Chewy Bars',
 "Kellogg's Nutri-Grain Apple Cinnamon Cereal",
 'Nutri-Grain Soft Baked Strawberry Cereal Breakfast Bars',
 "Kellogg's Nutri-Grain Blueberry Cereal",
 'Tiny Twists Pretzels',
 'Traditional Snack Mix',
 'Goldfish Cheddar Baked Snack Crackers',
 'Original Orange Juice',
 'Sugarfree Energy Drink',
 'Energy Drink',
 'Just Crisp, Parmesan',
 'Organic Raspberries',
 'Sensitive Toilet Paper',
 'Natural Artesian Water, Mini & Mobile',
 'Matzos, Thin, Tea',
 'Macaroni And Cheese',
 'Clementines',
 "Biscuits Orange Pim's",
 'Dairy Milk Fruit & Nut Chocolate Bar',
 'Apricot Preserves',
 'One Ply Choose A Size Big Roll

In [17]:
len(other_products)

459237

In [18]:
other_products = list(set(other_products))
len(other_products)

25515

In [19]:
other_products

['Coconut Water Organic',
 'Zero Lemon‑Lime Soda',
 'Bag of Organic Fuji Apples',
 'Chipotle Taco Seasoning Mix',
 'Sun Spray Lotion',
 'Light 50 Cranberry',
 'More Milk Plus Herbal capsules',
 'Apple Cinnamon Instant Oatmeal',
 'Gluten Free Chocolate Brownie Mix',
 'Turkey Jerky',
 "Shake 'N Pour Buttermilk Pancake Mix",
 'The Original Decaf Ground Coffee',
 'White Stevia Powder',
 'Milk Chocolate Toffee Bits',
 'Crispy Cheddar Crackers',
 'Butternut Squash Oatmeal with Maple Organic Baby Food',
 'Elbow Macaroni',
 'Horseradish Mustard',
 'Chunky Chocolate Chip Cookies',
 'Steak & Burger Seasoning',
 'After The Rain Liquid Laundry Detergent',
 'Baby Healing Ointment',
 'Unsweetened Iced Coffee',
 'Darjeeling Black Tea Blend',
 'Freezer Bag Gallon',
 'Funfetti Premium Cake Mix With Candy Bits',
 'Grissini Torinesi Thin Breadsticks',
 'Chunky Garden, Tomato, Garlic & Onion Pasta Sauce',
 'Grapeseed Spray Oil',
 'Cranberry Rescue Pastilles',
 'Moroccan Chickpea & Carrot Soup',
 'Creamy N

In [20]:
# Define refrigeration-related keywords
refrigeration_keywords = ["refrigerated", "dairy", "milk", "yogurt","cheese", "microwaveable","mushroom",
                          "mushrooms","frosting","frost"]

frozen_keywords = ["frozen dessert","ice cream","frozen"]

# Initialize a list to store "other products"
normal_products = []
other_frozen_products =[]
other_ref_products =[]

# Iterate through product names and filter based on keywords
for product_name in other_products:
    if not any(keyword in product_name.lower() for keyword in refrigeration_keywords or keyword in frozen_keywords):
        normal_products.append(product_name)

for product_name in other_products:
    if any(keyword in product_name.lower() for keyword in frozen_keywords):
        other_frozen_products.append(product_name)
        
for product_name in other_products:
    if any(keyword in product_name.lower() for keyword in refrigeration_keywords):
        other_ref_products.append(product_name)

In [21]:
normal_products

['Coconut Water Organic',
 'Zero Lemon‑Lime Soda',
 'Bag of Organic Fuji Apples',
 'Chipotle Taco Seasoning Mix',
 'Sun Spray Lotion',
 'Light 50 Cranberry',
 'Apple Cinnamon Instant Oatmeal',
 'Gluten Free Chocolate Brownie Mix',
 'Turkey Jerky',
 'The Original Decaf Ground Coffee',
 'White Stevia Powder',
 'Crispy Cheddar Crackers',
 'Butternut Squash Oatmeal with Maple Organic Baby Food',
 'Elbow Macaroni',
 'Horseradish Mustard',
 'Chunky Chocolate Chip Cookies',
 'Steak & Burger Seasoning',
 'After The Rain Liquid Laundry Detergent',
 'Baby Healing Ointment',
 'Unsweetened Iced Coffee',
 'Darjeeling Black Tea Blend',
 'Freezer Bag Gallon',
 'Funfetti Premium Cake Mix With Candy Bits',
 'Grissini Torinesi Thin Breadsticks',
 'Chunky Garden, Tomato, Garlic & Onion Pasta Sauce',
 'Grapeseed Spray Oil',
 'Cranberry Rescue Pastilles',
 'Moroccan Chickpea & Carrot Soup',
 'Creamy Natural Peanut Butter And Flaxseed',
 'Sonoma Brut Sparkling Wine',
 'Reading 3 Way Indoor 50/200/250 Watts 

In [22]:
frozen_df = mma_mart[(mma_mart['department'] == 'frozen')|
                     (mma_mart['product_name'].isin(other_frozen_products))]

In [23]:
frozen_df.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
68,7,46802,Pineapple Chunks,116,frozen produce,1,frozen
100,11,30162,Teriyaki & Pineapple Chicken Meatballs,38,frozen meals,1,frozen
114,12,38050,All Natural Boneless Skinless Chicken Breasts,34,frozen meat seafood,1,frozen
118,12,29471,Combination Pizza Rolls,129,frozen appetizers sides,1,frozen
135,14,162,Organic Mini Homestyle Waffles,52,frozen breakfast,1,frozen


In [24]:
refrigerated_df = mma_mart[
    (~mma_mart.index.isin(frozen_df.index)) &
    ((mma_mart['department'].isin(refrigerated_departments ))|
    (mma_mart['aisle'].isin(refrigerated_aisles)) |
    (mma_mart['product_name'].isin(other_ref_products)))
]

In [25]:
refrigerated_df.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods


In [26]:
other_df = mma_mart[~mma_mart.index.isin(frozen_df.index) & ~mma_mart.index.isin(refrigerated_df.index)]

In [27]:
other_df.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
10,2,9327,Garlic Powder,104,spices seasonings,13,pantry
11,2,45918,Coconut Butter,19,oils vinegars,13,pantry
12,2,30035,Natural Sweetener,17,baking ingredients,13,pantry
14,2,40141,Original Unflavored Gelatine Mix,105,doughs gelatins bake mixes,13,pantry
15,2,1819,All Natural No Stir Creamy Almond Butter,88,spreads,13,pantry


In [28]:
other_df.shape

(441874, 7)

In [29]:
refrigerated_df.shape

(476874, 7)

In [30]:
frozen_df.shape

(68511, 7)

In [31]:
# Save 'frozen_df' as a CSV file
frozen_df.to_csv('frozen_df.csv', index=False)

# Save 'refrigerated_df' as a CSV file
refrigerated_df.to_csv('refrigerated_df.csv', index=False)

# Save 'other_df' as a CSV file
other_df.to_csv('other_df.csv', index=False)


In [33]:
refrigerated_df['department'].unique()

array(['dairy eggs', 'produce', 'canned goods', 'meat seafood', 'deli',
       'dry goods pasta', 'snacks', 'babies', 'alcohol', 'beverages',
       'pantry', 'breakfast', 'bakery', 'household', 'international',
       'personal care', 'other', 'missing', 'pets'], dtype=object)