In [1]:
import pandas as pd

In [2]:
# Load the order/product table into a DataFrame. The path is relative, so
# 'mma_mart.csv' must sit in the notebook's working directory.
mma_mart = pd.read_csv('mma_mart.csv')

In [3]:
# Preview the first rows to check the column layout.
mma_mart.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods


In [4]:
# (rows, columns) of the loaded table.
mma_mart.shape

(987259, 7)

In [5]:
# Distinct department labels; the department-level split further down is
# defined in terms of these values.
mma_mart['department'].unique()

array(['dairy eggs', 'produce', 'canned goods', 'pantry', 'meat seafood',
       'bakery', 'personal care', 'snacks', 'breakfast', 'beverages',
       'deli', 'household', 'international', 'dry goods pasta', 'frozen',
       'babies', 'pets', 'alcohol', 'bulk', 'missing', 'other'],
      dtype=object)

In [6]:
# Distinct aisle labels; the aisle-keyword refinement further down matches
# substrings of these values.
mma_mart['aisle'].unique()

array(['yogurt', 'other creams cheeses', 'fresh vegetables',
       'canned meat seafood', 'fresh fruits', 'packaged cheese', 'eggs',
       'spices seasonings', 'oils vinegars', 'baking ingredients',
       'doughs gelatins bake mixes', 'spreads',
       'packaged vegetables fruits', 'soy lactosefree', 'poultry counter',
       'bread', 'breakfast bakery', 'cold flu allergy',
       'energy granola bars', 'breakfast bars pastries', 'chips pretzels',
       'trail mix snack mix', 'crackers', 'refrigerated',
       'energy sports drinks', 'salad dressing toppings',
       'prepared soups salads', 'milk', 'paper goods',
       'water seltzer sparkling water', 'kosher foods',
       'packaged poultry', 'instant foods', 'packaged produce',
       'cookies cakes', 'candy chocolate', 'body lotions soap',
       'dry pasta', 'laundry', 'air fresheners candles', 'frozen produce',
       'buns rolls', 'canned fruit applesauce', 'juice nectars',
       'granola', 'fresh herbs', 'baby food formul

In [7]:
# Departments assigned wholesale to a storage category: every row whose
# 'department' equals one of these labels goes to that category's frame.
# Both names must stay lists because they are concatenated with `+` later.
refrigerated_departments = [
    'dairy eggs',
    'deli',
    'meat seafood',
]
frozen_departments = [
    'frozen',
]

In [8]:
# Split the table by department into refrigerated, frozen and everything else.
# Each membership test is evaluated once and reused.
is_ref_dept = mma_mart['department'].isin(refrigerated_departments)
is_frozen_dept = mma_mart['department'].isin(frozen_departments)

ref_data = mma_mart[is_ref_dept]
frozen_data = mma_mart[is_frozen_dept]
# "Other" is whatever belongs to neither department list.
other_data = mma_mart[~(is_ref_dept | is_frozen_dept)]


In [9]:
# Sanity check: list the aisles that landed in each frame after the
# department-level split.
ref_aisles = ref_data['aisle'].unique()
frozen_aisles = frozen_data['aisle'].unique()
other_aisles = other_data['aisle'].unique()

print(f"Aisles in ref data: {ref_aisles}")
print(f"Aisles in frozen data: {frozen_aisles}")
print(f"Aisles in other data: {other_aisles}")

Aisles in ref data: ['yogurt' 'other creams cheeses' 'packaged cheese' 'eggs'
 'soy lactosefree' 'poultry counter' 'prepared soups salads' 'milk'
 'packaged poultry' 'cream' 'meat counter' 'fresh dips tapenades' 'butter'
 'hot dogs bacon sausage' 'prepared meals' 'tofu meat alternatives'
 'lunch meat' 'refrigerated pudding desserts' 'specialty cheeses'
 'seafood counter' 'packaged seafood' 'packaged meat']
Aisles in frozen data: ['frozen produce' 'frozen meals' 'frozen meat seafood'
 'frozen appetizers sides' 'frozen breakfast' 'frozen breads doughs'
 'frozen vegan vegetarian' 'ice cream ice' 'frozen pizza' 'frozen dessert'
 'frozen juice']
Aisles in other data: ['fresh vegetables' 'canned meat seafood' 'fresh fruits'
 'spices seasonings' 'oils vinegars' 'baking ingredients'
 'doughs gelatins bake mixes' 'spreads' 'packaged vegetables fruits'
 'bread' 'breakfast bakery' 'cold flu allergy' 'energy granola bars'
 'breakfast bars pastries' 'chips pretzels' 'trail mix snack mix'
 'crackers

In [10]:
# Sanity check: each frame should contain only the departments it was
# filtered on.
ref_departments = ref_data['department'].unique()
frozen_departments_seen = frozen_data['department'].unique()

print(f"department in ref data: {ref_departments}")
print(f"department in frozen data: {frozen_departments_seen}")


department in ref data: ['dairy eggs' 'meat seafood' 'deli']
department in frozen data: ['frozen']


In [11]:
# Aisle-name keywords used to refine the department split. The filtering cell
# matches each keyword as a substring of 'aisle' via `str.contains`, in the
# order listed here.
# NOTE(review): substring matching is broad. In this dataset 'fresh' also
# matches 'air fresheners candles', and 'meat' matches 'marinades meat
# preparation' and 'canned meat seafood' (see the aisle listing printed after
# the filtering cell). Confirm each of those is intended.
refrigerated_aisles_keywords = [
    'fresh',
    'meat',
    'seafood',
    'coolers',
    'soft drinks',
    'soft drink',
    'beer',
]
frozen_aisles_keywords = [
    'frozen',
]

In [12]:
def _move_rows_by_aisle_keyword(source, target, keywords):
    """Move rows whose 'aisle' contains a keyword from `source` to `target`.

    Keywords are processed in order. Rows matched by a keyword are appended to
    the end of `target` and dropped from `source` before the next keyword is
    tried, so the resulting row order is: existing target rows, then matches
    for the first keyword, then matches for the second, and so on.

    Parameters
    ----------
    source : pandas.DataFrame
        Frame to take rows from; must have an 'aisle' column of strings.
    target : pandas.DataFrame
        Frame the matching rows are appended to.
    keywords : iterable of str
        Literal substrings to look for in the aisle name.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The reduced source frame and the extended target frame, in that order.
    """
    for keyword in keywords:
        # Build the mask once per keyword and reuse it for both sides.
        # regex=False: keywords are plain text, not patterns.
        # na=False: a missing aisle counts as "no match" and stays in `source`,
        # rather than producing an NA mask that cannot be used for indexing.
        matches = source['aisle'].str.contains(keyword, regex=False, na=False)
        target = pd.concat([target, source[matches]])
        source = source[~matches]
    return source, target


# Refine the department split: pull refrigerated aisles out of other_data first,
# then frozen aisles out of whatever remains.
other_data, ref_data = _move_rows_by_aisle_keyword(
    other_data, ref_data, refrigerated_aisles_keywords
)
other_data, frozen_data = _move_rows_by_aisle_keyword(
    other_data, frozen_data, frozen_aisles_keywords
)

In [13]:
# Show which departments contributed rows to each frame after the
# aisle-keyword pass.
ref_departments = ref_data['department'].unique()
frozen_departments_seen = frozen_data['department'].unique()

print(f"department in ref data: {ref_departments}")
print(f"department in frozen data: {frozen_departments_seen}")


department in ref data: ['dairy eggs' 'meat seafood' 'deli' 'produce' 'household'
 'dry goods pasta' 'canned goods' 'pantry' 'alcohol' 'beverages']
department in frozen data: ['frozen']


In [14]:
# List the aisles in each frame after the aisle-keyword pass, to spot any
# aisle pulled in by an over-broad keyword.
ref_aisles = ref_data['aisle'].unique()
frozen_aisles = frozen_data['aisle'].unique()

print(f"Aisles in ref data: {ref_aisles}")
print(f"Aisles in frozen data: {frozen_aisles}")


Aisles in ref data: ['yogurt' 'other creams cheeses' 'packaged cheese' 'eggs'
 'soy lactosefree' 'poultry counter' 'prepared soups salads' 'milk'
 'packaged poultry' 'cream' 'meat counter' 'fresh dips tapenades' 'butter'
 'hot dogs bacon sausage' 'prepared meals' 'tofu meat alternatives'
 'lunch meat' 'refrigerated pudding desserts' 'specialty cheeses'
 'seafood counter' 'packaged seafood' 'packaged meat' 'fresh vegetables'
 'fresh fruits' 'air fresheners candles' 'fresh herbs' 'fresh pasta'
 'canned meat seafood' 'marinades meat preparation' 'beers coolers'
 'soft drinks']
Aisles in frozen data: ['frozen produce' 'frozen meals' 'frozen meat seafood'
 'frozen appetizers sides' 'frozen breakfast' 'frozen breads doughs'
 'frozen vegan vegetarian' 'ice cream ice' 'frozen pizza' 'frozen dessert'
 'frozen juice']


In [15]:
# Aisles that the keyword pass placed in ref_data but that should go back to
# other_data (both appear in the ref_data aisle listing printed above).
aisles_to_move_to_other = ['air fresheners candles', 'marinades meat preparation']

# Evaluate the membership test once and use it for both halves of the split.
is_misfiled = ref_data['aisle'].isin(aisles_to_move_to_other)
removed_data = ref_data[is_misfiled]
ref_data = ref_data[~is_misfiled]

# Append the moved rows to the end of other_data.
other_data = pd.concat([other_data, removed_data])

In [16]:
# Confirm the manually moved aisles no longer appear in ref_data.
ref_aisles = ref_data['aisle'].unique()
frozen_aisles = frozen_data['aisle'].unique()

print(f"Aisles in ref data: {ref_aisles}")
print(f"Aisles in frozen data: {frozen_aisles}")

Aisles in ref data: ['yogurt' 'other creams cheeses' 'packaged cheese' 'eggs'
 'soy lactosefree' 'poultry counter' 'prepared soups salads' 'milk'
 'packaged poultry' 'cream' 'meat counter' 'fresh dips tapenades' 'butter'
 'hot dogs bacon sausage' 'prepared meals' 'tofu meat alternatives'
 'lunch meat' 'refrigerated pudding desserts' 'specialty cheeses'
 'seafood counter' 'packaged seafood' 'packaged meat' 'fresh vegetables'
 'fresh fruits' 'fresh herbs' 'fresh pasta' 'canned meat seafood'
 'beers coolers' 'soft drinks']
Aisles in frozen data: ['frozen produce' 'frozen meals' 'frozen meat seafood'
 'frozen appetizers sides' 'frozen breakfast' 'frozen breads doughs'
 'frozen vegan vegetarian' 'ice cream ice' 'frozen pizza' 'frozen dessert'
 'frozen juice']


In [17]:
# Distinct product names still left in other_data.
unique_product_names = other_data['product_name'].unique()

# Dump them one per line, UTF-8 encoded, so the list can be inspected outside
# the notebook.
with open('unique_product_names.txt', 'w', encoding='utf-8') as file:
    file.writelines(name + '\n' for name in unique_product_names)

# Tell the reader where the list went.
print("Unique product names have been saved to 'unique_product_names.txt'")

Unique product names have been saved to 'unique_product_names.txt'


In [18]:
# Give each frame a fresh 0..n-1 index; the old labels (inherited from
# mma_mart and no longer contiguous after filtering/concatenation) are dropped.
# Reassign rather than using inplace=True, so the cell follows the same
# "produce a new frame" pattern as the rest of the notebook.
ref_data = ref_data.reset_index(drop=True)
frozen_data = frozen_data.reset_index(drop=True)
other_data = other_data.reset_index(drop=True)

In [19]:
# Write each split to its own CSV in the working directory. index=False keeps
# the row index out of the files.
for csv_path, frame in (
    ('frozen_df.csv', frozen_data),
    ('refrigerated_df.csv', ref_data),
    ('other_df.csv', other_data),
):
    frame.to_csv(csv_path, index=False)

In [20]:
# Final aisle listing for the frames that were just saved.
ref_aisles = ref_data['aisle'].unique()
frozen_aisles = frozen_data['aisle'].unique()

print(f"Aisles in ref data: {ref_aisles}")
print(f"Aisles in frozen data: {frozen_aisles}")


Aisles in ref data: ['yogurt' 'other creams cheeses' 'packaged cheese' 'eggs'
 'soy lactosefree' 'poultry counter' 'prepared soups salads' 'milk'
 'packaged poultry' 'cream' 'meat counter' 'fresh dips tapenades' 'butter'
 'hot dogs bacon sausage' 'prepared meals' 'tofu meat alternatives'
 'lunch meat' 'refrigerated pudding desserts' 'specialty cheeses'
 'seafood counter' 'packaged seafood' 'packaged meat' 'fresh vegetables'
 'fresh fruits' 'fresh herbs' 'fresh pasta' 'canned meat seafood'
 'beers coolers' 'soft drinks']
Aisles in frozen data: ['frozen produce' 'frozen meals' 'frozen meat seafood'
 'frozen appetizers sides' 'frozen breakfast' 'frozen breads doughs'
 'frozen vegan vegetarian' 'ice cream ice' 'frozen pizza' 'frozen dessert'
 'frozen juice']


In [21]:
# Rows of ref_data that belong to the 'produce' department, and the distinct
# product names among them.
is_produce = ref_data['department'].eq('produce')
produce_data = ref_data.loc[is_produce]
produce_products = produce_data['product_name'].unique()

In [22]:
# Number of distinct produce product names in ref_data.
len(produce_products)

883

In [23]:
# Preview the produce subset. The row labels come from ref_data's reset index,
# so they are positions within ref_data, not within mma_mart.
produce_data.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
218064,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce
218065,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce
218066,1,13176,Bag of Organic Bananas,24,fresh fruits,4,produce
218067,1,47209,Organic Hass Avocado,24,fresh fruits,4,produce
218068,2,28985,Michigan Organic Kale,83,fresh vegetables,4,produce
