In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from apyori import apriori

In [2]:
# order_products = pd.concat([pd.read_csv('../data/order_products__prior.csv'),
#                             pd.read_csv('../data/order_products__train.csv')])
order_products = pd.read_csv('../data/order_products__train.csv')     
products = pd.read_csv('../data/products.csv')
orders = pd.read_csv('../data/orders.csv')
departments = pd.read_csv('../data/departments.csv')
aisles = pd.read_csv('../data/aisles.csv')

In [3]:
# Join products with aisles to get product aisle categories
product_aisles = pd.merge(products, aisles, on="aisle_id", how="left")

# Join product with department to get product department categories
product_aisles_department = pd.merge(product_aisles, departments, on="department_id", how="left")

# Join order_products with products to get product categories
order_products_categories = pd.merge(order_products, product_aisles_department, on="product_id", how="left")

In [4]:
order_products_categories.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs
2,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce
3,1,49683,4,0,Cucumber Kirby,83,4,fresh vegetables,produce
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods


In [5]:
# Top 10 product categories ordered:
product_categories_count = (
    order_products_categories.groupby(["department"])
    .size()
    .reset_index(name="counts")
    .sort_values("counts", ascending=False)
)

In [6]:
product_categories_count.head(10)

Unnamed: 0,department,counts
19,produce,409087
7,dairy eggs,217051
20,snacks,118862
3,beverages,114046
10,frozen,100426
16,pantry,81242
2,bakery,48394
6,canned goods,46799
8,deli,44291
9,dry goods pasta,38713


In [7]:
# Form a list of top 10 categories
top_10_depts = list(product_categories_count.head(10)["department"])
print(top_10_depts)

# Filter order_products to only contain top 10 depts
order_products_filtered = order_products_categories[
    order_products_categories["department"].isin(top_10_depts)
]

order_products_filtered["qty"] = 1

['produce', 'dairy eggs', 'snacks', 'beverages', 'frozen', 'pantry', 'bakery', 'canned goods', 'deli', 'dry goods pasta']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_products_filtered["qty"] = 1


In [8]:
order_products_filtered["department"].unique()

array(['dairy eggs', 'produce', 'canned goods', 'beverages', 'deli',
       'snacks', 'pantry', 'frozen', 'bakery', 'dry goods pasta'],
      dtype=object)

In [9]:
order_products_filtered.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,qty
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs,1
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs,1
2,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce,1
3,1,49683,4,0,Cucumber Kirby,83,4,fresh vegetables,produce,1
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods,1


In [10]:
# save out base df for analysis with top 10 product categories
order_products_filtered.to_csv("../data/order_products_filtered.csv", index=None)

In [1]:
import pandas as pd
order_products_filtered = pd.read_csv("../data/order_products_filtered.csv")

In [18]:
order_products_filtered["product_name"].unique

<bound method Series.unique of 0                                       Bulgarian Yogurt
1          Organic 4% Milk Fat Whole Milk Cottage Cheese
2                                  Organic Celery Hearts
3                                         Cucumber Kirby
4                   Lightly Smoked Sardines in Olive Oil
                               ...                      
1218906                           Natural Artesian Water
1218907                             Twice Baked Potatoes
1218908                  Organic Unsweetened Almond Milk
1218909                             Creamy Peanut Butter
1218910                               Broccoli Florettes
Name: product_name, Length: 1218911, dtype: object>

In [2]:
order_products_filtered.head(5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,qty
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs,1
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs,1
2,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce,1
3,1,49683,4,0,Cucumber Kirby,83,4,fresh vegetables,produce,1
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods,1


In [3]:
order_products_filtered.shape

(1218911, 10)

In [6]:
import dask.dataframe as dd
df = dd.from_pandas(order_products_filtered, chunksize=100000)

In [8]:
df

Unnamed: 0_level_0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,qty
npartitions=13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,int64,int64,int64,int64,object,int64,int64,object,object,int64
100000,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...
1200000,...,...,...,...,...,...,...,...,...,...
1218910,...,...,...,...,...,...,...,...,...,...


In [9]:
order_products_filtered[0:1000]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,qty
200,915,36570,9,1,Town House Mediterranean Herb Pita Crackers,78,19,crackers,snacks,1
201,915,30723,10,1,Tuna Creations Herb & Garlic Tuna,95,15,canned meat seafood,canned goods,1
202,915,19977,11,0,Light & Fit Greek Cherry Yogurt,120,16,yogurt,dairy eggs,1
203,915,9804,12,0,Four Cheese Pizza,79,1,frozen pizza,frozen,1
204,915,37487,13,0,Pure Leaf Unsweetened Iced Tea,94,7,tea,beverages,1
...,...,...,...,...,...,...,...,...,...,...
395,1591,45061,9,0,Natural Vanilla Ice Cream,37,1,ice cream ice,frozen,1
396,1591,31215,10,1,Lemon Yogurt,120,16,yogurt,dairy eggs,1
397,1591,17758,11,0,Strawberry Rhubarb Yoghurt,120,16,yogurt,dairy eggs,1
398,1591,34358,12,1,Garlic,83,4,fresh vegetables,produce,1


In [14]:
basket= (
    df
    .groupby(["order_id", "product_name"])["qty"]
    .sum()
    .compute()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("order_id")
)

ValueError: Unstacked DataFrame is too big, causing int32 overflow

In [13]:
basket

              order_id
product_name  1                                  Bag of Organic Bananas
              1                                        Bulgarian Yogurt
              1                                          Cucumber Kirby
              1                    Lightly Smoked Sardines in Olive Oil
              1           Organic 4% Milk Fat Whole Milk Cottage Cheese
                                              ...                      
qty           3421063                                                 1
              3421063                                                 1
              3421070                                                 1
              3421070                                                 1
              3421070                                                 1
Length: 2437822, dtype: object

In [11]:
basket2= (
    order_products_filtered[1000:2000]
    .groupby(["order_id", "product_name"])["qty"]
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index("order_id")
)

In [13]:
basket2

product_name,0% Greek Strained Yogurt,100% Apple Juice,100% Carrot Juice,100% Cranberry Juice,100% Mighty Mango Juice Smoothie,100% Raw Coconut Water,100% Tangerine Juice,100% Whole Wheat Bread,34% Less Fat than Our Regular Pesto. Reduced Fat Pesto sauce with Basil,Aged White Cheddar Gluten-Free Baked Rice And Corn Puffs,...,Whole Strawberries,Wild Arugula,XL Emerald White Seedless Grapes,Yellow Bell Pepper,Yellow Corn Organic Tortillas,Yo Baby Organic Whole Milk Banana Mango Yogurt,YoBaby Peach Pear Yogurt,Yukon Gold Potatoes,ZBar Organic Chocolate Brownie Energy Snack,Zero Go-Go Mixed Berry Vitamin Water
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3484,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
