# Data and import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [6]:
# Read the three Instacart CSV extracts in one pass; the unpacking order
# matches the order of the paths below.
products, order_products, aisles = (
    pd.read_csv(csv_path)
    for csv_path in (
        'instacart_2017_05_01/products.csv',
        'instacart_2017_05_01/order_products__prior.csv',
        'instacart_2017_05_01/aisles.csv',
    )
)

In [7]:
# Keep only the two key columns; every other column of the raw file is dropped.
order_products = order_products.loc[:, ['order_id', 'product_id']]

In [8]:
# Preview the first rows of the order/product frame.
order_products.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


### Merging data

In [9]:
# Left join so every order line is kept even if its product_id has no match
# in the products table.
order_products = pd.merge(order_products, products, on='product_id', how='left')

In [5]:
# Preview the frame after joining in the product columns.
order_products.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,department_id
0,2,33120,Organic Egg Whites,86,16
1,2,28985,Michigan Organic Kale,83,4
2,2,9327,Garlic Powder,104,13
3,2,45918,Coconut Butter,19,13
4,2,30035,Natural Sweetener,17,13


In [6]:
# Preview the aisles table loaded from aisles.csv.
aisles.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [10]:
# Left join on aisle_id to attach the aisle column from the aisles table.
order_products = pd.merge(order_products, aisles, on='aisle_id', how='left')

In [8]:
# Preview the frame after joining in the aisle column.
order_products.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,department_id,aisle
0,2,33120,Organic Egg Whites,86,16,eggs
1,2,28985,Michigan Organic Kale,83,4,fresh vegetables
2,2,9327,Garlic Powder,104,13,spices seasonings
3,2,45918,Coconut Butter,19,13,oils vinegars
4,2,30035,Natural Sweetener,17,13,baking ingredients


# Find the most paired items

In [10]:
# Look at the first 20 rows of the merged frame before reducing it.
order_products.head(20)

Unnamed: 0,order_id,product_id,product_name,aisle_id,department_id,aisle
0,2,33120,Organic Egg Whites,86,16,eggs
1,2,28985,Michigan Organic Kale,83,4,fresh vegetables
2,2,9327,Garlic Powder,104,13,spices seasonings
3,2,45918,Coconut Butter,19,13,oils vinegars
4,2,30035,Natural Sweetener,17,13,baking ingredients
5,2,17794,Carrots,83,4,fresh vegetables
6,2,40141,Original Unflavored Gelatine Mix,105,13,doughs gelatins bake mixes
7,2,1819,All Natural No Stir Creamy Almond Butter,88,13,spreads
8,2,43668,Classic Blend Cole Slaw,123,4,packaged vegetables fruits
9,3,33754,Total 2% with Strawberry Lowfat Greek Strained...,120,16,yogurt


In [11]:
# Only the order id and the aisle label are kept from here on.
order_products = order_products.loc[:, ['order_id', 'aisle']]
order_products.head(n=10)

Unnamed: 0,order_id,aisle
0,2,eggs
1,2,fresh vegetables
2,2,spices seasonings
3,2,oils vinegars
4,2,baking ingredients
5,2,fresh vegetables
6,2,doughs gelatins bake mixes
7,2,spreads
8,2,packaged vegetables fruits
9,3,yogurt


In [12]:
# Checkpoint the reduced frame to disk so it can be reloaded later.
pd.to_pickle(order_products, "./order_products.pkl")

In [25]:
# Reload the checkpoint written by this notebook and show its row count.
order_products = pd.read_pickle("./order_products.pkl")
len(order_products)

32434489

In [27]:
import itertools


def find_pairings(aisles):
    """Return every unordered pair of entries in ``aisles``.

    Parameters
    ----------
    aisles : iterable
        The aisle labels belonging to one order (a pandas Series when used
        with ``groupby(...).agg``, but any iterable works).

    Returns
    -------
    list of [a, b] lists, or None
        One two-element list per combination, in the order produced by
        ``itertools.combinations``. Repeated labels are kept, so an order
        with two items from the same aisle yields a pair of identical labels.
        ``None`` is returned when there are fewer than two entries, so that
        a later ``dropna`` removes orders that cannot form a pair.
    """
    # Materialise once: lets plain iterators/generators be passed in and
    # makes the length check independent of the container type.
    items = list(aisles)
    # "< 2" rather than "== 1" so an empty input is also reported as
    # "no pairings" (None) instead of an empty list that dropna would keep.
    if len(items) < 2:
        return None
    return [list(pair) for pair in itertools.combinations(items, 2)]

In [28]:
def row_count(df_list):
    """Return the combined number of rows across every frame in ``df_list``.

    Each element only needs a ``shape`` attribute whose first entry is its
    row count. An empty ``df_list`` gives 0.
    """
    return sum(frame.shape[0] for frame in df_list)

In [29]:
import itertools
# Build the list of aisle pairs for each order, then discard the orders for
# which find_pairings returned None.
product_pairings = (
    order_products
    .groupby('order_id')
    .agg(find_pairings)
    .dropna()
)
product_pairings.head()

Unnamed: 0_level_0,aisle
order_id,Unnamed: 1_level_1
2,"[[eggs, fresh vegetables], [eggs, spices seaso..."
3,"[[yogurt, soy lactosefree], [yogurt, packaged ..."
4,"[[breakfast bakery, cold flu allergy], [breakf..."
5,"[[fresh fruits, salad dressing toppings], [fre..."
6,"[[refrigerated, laundry], [refrigerated, air f..."


In [30]:
# Number of rows (one per order_id) that remain in the pairings frame.
product_pairings.shape[0]

3058126

In [31]:
# Cut the pairings frame into five contiguous, near-equal chunks.
# np.array_split is applied to the positional index rather than to the
# DataFrame itself: the chunk boundaries are the same, but this avoids
# NumPy's DataFrame code path, which goes through DataFrame.swapaxes
# (deprecated in recent pandas releases).
pp_p1, pp_p2, pp_p3, pp_p4, pp_p5 = (
    product_pairings.iloc[chunk_positions]
    for chunk_positions in np.array_split(np.arange(len(product_pairings)), 5)
)

In [32]:
# Sanity check: the chunk row counts should add up to the row count of the
# unsplit pairings frame.
split_pp = (pp_p1, pp_p2, pp_p3, pp_p4, pp_p5)
row_count(split_pp)

3058126

In [34]:
# Write each chunk to its own pickle file, numbered from 1:
# pp_p1.pkl, pp_p2.pkl, ... pp_p5.pkl.
split_pp = (pp_p1, pp_p2, pp_p3, pp_p4, pp_p5)
for part_number, chunk in enumerate(split_pp, 1):
    chunk.to_pickle("pp_p" + str(part_number) + ".pkl")

In [3]:
# Reload the five chunk files; the unpacking order matches the file order.
pp_p1, pp_p2, pp_p3, pp_p4, pp_p5 = (
    pd.read_pickle(chunk_file)
    for chunk_file in (
        "pp_p1.pkl",
        "pp_p2.pkl",
        "pp_p3.pkl",
        "pp_p4.pkl",
        "pp_p5.pkl",
    )
)

In [4]:
# Stitch the chunks back together row-wise and show the resulting row count.
product_pairings = pd.concat((pp_p1, pp_p2, pp_p3, pp_p4, pp_p5), axis='index')
len(product_pairings)

3058126

In [12]:
# Distinct aisle labels, in order of first appearance.
all_aisles = pd.unique(order_products['aisle'])

In [13]:
import networkx as nx

g = nx.MultiGraph()
# Register every aisle up front so that an aisle with no pairings still
# appears as a node.
g.add_nodes_from(all_aisles)

# Each row holds the list of aisle pairs for one order; every pair becomes
# its own (parallel) edge in the multigraph.
pairs_per_order = product_pairings['aisle']
for order_pairs in pairs_per_order:
    g.add_edges_from(order_pairs)

In [None]:
import dill

# Serialise the multigraph to disk. The context manager guarantees the file
# is flushed and closed even if dumping fails part-way; the original left the
# handle open for the lifetime of the kernel.
with open('g-multigraph.pkl', 'wb') as graph_file:
    dill.dump(g, graph_file)

In [22]:
# Collapse the multigraph into a simple graph: parallel edges between the
# same two nodes are merged into one edge whose 'weight' is the sum of the
# individual edge weights (an edge without a 'weight' attribute counts 1.0).
M = nx.Graph()
for src, dst, attrs in g.edges(data=True):
    edge_weight = attrs.get('weight', 1.0)
    if not M.has_edge(src, dst):
        # First time this pair is seen: create the edge.
        M.add_edge(src, dst, weight=edge_weight)
        continue
    M[src][dst]['weight'] += edge_weight

KeyboardInterrupt: 

In [None]:
# Rank the edges of M from heaviest to lightest. Each item yielded by
# M.edges(data=True) is a (source, target, data) triple, so the attribute
# dict is at index 2. The original used a tuple-unpacking lambda parameter,
# which is a SyntaxError on Python 3; indexing works on both Python 2 and 3.
weight_list = sorted(M.edges(data=True), key=lambda edge: edge[2]['weight'], reverse=True)

In [None]:
# Pickle output is bytes, so the file must be opened in binary mode ('wb');
# text mode 'w' raises TypeError on Python 3. The context manager also closes
# the handle, which the original never did.
with open('weight_list.pkl', 'wb') as weights_file:
    dill.dump(weight_list, weights_file)

In [None]:
# Completion marker: only printed if the cells run before it did not raise.
print("it worked out")

Maybe instead build one big list of all the pairs first, then add the edges from that list in a single call.