# Data and imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the three raw CSV tables used below: products, the order line
# items from the `order_products__train` file, and the aisle lookup.
products = pd.read_csv('instacart_2017_05_01/products.csv')
order_products = pd.read_csv('instacart_2017_05_01/order_products__train.csv')
aisles = pd.read_csv('instacart_2017_05_01/aisles.csv')

In [3]:
# Keep only the order and product identifiers; every other column is dropped.
order_products = order_products[['order_id', 'product_id']]

In [4]:
# Preview the first rows of the trimmed frame.
order_products.head()

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


### Merging data

In [5]:
# Attach the product columns to every order line. The inner join keeps only
# rows whose product_id is present in both frames.
order_products = order_products.merge(products, on='product_id', how='inner')

In [6]:
# Preview the frame after the product merge.
order_products.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,department_id
0,1,49302,Bulgarian Yogurt,120,16
1,816049,49302,Bulgarian Yogurt,120,16
2,1242203,49302,Bulgarian Yogurt,120,16
3,1383349,49302,Bulgarian Yogurt,120,16
4,1787378,49302,Bulgarian Yogurt,120,16


In [8]:
# Preview the aisle lookup table (aisle_id -> aisle name).
aisles.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [9]:
# Add the aisle name to every order line by joining on aisle_id (inner join).
order_products = order_products.merge(aisles, on='aisle_id', how='inner')

In [10]:
# Preview the frame after the aisle merge.
order_products.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,department_id,aisle
0,1,49302,Bulgarian Yogurt,120,16,yogurt
1,816049,49302,Bulgarian Yogurt,120,16,yogurt
2,1242203,49302,Bulgarian Yogurt,120,16,yogurt
3,1383349,49302,Bulgarian Yogurt,120,16,yogurt
4,1787378,49302,Bulgarian Yogurt,120,16,yogurt


# Find the most paired items

In [11]:
# Look at the first 20 rows of the fully merged frame.
order_products.head(20)

Unnamed: 0,order_id,product_id,product_name,aisle_id,department_id,aisle
0,1,49302,Bulgarian Yogurt,120,16,yogurt
1,816049,49302,Bulgarian Yogurt,120,16,yogurt
2,1242203,49302,Bulgarian Yogurt,120,16,yogurt
3,1383349,49302,Bulgarian Yogurt,120,16,yogurt
4,1787378,49302,Bulgarian Yogurt,120,16,yogurt
5,2445303,49302,Bulgarian Yogurt,120,16,yogurt
6,2853065,49302,Bulgarian Yogurt,120,16,yogurt
7,3231517,49302,Bulgarian Yogurt,120,16,yogurt
8,98,43654,Whole Milk Greek Blended Vanilla Bean Yogurt,120,16,yogurt
9,13746,43654,Whole Milk Greek Blended Vanilla Bean Yogurt,120,16,yogurt


In [12]:
# Reduce the frame to one (order_id, aisle) row per order line.
# NOTE: this overwrites `order_products`, so the merge cells above cannot be
# re-run afterwards without reloading the data first (the join keys are gone).
order_products = order_products[['order_id', 'aisle']]
order_products.head(10)

Unnamed: 0,order_id,aisle
0,1,yogurt
1,816049,yogurt
2,1242203,yogurt
3,1383349,yogurt
4,1787378,yogurt
5,2445303,yogurt
6,2853065,yogurt
7,3231517,yogurt
8,98,yogurt
9,13746,yogurt


In [14]:
# Cache the reduced frame on disk, then show how many rows it holds.
order_products.to_pickle("./order_products_smol.pkl")
order_products.shape[0]

1384617

In [25]:
# Reload the cached (order_id, aisle) frame and show its row count.
# NOTE(review): the row count saved under this cell (32434489) does not match
# the count shown when the file was written (1384617); the saved output looks
# like it came from a different run - re-run this cell to confirm.
order_products = pd.read_pickle("./order_products_smol.pkl")
order_products.shape[0]

32434489

In [15]:
import itertools
def find_pairings(aisles):
    """Return every pair of entries in ``aisles``.

    Parameters
    ----------
    aisles : sequence
        The aisle labels of one order (a list, or the pandas Series that
        ``groupby(...).agg`` passes in). Must support ``len`` and iteration.

    Returns
    -------
    list of [a, b] lists, or None
        One two-element list per position pair, in the order produced by
        ``itertools.combinations``. Repeated labels are kept, so an order
        with two items from the same aisle yields a self-pair. ``None`` is
        returned when there are fewer than two entries, which lets the
        caller discard such orders with ``dropna``.
    """
    # Fewer than two items cannot form a pair. The empty case is treated the
    # same as the single-item case instead of returning an empty list.
    if len(aisles) < 2:
        return None
    return [list(pair) for pair in itertools.combinations(aisles, 2)]

In [16]:
def row_count(df_list):
    """Return the combined number of rows across ``df_list``.

    Each element only needs a ``shape`` attribute whose first entry is its
    row count. An empty iterable gives 0.
    """
    return sum(frame.shape[0] for frame in df_list)

In [17]:
import itertools
# Build, for every order, the list of aisle pairs bought together.
# Orders for which find_pairings returned None are removed by dropna.
product_pairings = (
    order_products
    .groupby('order_id')
    .agg(find_pairings)
    .dropna()
)
product_pairings.head()

Unnamed: 0_level_0,aisle
order_id,Unnamed: 1_level_1
1,"[[yogurt, other creams cheeses], [yogurt, fres..."
36,"[[fresh vegetables, fresh vegetables], [fresh ..."
38,"[[fresh vegetables, packaged vegetables fruits..."
96,"[[fresh vegetables, packaged vegetables fruits..."
98,"[[yogurt, fresh vegetables], [yogurt, fresh ve..."


In [18]:
# Number of orders left after the orders without any pair were dropped.
product_pairings.shape[0]

124364

In [19]:
# Cache the per-order pair lists on disk.
product_pairings.to_pickle("pp_smol.pkl")

In [None]:
# Reload the cached per-order pair lists. read_pickle is a module-level
# pandas function (not a DataFrame method) and its result must be assigned.
product_pairings = pd.read_pickle("pp_smol.pkl")

In [20]:
# Distinct aisle names; these become the nodes of the graph built below.
all_aisles = order_products['aisle'].unique()

In [22]:
import networkx as nx

# Multigraph with one node per aisle and one parallel edge per pair
# occurrence, so repeated pairings show up as repeated edges.
g = nx.MultiGraph()
g.add_nodes_from(all_aisles)

# Feed every pair of every order to a single add_edges_from call.
g.add_edges_from(
    pair
    for order_pairs in product_pairings['aisle']
    for pair in order_pairs
)

In [23]:
# Collapse the multigraph into a simple graph whose edge weight is the sum
# of the weights of the parallel edges (an edge without a weight counts 1.0).
M = nx.Graph()
for src, dst, attrs in g.edges(data=True):
    edge_weight = attrs.get('weight', 1.0)
    if not M.has_edge(src, dst):
        # First time this pair is seen: create the edge.
        M.add_edge(src, dst, weight=edge_weight)
        continue
    M[src][dst]['weight'] += edge_weight

In [27]:
# Inspect the aggregated edges together with their weights.
# NOTE(review): this displays every edge; a slice would keep the output short.
M.edges(data=True)

EdgeDataView([('yogurt', 'other creams cheeses', {'weight': 8576.0}), ('yogurt', 'fresh vegetables', {'weight': 87211.0}), ('yogurt', 'canned meat seafood', {'weight': 1663.0}), ('yogurt', 'fresh fruits', {'weight': 101020.0}), ('yogurt', 'packaged cheese', {'weight': 29156.0}), ('yogurt', 'specialty cheeses', {'weight': 2272.0}), ('yogurt', 'water seltzer sparkling water', {'weight': 18640.0}), ('yogurt', 'packaged vegetables fruits', {'weight': 50196.0}), ('yogurt', 'lunch meat', {'weight': 11884.0}), ('yogurt', 'oils vinegars', {'weight': 5039.0}), ('yogurt', 'fresh herbs', {'weight': 8181.0}), ('yogurt', 'refrigerated', {'weight': 15031.0}), ('yogurt', 'refrigerated pudding desserts', {'weight': 1246.0}), ('yogurt', 'canned jarred vegetables', {'weight': 7174.0}), ('yogurt', 'soy lactosefree', {'weight': 15993.0}), ('yogurt', 'crackers', {'weight': 13512.0}), ('yogurt', 'hot dogs bacon sausage', {'weight': 7327.0}), ('yogurt', 'frozen appetizers sides', {'weight': 5510.0}), ('yogur

In [44]:
# Order the edges by weight, largest first, then display the sorted list.
weight_list = sorted(M.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)
weight_list

[('fresh vegetables', 'fresh fruits', {'weight': 308105.0}),
 ('fresh vegetables', 'fresh vegetables', {'weight': 224647.0}),
 ('fresh vegetables', 'packaged vegetables fruits', {'weight': 169766.0}),
 ('fresh fruits', 'packaged vegetables fruits', {'weight': 158522.0}),
 ('fresh fruits', 'fresh fruits', {'weight': 147079.0}),
 ('yogurt', 'fresh fruits', {'weight': 101020.0}),
 ('yogurt', 'fresh vegetables', {'weight': 87211.0}),
 ('fresh vegetables', 'packaged cheese', {'weight': 73818.0}),
 ('fresh fruits', 'packaged cheese', {'weight': 69250.0}),
 ('fresh vegetables', 'fresh herbs', {'weight': 53544.0}),
 ('fresh fruits', 'milk', {'weight': 53092.0}),
 ('yogurt', 'packaged vegetables fruits', {'weight': 50196.0}),
 ('fresh vegetables', 'milk', {'weight': 46253.0}),
 ('fresh fruits', 'water seltzer sparkling water', {'weight': 45582.0}),
 ('fresh fruits', 'chips pretzels', {'weight': 45322.0}),
 ('fresh vegetables', 'soy lactosefree', {'weight': 45295.0}),
 ('fresh fruits', 'soy lact

In [45]:
# Drop self-pairs (both ends in the same aisle) and keep the sorted order.
popular_pairings = [edge for edge in weight_list if edge[0] != edge[1]]

In [46]:
# Display the filtered pairings (still sorted by descending weight).
popular_pairings

[('fresh vegetables', 'fresh fruits', {'weight': 308105.0}),
 ('fresh vegetables', 'packaged vegetables fruits', {'weight': 169766.0}),
 ('fresh fruits', 'packaged vegetables fruits', {'weight': 158522.0}),
 ('yogurt', 'fresh fruits', {'weight': 101020.0}),
 ('yogurt', 'fresh vegetables', {'weight': 87211.0}),
 ('fresh vegetables', 'packaged cheese', {'weight': 73818.0}),
 ('fresh fruits', 'packaged cheese', {'weight': 69250.0}),
 ('fresh vegetables', 'fresh herbs', {'weight': 53544.0}),
 ('fresh fruits', 'milk', {'weight': 53092.0}),
 ('yogurt', 'packaged vegetables fruits', {'weight': 50196.0}),
 ('fresh vegetables', 'milk', {'weight': 46253.0}),
 ('fresh fruits', 'water seltzer sparkling water', {'weight': 45582.0}),
 ('fresh fruits', 'chips pretzels', {'weight': 45322.0}),
 ('fresh vegetables', 'soy lactosefree', {'weight': 45295.0}),
 ('fresh fruits', 'soy lactosefree', {'weight': 44034.0}),
 ('fresh vegetables', 'frozen produce', {'weight': 43945.0}),
 ('fresh fruits', 'frozen pr

In [49]:
import dill as pickle
# Persist the sorted pairings. The context manager makes sure the file is
# flushed and closed even if dumping raises.
with open('pop_pair_smol.pkl', 'wb') as pickle_file:
    pickle.dump(popular_pairings, pickle_file)

In [3]:
import dill as pickle
# Reload the pairings saved earlier. The context manager closes the file
# handle once loading is done.
# NOTE: unpickling can execute arbitrary code - only load files that this
# notebook wrote itself.
with open('pop_pair_smol.pkl', 'rb') as pickle_file:
    popular_pairings = pickle.load(pickle_file)

In [7]:
# The 50 first entries of the reloaded list.
popular_pairings[:50]

[('fresh vegetables', 'fresh fruits', {'weight': 308105.0}),
 ('fresh vegetables', 'packaged vegetables fruits', {'weight': 169766.0}),
 ('fresh fruits', 'packaged vegetables fruits', {'weight': 158522.0}),
 ('yogurt', 'fresh fruits', {'weight': 101020.0}),
 ('yogurt', 'fresh vegetables', {'weight': 87211.0}),
 ('fresh vegetables', 'packaged cheese', {'weight': 73818.0}),
 ('fresh fruits', 'packaged cheese', {'weight': 69250.0}),
 ('fresh vegetables', 'fresh herbs', {'weight': 53544.0}),
 ('fresh fruits', 'milk', {'weight': 53092.0}),
 ('yogurt', 'packaged vegetables fruits', {'weight': 50196.0}),
 ('fresh vegetables', 'milk', {'weight': 46253.0}),
 ('fresh fruits', 'water seltzer sparkling water', {'weight': 45582.0}),
 ('fresh fruits', 'chips pretzels', {'weight': 45322.0}),
 ('fresh vegetables', 'soy lactosefree', {'weight': 45295.0}),
 ('fresh fruits', 'soy lactosefree', {'weight': 44034.0}),
 ('fresh vegetables', 'frozen produce', {'weight': 43945.0}),
 ('fresh fruits', 'frozen pr

In [6]:
# The 50 last entries of the reloaded list.
popular_pairings[-50:]

[('bulk grains rice dried goods', 'baby accessories', {'weight': 1.0}),
 ('bulk grains rice dried goods', 'shave needs', {'weight': 1.0}),
 ('bulk grains rice dried goods', 'baby bath body care', {'weight': 1.0}),
 ('bulk grains rice dried goods', 'kitchen supplies', {'weight': 1.0}),
 ('bulk grains rice dried goods', 'body lotions soap', {'weight': 1.0}),
 ('bulk grains rice dried goods', 'red wines', {'weight': 1.0}),
 ('tofu meat alternatives', 'specialty wines champagnes', {'weight': 1.0}),
 ('shave needs', 'seafood counter', {'weight': 1.0}),
 ('shave needs', 'bulk dried fruits vegetables', {'weight': 1.0}),
 ('packaged seafood', 'muscles joints pain relief', {'weight': 1.0}),
 ('packaged seafood', 'beauty', {'weight': 1.0}),
 ('indian foods', 'specialty wines champagnes', {'weight': 1.0}),
 ('indian foods', 'facial care', {'weight': 1.0}),
 ('indian foods', 'spirits', {'weight': 1.0}),
 ('indian foods', 'red wines', {'weight': 1.0}),
 ('mint gum', 'kosher foods', {'weight': 1.0})

In [None]:
# Marker message printed when this cell is executed.
print("it worked out")

TODO: Instead of adding edges order by order, maybe build one big list of all pairs first, then add the edges from that list in a single call.