### Import Libraries and Datasets

In [21]:
import pandas as pd
#pip install pandas-profiling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv

from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

# Reference tables: the orders table plus the product, department and aisle
# lookups. Files are read in the order listed and unpacked positionally.
orders, products, departments, aisles = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/orders.csv',
        'data/products.csv',
        'data/departments.csv',
        'data/aisles.csv',
    )
)

# Order line-item tables.
# NOTE(review): the "prior2" path has a single underscore after
# "order_products" while its three siblings use a double underscore --
# confirm this matches the file name on disk.
(
    order_products_train,
    order_products_prior,
    order_products_prior2,
    order_products_train2,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/order_products__train.csv",
        "data/order_products__prior.csv",
        "data/order_products_prior2.csv",
        "data/order_products__train2.csv",
    )
)


### Combining and Transforming the Data

In [23]:
###Collaborative Filtering###
# Attach user, product, aisle and department information to every prior
# order line. Left joins keep all order lines; a line that fails to match
# one of the lookups gets NaN in the joined columns.
compiled = pd.merge(order_products_prior, orders[['order_id', 'user_id']], on='order_id', how='left')
compiled = pd.merge(compiled, products[['product_id', 'product_name', 'aisle_id','department_id']], on='product_id', how='left')
compiled = pd.merge(compiled, aisles[['aisle_id', 'aisle']], on='aisle_id', how='left' )
compiled = pd.merge(compiled, departments[['department_id', 'department']], on='department_id', how='left' )

# Drop every row that has a missing value in any column (reassignment
# instead of inplace=True so the step reads as a plain transformation).
compiled = compiled.dropna()

# Narrow views of the compiled data. Columns are removed by name with
# `columns=`: the previous form `drop(compiled[[...]], 1)` passed `axis`
# positionally, which pandas >= 2.0 rejects, and built a throwaway
# sub-frame only to supply its column labels.
# The "keeps" notes below assume order_products_prior holds only the four
# columns dropped here (order_id, product_id, add_to_cart_order, reordered)
# -- TODO confirm against the CSV.

# Keeps user_id and product_name.
UserProductName = compiled.drop(columns=['add_to_cart_order', 'product_id', 'order_id', 'reordered', 'aisle_id', 'department_id', 'aisle', 'department'])
# Keeps user_id, product_name, aisle and department.
UserProdAisleDept = compiled.drop(columns=['add_to_cart_order', 'reordered', 'product_id', 'department_id', 'order_id', 'aisle_id'])
# Keeps user_id and aisle.
UserAisle = compiled.drop(columns=['add_to_cart_order', 'product_name', 'department', 'reordered', 'product_id', 'department_id', 'order_id', 'aisle_id'])
# Keeps product_id and aisle (product_id is intentionally not in this drop list).
ProductAisle = compiled.drop(columns=['add_to_cart_order', 'reordered', 'user_id', 'department', 'product_name', 'department_id', 'order_id', 'aisle_id'])

#UserId | Product_Name | Department
UserProductDept = compiled.drop(columns=['add_to_cart_order', 'product_id', 'order_id', 'reordered', 'aisle_id', 'department_id', 'aisle'])
# Only the rows whose department is exactly 'bakery'.
UserFiltBakery = UserProductDept[UserProductDept['department'] == 'bakery']


### Building Matrices

In [None]:
### Build user-by-item indicator matrices ###
# One row per user_id, one column per aisle / bakery product name; a cell is
# 1 if the user has at least one matching row and 0 otherwise.
#
# get_dummies yields one indicator row per purchase line; grouping on the
# user_id index and taking the max collapses those lines into a single row
# per user. `.groupby(level=0).max()` replaces `.max(level=0)`, whose `level`
# argument was removed in pandas 2.0.
# dtype='uint8' pins the 0/1 integer encoding: newer pandas defaults to bool,
# older versions already produced uint8.
UserAisle_matrix = pd.get_dummies(UserAisle.set_index('user_id')['aisle'], dtype='uint8').groupby(level=0).max().sort_index()
UserBakery_matrix = pd.get_dummies(UserFiltBakery.set_index('user_id')['product_name'], dtype='uint8').groupby(level=0).max().sort_index()


-------------------------

### <span style="color:green">User must select a category or item</span> 

In [27]:
### What category are we currently looking at? (Aisles? Which department?)
# Choose the user-by-item matrix that the similarity calculation will use.
# Swap in a different matrix here to analyse another category.
data_items = UserBakery_matrix
# Bare last expression so the notebook renders the frame as this cell's output.
data_items

Unnamed: 0_level_0,100% Oatnut Bread,100% Stone Ground Whole Wheat Bread,100% Whole Grain 3 Seed Bread,100% Whole Grain Bread,100% Whole Grain Flaxseed Bread,100% Whole Wheat,100% Whole Wheat 8 Hot Dog Rolls,100% Whole Wheat Bagel,100% Whole Wheat Bagels,100% Whole Wheat Bread,...,Whole Wheat Walnut Bread,"Whole Wheat Wraps 12\""",Wild Blueberry Muffins,"Wraps, New York Deli-Style, Everything","Wraps, Turmeric",Yam Good Paleo Muffins,Yellow Corn Extra Thin Tortillas,Yellow Corn Organic Tortillas,Yellow Corn Tortillas,Yellow Soft Corn Tortillas 8 Count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
206196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
206198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
206199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Similarity Calculation

In [28]:
# As a first step we normalize the user vectors (the rows of data_items)
# to unit length.

# Euclidean norm of each row.
magnitude = np.sqrt(np.square(data_items).sum(axis=1))
# A row made up entirely of zeros has norm 0, and dividing by it would fill
# that row with NaN, which would then flow into the similarity step. Such
# rows are divided by 1 instead, leaving them as zeros; all other rows are
# scaled exactly as before.
data_items = data_items.divide(magnitude.where(magnitude > 0, 1), axis='index')
def calculate_similarity(data_items):
    """Return the pairwise cosine similarity between the columns of a frame.

    Parameters
    ----------
    data_items : pandas.DataFrame
        Numeric frame; each column is treated as one vector.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``data_items.columns``, holding
        the cosine similarity of every pair of columns.
    """
    column_labels = data_items.columns
    # cosine_similarity compares rows, so the sparse matrix is transposed to
    # put the original columns on the row axis.
    columns_as_rows = sparse.csr_matrix(data_items).transpose()
    pairwise_scores = cosine_similarity(columns_as_rows)
    return pd.DataFrame(data=pairwise_scores, index=column_labels, columns=column_labels)
# Build the similarity matrix: a square frame with one row and one column
# per column of data_items, holding their pairwise cosine similarities.
data_matrix = calculate_similarity(data_items)
# Bare last expression so the notebook renders the matrix as this cell's output.
data_matrix

Unnamed: 0,100% Oatnut Bread,100% Stone Ground Whole Wheat Bread,100% Whole Grain 3 Seed Bread,100% Whole Grain Bread,100% Whole Grain Flaxseed Bread,100% Whole Wheat,100% Whole Wheat 8 Hot Dog Rolls,100% Whole Wheat Bagel,100% Whole Wheat Bagels,100% Whole Wheat Bread,...,Whole Wheat Walnut Bread,"Whole Wheat Wraps 12\""",Wild Blueberry Muffins,"Wraps, New York Deli-Style, Everything","Wraps, Turmeric",Yam Good Paleo Muffins,Yellow Corn Extra Thin Tortillas,Yellow Corn Organic Tortillas,Yellow Corn Tortillas,Yellow Soft Corn Tortillas 8 Count
100% Oatnut Bread,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
100% Stone Ground Whole Wheat Bread,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
100% Whole Grain 3 Seed Bread,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
100% Whole Grain Bread,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
100% Whole Grain Flaxseed Bread,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.003387,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yam Good Paleo Muffins,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.0,0.000000
Yellow Corn Extra Thin Tortillas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003619,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.0,0.000000
Yellow Corn Organic Tortillas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003158,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.008219
Yellow Corn Tortillas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.0,0.000000


### <span style="color:green">User must select a category or item</span> 

In [30]:
### Which item or aisle are we currently looking at? ###
# Label of the row to look up in the similarity matrix.
selected_item = 'Wild Blueberry Muffins'
# The ten highest similarity scores in that row. The selected item itself is
# part of the row, so it appears in the result as well.
top_matches = data_matrix.loc[selected_item].nlargest(10)
print(top_matches)


Wild Blueberry Muffins                 1.000000
Chocolate Chocolate Chip Muffins       0.112319
Banana Nut Muffins                     0.095364
Horns A Plenty Pastries                0.088360
8 Inch Pumpkin Pie                     0.075942
Sliced Blueberry Bagels                0.042261
Sliced Wheat Bread                     0.023079
Honey Wheat Enriched Bread             0.013362
100% Oatnut Bread                      0.000000
100% Stone Ground Whole Wheat Bread    0.000000
Name: Wild Blueberry Muffins, dtype: float64


-------------------------