# Loading data

In [165]:
import pandas as pd
order_products__train = pd.read_csv("../instacart_2017_05_01/order_products__train.csv")
order_products__prior = pd.read_csv("../instacart_2017_05_01/order_products__prior.csv")
products = pd.read_csv("../instacart_2017_05_01/products.csv")
orders = pd.read_csv("../instacart_2017_05_01/orders.csv")

In [4]:
orders['eval_set'].unique()

array(['prior', 'train', 'test'], dtype=object)

# Train data

In [5]:
orders_train = orders[orders['eval_set'] == 'train'].reset_index()

In [6]:
orders_train.head()

Unnamed: 0,index,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,10,1187899,1,train,11,4,8,14.0
1,25,1492625,2,train,15,1,11,30.0
2,49,2196797,5,train,5,0,11,6.0
3,74,525192,7,train,21,2,11,6.0
4,78,880375,8,train,4,1,14,10.0


In [7]:
orders_train = orders_train[['user_id', 'order_id']]

In [13]:
orders_train['order_id'].unique().shape[0]

131209

In [14]:
order_products__train['order_id'].unique().shape[0]

131209

In [15]:
order_products__train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [16]:
order_products__train = order_products__train[['order_id', 'product_id', 'reordered']]

In [17]:
order_products__train = order_products__train.groupby('order_id')['product_id'].apply(list).reset_index()

In [18]:
order_products__train.head()

Unnamed: 0,order_id,product_id
0,1,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
1,36,"[39612, 19660, 49235, 43086, 46620, 34497, 486..."
2,38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
3,96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"
4,98,"[8859, 19731, 43654, 13176, 4357, 37664, 34065..."


In [19]:
user_products__train = pd.merge(order_products__train, orders_train, on='order_id')

In [21]:
user_products__train.head()

Unnamed: 0,order_id,product_id,user_id
0,1,"[49302, 11109, 10246, 49683, 43633, 13176, 472...",112108
1,36,"[39612, 19660, 49235, 43086, 46620, 34497, 486...",79431
2,38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884...",42756
3,96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]",17227
4,98,"[8859, 19731, 43654, 13176, 4357, 37664, 34065...",56463


In [22]:
user_products__train = user_products__train[['user_id', 'product_id']]

In [23]:
user_products__train.to_csv("user_products__train.csv", index_label=False)

In [24]:
user_products__train.shape

(131209, 2)

In [None]:
user_products__train = pd.read_csv("user_products__train.csv")
user_products__train.shape

# Prior data

In [28]:
orders_prior = orders[orders['eval_set'] == 'prior']
orders_prior['order_id'].unique().shape

(3214874,)

In [29]:
order_products__prior['order_id'].unique().shape[0]

3214874

In [35]:
user_products__prior = pd.merge(orders_prior, order_products__prior, on='order_id')
user_products__prior.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,prior,1,2,8,,196,1,0
1,2539329,1,prior,1,2,8,,14084,2,0
2,2539329,1,prior,1,2,8,,12427,3,0
3,2539329,1,prior,1,2,8,,26088,4,0
4,2539329,1,prior,1,2,8,,26405,5,0


In [37]:
user_products__prior = user_products__prior[['user_id', 'product_id']]
user_products__prior = user_products__prior.groupby(['user_id', 'product_id']).size().reset_index()
user_products__prior.head()

Unnamed: 0,user_id,product_id,0
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3


In [39]:
user_products__prior = user_products__prior.rename(columns={0:'QTY'})

In [40]:
user_products__prior.to_csv('user_products__prior.csv', index_label=False)

In [168]:
user_products__prior = pd.read_csv('user_products__prior.csv')

# Sparse matrix

In [173]:
columns_to_cat = ['user_id', 'product_id']
for column in columns_to_cat:
    user_products__prior[column] = user_products__prior[column].astype('category')

In [113]:
from scipy.sparse import coo_matrix, save_npz
user_product_quantity = coo_matrix((user_products__prior["QTY"],
                                (user_products__prior["product_id"].cat.codes.copy(),
                                user_products__prior["user_id"].cat.codes.copy())))

# Here I'm using cat.codes to assign a coorinate sytem to users and product_id, then mappin the values onto a matrix.
# So the matrix values for (user, product_id) in cooridinate space will be the amound of product x user y bought
# Make a converter later to convert from user/product id numbers to matrix coordinates

save_npz('sparse.npz', user_product_quantity)

In [None]:
from scipy.sparse import load_npz
user_product_quantity = load_npz('sparse.npz').tocsr().astype(np.float32)

In [114]:
user_product_quantity = user_product_quantity.tocsr().astype(np.float32)

Check later if you should demean data like [here](https://beckernick.github.io/matrix-factorization-recommender/)

# Model

In [115]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(user_product_quantity, k=50)

In [117]:
Vt = Vt.T * sigma

# Predictions

### Converters

In [205]:
user_products__prior['product_id'].cat.codes

0             195
1           10254
2           10322
3           12423
4           13028
5           13172
6           14080
7           17118
8           25129
9           26083
10          26400
11          30444
12          35945
13          38920
14          39649
15          41779
16          46139
17          49224
18             22
19             78
20           1558
21           2001
22           2572
23           3150
24           4068
25           4954
26           5209
27           5319
28           5447
29           5866
            ...  
13307923    23888
13307924    23905
13307925    24848
13307926    25539
13307927    25832
13307928    26204
13307929    26498
13307930    26629
13307931    28150
13307932    28584
13307933    31124
13307934    31471
13307935    33123
13307936    33345
13307937    36050
13307938    37647
13307939    38159
13307940    38722
13307941    39208
13307942    40302
13307943    40388
13307944    40526
13307945    40984
13307946    41205
13307947  

In [197]:
# Put in id for product and user—get out coorinates for sparse matrix
user_to_coords = {user_id:i for i, user_id in enumerate(user_products__prior['user_id'].cat.categories)}
coords_to_product = {j:product_id for j, product_id in enumerate(user_products__prior['product_id'].cat.categories)}

### Recommender

In [201]:
def recommended_items(user_id, n=20):
    user_coord = user_to_coords[user_id]
    recommended_items = Vt[user_coord].dot(U.T)
    #top_picks = sorted(recommended_items, reverse=True)[:n]
    #This is a more efficient way of doing it, don't really care about order, just want top n
    top_pick_coords = np.argpartition(recommended_items, n)[-n:]
    product_coords = recommended_items[top_pick_coords]
    products = [coords_to_product[coord] for coord in product_coords]
    
    return product_coords

In [203]:
coords_to_product

{0: 1,
 1: 2,
 2: 3,
 3: 4,
 4: 5,
 5: 6,
 6: 7,
 7: 8,
 8: 9,
 9: 10,
 10: 11,
 11: 12,
 12: 13,
 13: 14,
 14: 15,
 15: 16,
 16: 17,
 17: 18,
 18: 19,
 19: 20,
 20: 21,
 21: 22,
 22: 23,
 23: 24,
 24: 25,
 25: 26,
 26: 27,
 27: 28,
 28: 29,
 29: 30,
 30: 31,
 31: 32,
 32: 33,
 33: 34,
 34: 35,
 35: 36,
 36: 37,
 37: 38,
 38: 39,
 39: 40,
 40: 41,
 41: 42,
 42: 43,
 43: 44,
 44: 45,
 45: 46,
 46: 47,
 47: 48,
 48: 49,
 49: 50,
 50: 51,
 51: 52,
 52: 53,
 53: 54,
 54: 55,
 55: 56,
 56: 57,
 57: 58,
 58: 59,
 59: 60,
 60: 61,
 61: 62,
 62: 63,
 63: 64,
 64: 65,
 65: 66,
 66: 67,
 67: 68,
 68: 69,
 69: 70,
 70: 71,
 71: 72,
 72: 73,
 73: 74,
 74: 75,
 75: 76,
 76: 77,
 77: 78,
 78: 79,
 79: 80,
 80: 81,
 81: 82,
 82: 83,
 83: 84,
 84: 85,
 85: 86,
 86: 87,
 87: 88,
 88: 89,
 89: 90,
 90: 91,
 91: 92,
 92: 93,
 93: 94,
 94: 95,
 95: 96,
 96: 97,
 97: 98,
 98: 99,
 99: 100,
 100: 101,
 101: 102,
 102: 103,
 103: 104,
 104: 105,
 105: 106,
 106: 107,
 107: 108,
 108: 109,
 109: 110,
 110: 11

In [202]:
recommended_items(2)

KeyError: 0.0001476311

In [198]:
test_dic = {'a':1, 'b':3, 'c':4}

In [200]:
test_dic['a','b']

KeyError: ('a', 'b')

In [176]:
test_array = np.array([5,4,1,3,0])

0,1,3,4,5

In [186]:
np.argpartition(test_array,-3)

array([4, 2, 3, 1, 0])

In [187]:
np.argpartition(test_array,-3)[-3:]

array([3, 1, 0])

In [188]:
test_array[np.argpartition(test_array,-3)[-3:]]

array([3, 4, 5])

In [189]:
sorted(test_array, reverse=True)[:3]

[5, 4, 3]

In [126]:
test_list = [i for i in range(20)]
test_list[-2:]

[18, 19]

In [None]:
def predict_products(user_id, n):
    scores = Vt[user_id].dot(U.T)
    top_n = np.argpartition(scores, -n)