In [2]:
### Imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import gensim
import scipy.sparse as sparse
# Set seaborn color palette
colors = sns.color_palette("pastel")
# Import for checking th python version
import struct
%matplotlib inline
# Surprise related Imports
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
import time

In [6]:
np.ones((60000, 16000))

MemoryError: 

In [3]:
filename_user_product_counts="../data/user_product_counts"
# Order datasets
df_user_product_counts = pd.read_csv(filename_user_product_counts,sep="\t",usecols=[1,2,3])



In [22]:

class LogisticMF():

    def __init__(self, counts, num_factors, reg_param=0.6, gamma=1.0,
                 iterations=30):
        self.counts = counts
        self.num_users = counts.shape[0]
        self.num_items = counts.shape[1]
        self.num_factors = num_factors
        self.iterations = iterations
        self.reg_param = reg_param
        self.gamma = gamma

    def train_model(self):

        self.ones = np.ones((self.num_users, self.num_items))
        self.user_vectors = np.random.normal(size=(self.num_users,
                                                   self.num_factors))
        self.item_vectors = np.random.normal(size=(self.num_items,
                                                   self.num_factors))
        self.user_biases = np.random.normal(size=(self.num_users, 1))
        self.item_biases = np.random.normal(size=(self.num_items, 1))

        user_vec_deriv_sum = np.zeros((self.num_users, self.num_factors))
        item_vec_deriv_sum = np.zeros((self.num_items, self.num_factors))
        user_bias_deriv_sum = np.zeros((self.num_users, 1))
        item_bias_deriv_sum = np.zeros((self.num_items, 1))
        for i in range(self.iterations):
            t0 = time.time()
            # Fix items and solve for users
            # take step towards gradient of deriv of log likelihood
            # we take a step in positive direction because we are maximizing LL
            user_vec_deriv, user_bias_deriv = self.deriv(True)
            user_vec_deriv_sum += np.square(user_vec_deriv)
            user_bias_deriv_sum += np.square(user_bias_deriv)
            vec_step_size = self.gamma / np.sqrt(user_vec_deriv_sum)
            bias_step_size = self.gamma / np.sqrt(user_bias_deriv_sum)
            self.user_vectors += vec_step_size * user_vec_deriv
            self.user_biases += bias_step_size * user_bias_deriv

            # Fix users and solve for items
            # take step towards gradient of deriv of log likelihood
            # we take a step in positive direction because we are maximizing LL
            item_vec_deriv, item_bias_deriv = self.deriv(False)
            item_vec_deriv_sum += np.square(item_vec_deriv)
            item_bias_deriv_sum += np.square(item_bias_deriv)
            vec_step_size = self.gamma / np.sqrt(item_vec_deriv_sum)
            bias_step_size = self.gamma / np.sqrt(item_bias_deriv_sum)
            self.item_vectors += vec_step_size * item_vec_deriv
            self.item_biases += bias_step_size * item_bias_deriv
            t1 = time.time()

            print ('iteration %i finished in %f seconds' % (i + 1, t1 - t0))

    def deriv(self, user):
        if user:
            vec_deriv = np.dot(self.counts, self.item_vectors)
            bias_deriv = np.expand_dims(np.sum(self.counts, axis=1), 1)

        else:
            vec_deriv = np.dot(self.counts.T, self.user_vectors)
            bias_deriv = np.expand_dims(np.sum(self.counts, axis=0), 1)
        A = np.dot(self.user_vectors, self.item_vectors.T)
        A += self.user_biases
        A += self.item_biases.T
        A = np.exp(A)
        A /= (A + self.ones)
        A = (self.counts + self.ones) * A

        if user:
            vec_deriv -= np.dot(A, self.item_vectors)
            bias_deriv -= np.expand_dims(np.sum(A, axis=1), 1)
            # L2 regularization
            vec_deriv -= self.reg_param * self.user_vectors
        else:
            vec_deriv -= np.dot(A.T, self.user_vectors)
            bias_deriv -= np.expand_dims(np.sum(A, axis=0), 1)
            # L2 regularization
            vec_deriv -= self.reg_param * self.item_vectors
        return (vec_deriv, bias_deriv)

    def log_likelihood(self):
        loglik = 0
        A = np.dot(self.user_vectors, self.item_vectors.T)
        A += self.user_biases
        A += self.item_biases.T
        B = A * self.counts
        loglik += np.sum(B)

        A = np.exp(A)
        A += self.ones

        A = np.log(A)
        A = (self.counts + self.ones) * A
        loglik -= np.sum(A)

        # L2 regularization
        loglik -= 0.5 * self.reg_param * np.sum(np.square(self.user_vectors))
        loglik -= 0.5 * self.reg_param * np.sum(np.square(self.item_vectors))
        return loglik

    def print_vectors(self):
        user_vecs_file = open('logmf-user-vecs-%i' % self.num_factors, 'w')
        for i in range(self.num_users):
            vec = ' '.join(map(str, self.user_vectors[i]))
            line = '%i\t%s\n' % (i, vec)
            user_vecs_file.write(line)
        user_vecs_file.close()
        item_vecs_file = open('logmf-item-vecs-%i' % self.num_factors, 'w')
        for i in range(self.num_items):
            vec = ' '.join(map(str, self.item_vectors[i]))
            line = '%i\t%s\n' % (i, vec)
            item_vecs_file.write(line)
        item_vecs_file.close()

In [19]:
df_user_product_counts.head()

Unnamed: 0,user_id,product_id,quantity
0,2,36287,1
1,13,4210,1
2,19,11749,1
3,24,46061,1
4,26,4091,1


In [20]:
users = list(np.sort(df_user_product_counts.user_id.unique()))
products = list(df_user_product_counts.product_id.unique())
quantity = list(df_user_product_counts.quantity)




In [21]:
col_indices = df_user_product_counts.user_id.astype('category', categories = users).cat.codes 
row_indices = df_user_product_counts.product_id.astype('category', categories = products).cat.codes 

product_user_matrix = sparse.csr_matrix((quantity, (row_indices, col_indices)), shape=(len(products), len(users)))
product_user_matrix

<16166x62844 sparse matrix of type '<class 'numpy.int64'>'
	with 98981 stored elements in Compressed Sparse Row format>

In [28]:
np.ones((60000,16000)

SyntaxError: unexpected EOF while parsing (<ipython-input-28-a7013129e2e4>, line 1)

In [5]:
def load_matrix(filename, num_users, num_items):
    t0 = time.time()
    counts = np.zeros((num_users, num_items))
    total = 0.0
    num_zeros = num_users * num_items
    for i, line in enumerate(open(filename, 'r')):
        user, item, count = line.strip().split('\t')
        user = int(user)
        item = int(item)
        count = float(count)
        counts[user][item] = count
        total += count
        num_zeros -= 1
    alpha = num_zeros / total
    print ('alpha %.2f' % alpha)
    counts *= alpha
    t1 = time.time()
    print ('Finished loading matrix in %f seconds' % (t1 - t0))
    return counts

In [6]:

filename="../data/user_product_counts"
count_matrix=load_matrix(filename, 62844 ,16166)
# return student


MemoryError: 

In [2]:
# Order datasets
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_orders = pd.read_csv("../data/orders.csv")

In [3]:
# Consider only "prior" orders and remove all columns except order_id and user_id from df_orders
prior_order_user = df_orders.loc[df_orders.eval_set == "prior"]
prior_order_user = prior_order_user[["order_id", "user_id"]]

In [4]:
prior_order_user.head()

Unnamed: 0,order_id,user_id
0,2539329,1
1,2398795,1
2,473747,1
3,2254736,1
4,431534,1


In [5]:
# Remove all columns except order_id and user_id from df_orders
prior_order_product = df_order_products_prior[["order_id", "product_id"]]

In [6]:
prior_order_product.head()


Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


In [7]:
df_merged_product_user = pd.merge(prior_order_user, prior_order_product, on="order_id")[["user_id", "product_id"]]

In [1]:
df_merged_product_user_sample=df_merged_product_user.sample(100000)

NameError: name 'df_merged_product_user' is not defined

In [9]:
df_merged_product_user_sample.head()

Unnamed: 0,user_id,product_id
5782440,36797,8277
31741008,201728,20588
30185485,191814,36695
8119842,51489,14819
4891618,31149,35547


In [10]:
df_user_product_counts = df_merged_product_user_sample.groupby(["user_id", "product_id"]).size().reset_index().rename(columns={0:"quantity"})

In [11]:
df_user_product_counts.head()

Unnamed: 0,user_id,product_id,quantity
0,2,36287,1
1,13,4210,1
2,19,11749,1
3,24,46061,1
4,26,4091,1


In [14]:
df_user_product_counts.to_csv("../data/user_product_counts" , sep='\t')

In [23]:
user_len=len(df_user_product_counts.user_id.unique())
product_len=len(df_user_product_counts.product_id.unique())
print(user_len,product_len)



62844 16166


In [None]:
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")



In [7]:


# Load the movielens-100k dataset (download it if needed),
# and split it into 3 folds for cross-validation.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# We'll use the famous SVD algorithm.
algo = SVD()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

print_perf(perf)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /home/rahul/.surprise_data/ml-100k
Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9390
MAE:  0.7418
------------
Fold 2
RMSE: 0.9523
MAE:  0.7515
------------
Fold 3
RMSE: 0.9463
MAE:  0.7454
------------
------------
Mean RMSE: 0.9459
Mean MAE : 0.7462
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
MAE     0.7418  0.7515  0.7454  0.7462  
RMSE    0.9390  0.9523  0.9463  0.9459  


In [4]:
# For printing the Bit version for Python

print(struct.calcsize("P") * 8)

64


In [24]:
# Loading dataframes
df_departments = pd.read_csv("../data/departments.csv")
df_aisles = pd.read_csv("../data/aisles.csv")
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_order_products_train = pd.read_csv("../data/order_products__train.csv")
df_order_products_test = pd.read_csv('../data/sample_submission.csv')
df_orders = pd.read_csv("../data/orders.csv")





In [4]:
df_order_products_train["product_id"] = df_order_products_train["product_id"].astype(str)
df_order_products_prior["product_id"] = df_order_products_prior["product_id"].astype(str)

In [22]:
df_order_products_train = df_order_products_train.groupby("order_id").apply(lambda order: order['product_id'].tolist())
df_order_products_prior = df_order_products_prior.groupby("order_id").apply(lambda order: order['product_id'].tolist())



KeyError: 'product_id'

In [6]:
df_order_products_train.head()


Unnamed: 0,order_id,0
0,1,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
1,36,"[39612, 19660, 49235, 43086, 46620, 34497, 486..."
2,38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
3,96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"
4,98,"[8859, 19731, 43654, 13176, 4357, 37664, 34065..."


In [7]:
df_order_products_prior.head()

Unnamed: 0,order_id,0
0,2,"[33120, 28985, 9327, 45918, 30035, 17794, 4014..."
1,3,"[33754, 24838, 17704, 21903, 17668, 46667, 174..."
2,4,"[46842, 26434, 39758, 27761, 10054, 21351, 225..."
3,5,"[13176, 15005, 47329, 27966, 23909, 48370, 132..."
4,6,"[40462, 15873, 41897]"


In [19]:

sentences = df_order_products_prior.append(df_order_products_train)
longest = np.max(sentences.apply(len))
print(longest)

3346083


In [20]:
sentences = sentences.values

In [21]:

model = gensim.models.Word2Vec(sentences, size=100, window=longest, min_count=2, workers=4)

TypeError: unhashable type: 'list'

In [3]:
df_order_products_test.head()

Unnamed: 0,order_id,products
0,17,39276 29259
1,34,39276 29259
2,137,39276 29259
3,182,39276 29259
4,257,39276 29259


In [4]:
df_order_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [5]:
df_order_products_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
