### Implementation of Online Latent Dirichlet Allocation for the Instacart dataset
In this notebook, we implement online LDA for Instacart data with users being documents and products being words.

In [None]:
from __future__ import print_function

import os
from time import time

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

The following numbers specify the properties of the LDA model that we want to implement.

In [None]:
# Tunable settings for the LDA experiment.
# NOTE(review): none of these three names is referenced by the cells visible in this
# notebook -- the LDA cell passes n_components=15 and the final cell passes 7 as the
# number of top words. Confirm which values are intended.
n_samples = 2000 # number of subsamples to take at each iteration of online LDA
n_components = 10 # number of topics that we want to have
n_top_words = 20 # number of top words displayed in each topic after training LDA

The following function displays the top words in each topic after running LDA.

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row holds one topic's
        weights over the features.
    feature_names : indexable
        ``feature_names[i]`` is the label printed for feature index ``i``.
    n_top_words : int
        How many of the largest-weight features to list per topic.

    Prints one ``Topic #k: ...`` line per topic, with labels joined by
    ``"-- "``, followed by a single blank line. Returns nothing.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        labels = [feature_names[idx] for idx in ranked]
        print("Topic #%d: " % topic_no + "-- ".join(labels))
    print()

The following cells load the data and preprocess it so that it can be passed to the LDA model:

In [None]:
PATH = "~" # directory that contains the Instacart data files

# Build the file path with os.path.join so PATH works with or without a trailing
# separator; plain string concatenation turned "~" + "transactions.csv" into
# "~transactions.csv". expanduser resolves a leading "~" to the home directory.
transactions_path = os.path.join(os.path.expanduser(PATH), "transactions.csv")
transactions_df = pd.read_csv(transactions_path)
userIDprodName_df = transactions_df[['user_id','product_name']]

# pd.factorize returns (codes, uniques): one integer code per row and the array of
# distinct product names, so prodIDindex[code] recovers the name for a code.
newProdName = pd.factorize(userIDprodName_df['product_name'])
prodID = newProdName[0]
prodIDindex = newProdName[1]

# Attach the integer product code as a third column, aligned on a fresh 0..n-1 index.
userIDprodNameprodID_df = pd.concat(
    [userIDprodName_df.reset_index(drop=True), pd.DataFrame({'product_id': prodID})],
    axis=1)
print(len(userIDprodNameprodID_df))

In [None]:
# The frame holds exactly three columns here (user_id, product_name and the factorized
# product code), so assign three names; supplying a fourth ('n') raises a
# length-mismatch ValueError.
userIDprodNameprodID_df.columns = ['user_id', 'product_name', 'product_id']

# Report counts rather than the largest id: the number of documents is the number of
# distinct users, and because product codes start at 0 the number of words is the
# largest code plus one.
user_idx = userIDprodNameprodID_df['user_id'].unique()
ndocs = len(user_idx)
nwords = int(userIDprodNameprodID_df['product_id'].max()) + 1
print("number of documents (users) and words (products) are: (%d, %d)" % (ndocs, nwords))

In [None]:
# Count the rows for every (user_id, product_id) pair; the result is a Series
# indexed by that pair.
pair_columns = ['user_id', 'product_id']
count_series = userIDprodNameprodID_df.groupby(pair_columns).size()

In [None]:
# Flatten the pair-indexed counts into a frame with columns user_id, product_id
# and 'size' (the count for that pair).
new_df = count_series.reset_index(name='size')

In [None]:
# Build the document-term matrix: entry (user_id, product_id) holds the pair's count.
# No explicit shape is passed, so scipy infers it from the indices supplied.
entry_counts = new_df['size']
entry_coords = (new_df['user_id'], new_df['product_id'])
newdf_sparsemat = csr_matrix((entry_counts, entry_coords))

In [None]:
# Display how many rows each product name appears in.
userIDprodNameprodID_df.loc[:, 'product_name'].value_counts()

Now we are ready to run LDA.

In [None]:
# Settings for the online LDA fit.
# NOTE(review): the configuration cell sets n_components = 10, yet 15 topics are
# requested here -- confirm which value is intended.
lda_settings = dict(
    n_components=15,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda = LatentDirichletAllocation(**lda_settings)

# Fit on the user-by-product count matrix and report the wall-clock time taken.
t0 = time()
lda.fit(newdf_sparsemat)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)


We can look at the top words in each topic:

In [None]:
# List the 7 highest-weighted product names of every topic; prodIDindex maps each
# product code back to its name.
# NOTE(review): the configuration cell defines n_top_words = 20, but 7 is passed
# here -- confirm which value is intended.
print("\nTopics in LDA model:")
print_top_words(model=lda, feature_names=prodIDindex, n_top_words=7)