In [84]:
import pandas as pd
import math
import networkx
import numpy as np
from scipy import sparse
from scipy import stats

In [85]:
items_transactions = pd.read_csv("items_transactions.csv")
items_descriptions = pd.read_csv("items_descriptions.csv")

In [86]:
# relevant columns: dept_num, item_price, qty_is_weight
items_transactions.head()

Unnamed: 0,global_transaction_id,item_id,dept_num,qty_sold,item_price,qty_is_weight,ticket_num,date,time_scanned
0,0,4889,6,3,99,0,2527,2020-07-01,07:03:30
1,0,3125,6,460,429,1,2527,2020-07-01,07:03:34
2,1,5,20,4,100,0,2528,2020-07-01,07:04:22
3,1,2,20,6,100,0,2528,2020-07-01,07:04:25
4,2,7013201045,1,1,299,0,2529,2020-07-01,07:05:06


In [87]:
# need to filter remove some items
# filter out non food items
items_transactions = items_transactions[(items_transactions.item_id != 9492206955)] #plastic bag
items_transactions = items_transactions[(items_transactions.item_id != 5555)] #BAG EB&W CUSTOMER
items_transactions = items_transactions[(items_transactions.item_id != 9746490305)] #JUBILEE BIG ROLL PAPER TOWELS
items_transactions = items_transactions[(items_transactions.item_id != 74870310181)] #store_brand paper napkins

# remove items containing "CRV" in their name
items_descriptions["CRV?"] = items_descriptions["description"].apply(lambda x: "Yes" if "CRV" in x else "No")
CRVs = items_descriptions[items_descriptions["CRV?"]=="Yes"]
CRVs = CRVs["item_id"].tolist()

for CRV in CRVs:
    items_transactions = items_transactions[(items_transactions.item_id != CRV)]

In [88]:
# relevant columns: description, category, item_type
items_descriptions.head()

Unnamed: 0,item_id,description,ecomm_description,category,item_type,upc,CRV?
0,1,PAN DULCE SENCILLO,"Mexican Sweet Bread/Pan Dulce Mexicano, 1 Count",20101020,0,10,No
1,2,BOLILLO FRENCH ROLLS,"Bolillo, French Rolls, 1 Count",20101210,0,20,No
2,3,BOLILLO QUESO/CHILE JALAP,"Jalapeño and Cheese Bolillo, 1 Count",20101210,0,30,No
3,4,EMPANADA,,20101020,0,40,No
4,5,MIni Bolillo,"BOLILLO SMALL, 2 OZ",20101210,0,50,No


In [89]:
pd.to_numeric(items_descriptions["item_id"])

0                    1
1                    2
2                    3
3                    4
4                    5
             ...      
48546    5071441509514
48547    7502228060147
48548    7503006910074
48549    8853682056661
48550    9076770700138
Name: item_id, Length: 48551, dtype: int64

In [90]:
pd.to_numeric(items_transactions["item_id"])

0                 4889
1                 3125
2                    5
3                    2
4           7013201045
              ...     
1293514    74870307001
1293515              1
1293516           4666
1293517    20411800000
1293518           4046
Name: item_id, Length: 1110716, dtype: int64

In [91]:
df = pd.merge(items_transactions, items_descriptions, on="item_id", how="inner")

In [92]:
df.head()

Unnamed: 0,global_transaction_id,item_id,dept_num,qty_sold,item_price,qty_is_weight,ticket_num,date,time_scanned,description,ecomm_description,category,item_type,upc,CRV?
0,0,4889,6,3,99,0,2527,2020-07-01,07:03:30,HERBS CILANTRO,"Cilantro, 1 bunch",6001075,0,48890,No
1,20,4889,6,3,99,0,2547,2020-07-01,07:21:42,HERBS CILANTRO,"Cilantro, 1 bunch",6001075,0,48890,No
2,25,4889,6,16,99,0,2553,2020-07-01,07:25:53,HERBS CILANTRO,"Cilantro, 1 bunch",6001075,0,48890,No
3,39,4889,6,1,99,0,2567,2020-07-01,07:53:46,HERBS CILANTRO,"Cilantro, 1 bunch",6001075,0,48890,No
4,46,4889,6,1,99,0,2575,2020-07-01,09:22:57,HERBS CILANTRO,"Cilantro, 1 bunch",6001075,0,48890,No


In [93]:
df.shape

(1104419, 15)

In [94]:
columns = ["dept_num", "item_price", "qty_is_weight", "category", "item_type", "description", "item_id"]
df2 = df[columns]

In [95]:
df2.head()

Unnamed: 0,dept_num,item_price,qty_is_weight,category,item_type,description,item_id
0,6,99,0,6001075,0,HERBS CILANTRO,4889
1,6,99,0,6001075,0,HERBS CILANTRO,4889
2,6,99,0,6001075,0,HERBS CILANTRO,4889
3,6,99,0,6001075,0,HERBS CILANTRO,4889
4,6,99,0,6001075,0,HERBS CILANTRO,4889


In [96]:
df2.shape

(1104419, 7)

In [97]:
from sklearn.cluster import KMeans

In [98]:
# k means clustering based on these columns
columns_2 = ["dept_num", "item_price", "qty_is_weight", "category", "item_type"]

In [99]:
X = df2[columns_2]

In [100]:
# still playing around with n_clusters
model = KMeans(n_clusters=150, random_state=1)

In [101]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [102]:
X_scaled = pd.DataFrame(X_scaled)

In [103]:
X_scaled.head()

Unnamed: 0,0,1,2,3,4
0,-0.327192,-0.660509,-0.683297,-0.344566,-0.010918
1,-0.327192,-0.660509,-0.683297,-0.344566,-0.010918
2,-0.327192,-0.660509,-0.683297,-0.344566,-0.010918
3,-0.327192,-0.660509,-0.683297,-0.344566,-0.010918
4,-0.327192,-0.660509,-0.683297,-0.344566,-0.010918


In [104]:
cluster = model.fit_predict(X_scaled)

In [105]:
cluster_assignments = pd.DataFrame(cluster)

In [107]:
cluster_assignments.head()

Unnamed: 0,0
0,21
1,21
2,21
3,21
4,21


In [108]:
df3 = pd.concat([df2["description"], df2["item_id"], cluster_assignments], axis=1)

In [109]:
df3.shape

(1104419, 3)

In [110]:
df4 = df3.drop_duplicates()

In [111]:
df4.head()

Unnamed: 0,description,item_id,0
0,HERBS CILANTRO,4889,21
16152,*CHILE HABANERO,3125,16
16158,*CHILE HABANERO,3125,108
16990,MIni Bolillo,5,5
20238,BOLILLO FRENCH ROLLS,2,5


In [114]:
# rename column 0
df4.rename(columns={0:"cluster"},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [115]:
df4.head()

Unnamed: 0,description,item_id,cluster
0,HERBS CILANTRO,4889,21
16152,*CHILE HABANERO,3125,16
16158,*CHILE HABANERO,3125,108
16990,MIni Bolillo,5,5
20238,BOLILLO FRENCH ROLLS,2,5


In [123]:
# need to drop more duplicates
df4[df4["item_id"] == 3125]

Unnamed: 0,description,item_id,cluster
16152,*CHILE HABANERO,3125,16
16158,*CHILE HABANERO,3125,108


In [125]:
c_maxes = df4.groupby(['description', 'item_id']).cluster.transform(max)
df4 = df4.loc[df4.cluster == c_maxes]

In [126]:
df4[df4["item_id"] == 3125]

Unnamed: 0,description,item_id,cluster
16158,*CHILE HABANERO,3125,108


In [127]:
df4.to_csv("kmeans_results.csv")

In [133]:
# for a given item_id, need to be able to get its cluster
df4[df4["item_id"] == 3125]["cluster"].tolist()[0]

108

In [32]:
# take a look at cluster 4
df4_cluster4 = df4[df4[0] == 4]
df4_cluster4

In [34]:
# take a look at cluster 10
df4_cluster10 = df4[df4[0] == 10]
df4_cluster10

In [36]:
#take a look at cluster 3
df4_cluster3 = df4[df4[0] == 3]
df4_cluster3

In [None]:
# loss function

In [71]:
# transition matrix/weights?
T = sparse.load_npz("sparse_adj_matrix.npz")
T.shape[0]

10047

In [72]:
# this is the weight of the edge from node 0 to node 1?
T[0,1]

0.012857446551239007

In [150]:
# inputs: 
# 1. item_id corresponding to node u (scalar) 
# 2. vector representations of all nodes Z (matrix formatted as a 2D numpy array)
# 3. k-means output (df4 above) (pandas dataframe)
def loss_function(item_id, Z, df4):
    
    # step 1: match item_id to row index (u)
    u = # not sure how to do this
    
    # step 2: set alpha 
    alpha = 0.5

    # step 3: go on random walk
    # u is the starting node
    p = np.zeros(T.shape[0])
    p[u] = 1
    p = p.reshape(-1,1)
    # define the random walk length, say 10
    walkLength = 10
    weights = []
    visited = []
    starting_node = u
    for k in range(walkLength):
        # evaluate the next state vector
        p = np.dot(T,p)
        # choose the node with higher probability as the visited node
        visited.append(np.argmax(p))
        weights.append(T[starting_node, np.argmax(p)])
        starting_node = np.argmax(p)
    # v is the node where you end up at the end of the random walk
    v=visited[-1]
    # calculate geometric mean of weights along random walk
    geometric_mean = stats.gmean(weights)
    
    # step 4: define sigmoid function
    def sigmoid(x):   
        z = 1/(1 + np.exp(-x)) 
    return z

    # step 5: calculate first term
    # assuming that row u of matrix Z is the vector representation for node u
    first_term = -(geometric_mean**alpha)*np.log(sigmoid(np.dot(Z[u,:],Z[v,:])))
    
    # step 6: calculate second term
    # recommended to use Q = 2, 3, 4, or 5 since we have a large dataset
    Q = 3 
    # get one positive sample and Q negative samples
    # positive sample is from u_cluster
    # negative samples are not from u_cluster
    
    # which cluster corresponding to node u?
    u_cluster = df4[df4["item_id"] == item_id]["cluster"].tolist()[0]
    
    positive_sampling_df = df4[df4["cluster"] == u_cluster]
    row = positive_sampling_df.sample()
    positive_item_id = row["item_id"].tolist()[0]
    
    negative_sampling_df = df4[df4["cluster"] != u_cluster]
    rows = negative_sampling_df.sample(n=3, replace=False)
    negative_item_ids = rows["item_id"].tolist()
    
    # need to get corresponding rows of Z to fill in below by matching item_ids to row indices
    positive_sample = np.log(sigmoid(np.dot(-Z[u,:],Z[,:])))
    negative_sample_1 = np.log(sigmoid(np.dot(-Z[u,:],Z[,:])))
    negative_sample_2 = np.log(sigmoid(np.dot(-Z[u,:],Z[,:])))
    negative_sample_3 = np.log(sigmoid(np.dot(-Z[u,:],Z[,:])))
    
    # then calculate expectation (this is just arithmetic mean in this case?)
    expectation = (positive_sample+negative_sample_1+negative_sample_2+negative_sample_3)/4
    second_term = Q*expectation
    
    # step 7: subtract second term from first term to get loss
    loss = first_term - second_term
    
    return loss


SyntaxError: invalid syntax (<ipython-input-150-a6bed1c1289a>, line 8)