In [1]:
import math, os
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import *
import itertools

In [2]:
# Location of the user/item interaction table under the home directory.
# NOTE(review): the extension is .tsv, but the next cell reads it with
# pd.read_csv's default comma separator — confirm the file is comma-delimited.
path = os.path.expanduser("~/blob/raw_datasets/steam/chatbot/user_history.tsv")

In [3]:
# Load the interaction table with pandas defaults (comma separator, header row).
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# a saved index read back as data — consider index_col=0 if no later cell needs it.
train_data = pd.read_csv(path)

In [4]:
# Preview the first three rows to check the parsed columns.
train_data.head(3)

Unnamed: 0,user_id,item_id
0,1,1
1,1,2
2,1,3


In [5]:
# One row per user; every remaining column is collapsed into a Python list.
history = train_data.groupby('user_id').agg(list)
# {user_id: [item_id, ...]} taken straight from the item_id column.
uidict = history['item_id'].to_dict()

In [6]:
def itemCFTrain(df):
    """Group an interaction table into ``{user_id: [item_id, ...]}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``item_id`` columns; any other columns
        are ignored.

    Returns
    -------
    dict
        Maps each user id to the list of that user's item ids, in row order.
        Lists (not sets) are used, so repeated interactions are kept. Users
        appear in order of first occurrence in ``df``.

    Notes
    -----
    The grouping is a single vectorised ``groupby`` instead of a
    ``DataFrame.iterrows`` loop: ``iterrows`` materialises a Series per row
    (very slow on large logs) and upcasts integer ids to float whenever
    ``df`` also holds a float column. Because there is no Python-level loop
    any more, no progress bar is shown. Rows whose ``user_id`` is missing
    (NaN) are skipped, which is ``groupby``'s default.
    """
    # sort=False keeps users in first-appearance order, as a row-by-row build would.
    items_per_user = df.groupby('user_id', sort=False)['item_id'].agg(list)
    return items_per_user.to_dict()

In [9]:
def ItemMatrix_fn(user_item_dict):
    """Count item occurrences and item-item co-occurrences with nested dicts.

    Parameters
    ----------
    user_item_dict : dict
        Maps user id -> iterable of item ids.

    Returns
    -------
    (itemMatrix, N)
        ``itemMatrix[i][j]`` is the number of (i, j) position pairs that share
        a user list; the diagonal ``i == j`` is counted as well.
        ``N[i]`` is the number of times item ``i`` appears across all lists.
        Both outer containers are ``defaultdict(int)`` instances.
    """
    occurrences = defaultdict(int)
    cooccurrence = defaultdict(int)
    for _user, basket in tqdm(user_item_dict.items()):
        for left in basket:
            # Fetch (or create) the row of counts for this item.
            row = cooccurrence.setdefault(left, {})
            occurrences[left] += 1
            for right in basket:
                row[right] = row.get(right, 0) + 1

    return cooccurrence, occurrences

def ItemMatrix_fn2(n_item, user_item_dict):
    """Build a dense, symmetric item-item co-occurrence count matrix.

    For every user, each unordered pair of positions in that user's item
    list adds 1 to both ``[a, b]`` and ``[b, a]``.

    Parameters
    ----------
    n_item : int
        Side length of the matrix. Item ids are used directly as row/column
        indices, so it must be greater than the largest item id.
    user_item_dict : dict
        Maps user id -> sequence of integer item ids.

    Returns
    -------
    numpy.ndarray
        ``(n_item, n_item)`` float64 array of co-occurrence counts.

    Notes
    -----
    Users with fewer than two items contribute no pairs and are skipped
    silently.
    """
    itemMatrix = np.zeros((n_item, n_item))
    for user, items in tqdm(user_item_dict.items()):
        if len(items) <= 1:
            continue
        ids = np.asarray(items)
        # Position pairs (p, q) with p < q, in the same order that
        # itertools.combinations(items, 2) would produce them.
        first, second = np.triu_indices(len(ids), k=1)
        rows, cols = ids[first], ids[second]
        # np.add.at accumulates repeated (row, col) pairs. A plain
        # `itemMatrix[rows, cols] += 1` is buffered and counts a repeated
        # pair only once, which under-counts when a list has duplicate items.
        np.add.at(itemMatrix, (rows, cols), 1)
        np.add.at(itemMatrix, (cols, rows), 1)

    return itemMatrix

In [10]:
def ItemSimilarityMatrix_fn(ItemMatrix, N, normalize:bool=True):
    """Turn nested-dict co-occurrence counts into cosine-style similarities.

    ``sim[i][j] = ItemMatrix[i][j] / sqrt(N[i] * N[j])``. When ``normalize``
    is true, every row is then divided by its largest value so that each
    item's best match scores 1.0.

    Parameters
    ----------
    ItemMatrix : dict
        ``{item_i: {item_j: count}}``, e.g. the first value returned by
        ``ItemMatrix_fn``.
    N : mapping
        ``{item: count}`` used in the denominator, e.g. the second value
        returned by ``ItemMatrix_fn``.
    normalize : bool, default True
        Apply the per-row max normalisation.

    Returns
    -------
    defaultdict
        ``{item_i: {item_j: similarity}}``. The outer container stays a
        ``defaultdict(int)`` so existing callers see the same type.

    Raises
    ------
    ZeroDivisionError
        If ``N[i] * N[j]`` is zero for a pair stored in ``ItemMatrix``.
    """
    itemSimMatrix = defaultdict(int)

    # Cosine-style similarity for every stored (i, j) pair.
    for i, related_items in tqdm(ItemMatrix.items()):
        itemSimMatrix[i] = {
            j: cij / math.sqrt(N[i] * N[j])
            for j, cij in related_items.items()
        }

    if normalize:
        # Re-assigning existing keys while iterating is safe: the dict's size
        # never changes.
        for i, relations in tqdm(itemSimMatrix.items()):
            if not relations:
                # An item with no neighbours has nothing to scale; max() on
                # an empty collection would raise ValueError.
                continue
            max_num = max(relations.values())
            if max_num == 0:
                continue
            itemSimMatrix[i] = {k: v / max_num for k, v in relations.items()}

    return itemSimMatrix


def ItemSimilarityMatrix_fn2(ItemMatrix, eps: float = 1e-10):
    """Normalise a dense co-occurrence matrix into cosine-style similarities.

    ``sim[i, j] = ItemMatrix[i, j] / (sqrt(N[i] * N[j]) + eps)`` where
    ``N`` is the column sum of ``ItemMatrix``.

    Parameters
    ----------
    ItemMatrix : numpy.ndarray
        Square array of co-occurrence counts, e.g. from ``ItemMatrix_fn2``.
    eps : float, default 1e-10
        Added to the denominator so items with a zero column sum yield 0
        instead of NaN. It is applied once; adding it a second time only
        cost an extra full-size temporary array.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``ItemMatrix``.

    Notes
    -----
    NOTE(review): ``N`` here is each item's total co-occurrence count, while
    the dict-based ``ItemSimilarityMatrix_fn`` divides by the counts its
    caller passes in (per-item occurrence counts when they come from
    ``ItemMatrix_fn``). Confirm this difference is intended.
    """
    N = ItemMatrix.sum(axis=0)
    denom = np.sqrt(np.outer(N, N))
    # In place: avoids allocating another n x n array for the shifted copy.
    denom += eps

    return ItemMatrix / denom

In [11]:
# Item ids are used directly as row/column indices of the dense matrix,
# so its side length must be the largest id plus one.
n_items = train_data['item_id'].max() + 1
# Dense (n_items x n_items) co-occurrence counts built from the per-user item lists.
itemMatrix = ItemMatrix_fn2(n_items, uidict)

100%|██████████| 281204/281204 [01:18<00:00, 3591.81it/s] 


In [12]:
# Symmetry sanity check: a symmetric matrix has identical row and column sums,
# so the largest absolute gap should be 0.0.
# abs() is essential: the signed differences add up to zero for *any* square
# matrix (both reductions equal the grand total), so a plain .sum() never fails.
np.abs(itemMatrix.sum(0) - itemMatrix.sum(1)).max()

0.0

In [13]:
# Normalise the raw co-occurrence counts into item-item similarity scores.
item_cf_m = ItemSimilarityMatrix_fn2(itemMatrix)

In [14]:
# Downcast to float16 (2 bytes per entry) to shrink the array before saving;
# this keeps only about three significant digits per similarity.
item_sim = item_cf_m.astype(np.float16)

In [15]:
# Persist the float16 similarity matrix as .npy in the same directory as the input file.
np.save(os.path.expanduser("~/blob/raw_datasets/steam/chatbot/item_sim.npy"), item_sim)