In [3]:
# default_exp algo.rs.matrix

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# 介绍
封装和rs相关的矩阵构建和分解的方法

# 共现矩阵
以item_item_matrix的构建为例

一般共现矩阵比较稀疏

这里需要定义什么是共现：
* 只要在同一个list中就算共现；
* 还是在list的一个window内才算共现；
* 还是别的（如必须在item1之前出现的item才算与其共现）。

还需要定义共现的权重：
* 共现一次 +1；
* 还是别的方式，如与共现的距离相关，离得越远权重越小。

In [26]:
#export
import scipy.sparse as sp
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [24]:
#export

def build_item_item_matrix(items_list, window=9999,penalty1=False,penalty2=False,penalty3=1, save_dir=None):
    """
    Build a sparse item-item co-occurrence matrix from a collection of item lists.

    Two different items of the same list co-occur when their positions differ
    by strictly less than ``window``. Every co-occurrence adds a weight
    (1 by default, optionally reduced by the penalties below) to the cell
    ``[id(item), id(related_item)]``.

    :items_list:
        [
            [item1, item2],
            [item1, item2, item4],
            ...
        ]
        It is iterated twice, so it must be re-iterable (e.g. a list).
    : window: int
        Only pairs whose positional distance is smaller than ``window`` count
        as co-occurring.
    : penalty1:
        Distance penalty. When truthy, the weight is divided by
        ``log2(distance + 1)``, so items further apart contribute less.
    : penalty2:
        List-length penalty. When truthy, the weight is divided by
        ``log2(len(list) + 1)``, so long lists contribute less per pair.
    : penalty3: float
        Direction penalty. The weight is multiplied by this factor when the
        related item appears after the item in the list (``i < j``).
        1 means no penalty.
    : save_dir:
        Optional directory. When given, it is created if missing and the
        id->item series, the item->id series and the matrix (CSC, ``.npz``)
        are written into it; the penalty settings are part of the file names.
    :return:
        ``(matrix, items, item2id)`` where ``matrix`` is a
        ``scipy.sparse.lil_matrix`` of shape ``(n_items, n_items)``,
        ``items`` is a Series mapping id -> item and ``item2id`` is a Series
        mapping item -> id.
    """

    unique_items = list(set(item for seq in items_list for item in seq))
    items = pd.Series(unique_items)
    item2id = pd.Series(items.index, items)
    # Plain dict for the hot loop: the same mapping as item2id, but without
    # the overhead of a pandas ``.loc`` lookup on every pair.
    row_of = {item: idx for idx, item in enumerate(unique_items)}

    n_items = items.shape[0]
    print(f'n_items: {n_items}')
    # np.float64 instead of the ``np.float`` alias, which was removed in NumPy 1.24.
    train_data_matrix = sp.lil_matrix((n_items, n_items), dtype=np.float64)
    for items_ in tqdm(items_list):
        # The length penalty is constant for the whole list, compute it once.
        length_log = np.log2(len(items_) + 1) if penalty2 else None
        for i, item in enumerate(items_):
            row = row_of[item]
            for j, related_item in enumerate(items_):
                distance = abs(i - j)
                # Skip the item itself (and repeated occurrences of it) and
                # anything outside the window.
                if (item != related_item) and (distance < window):
                    vt = 1
                    if penalty1:
                        vt /= np.log2(distance + 1)
                    if penalty2:
                        vt /= length_log
                    if i < j:
                        vt *= penalty3
                    train_data_matrix[row, row_of[related_item]] += vt
    if save_dir:
        if not os.path.exists(save_dir):
            print(f'create matrix dir{save_dir}')
            # makedirs also creates missing parent directories and tolerates
            # the directory appearing between the check and the call.
            os.makedirs(save_dir, exist_ok=True)
        suffix = f'{penalty1}_{penalty2}_{penalty3}'
        items.to_pickle(os.path.join(save_dir, f'id2item_series_{suffix}.pkl'))
        item2id.to_pickle(os.path.join(save_dir, f'item2id_series_{suffix}.pkl'))
        sp.save_npz(os.path.join(save_dir, f'item_item_matrix_{suffix}.npz'), train_data_matrix.tocsc())
        print(f'save matrix to {save_dir}, finished')
    return train_data_matrix, items, item2id

# nb_export

In [29]:
# Run nbdev's notebook-to-module export; import only the name that is used
# instead of pulling the whole nbdev.export namespace into the notebook.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 00_template.ipynb.
Converted algo_dl_keras.ipynb.
Converted algo_ml_shallow_tree_catboost.ipynb.
Converted algo_rs_associated_rules.ipynb.
Converted algo_rs_matrix.ipynb.
Converted algo_seq_embeding.ipynb.
Converted algo_seq_tfidf.ipynb.
Converted engineering_nbdev.ipynb.
Converted engineering_panel.ipynb.
Converted index.ipynb.


In [7]:
# Build the project documentation with the nbdev command-line tool.
!nbdev_build_docs

No notebooks were modified
converting /Users/luoyonggui/PycharmProjects/nbdevlib/index.ipynb to README.md
