In [291]:
from pandas.api.types import is_numeric_dtype, is_categorical_dtype, is_categorical
import dgl
import numpy as np
import pandas as pd
import torch
import pickle

In [292]:
def _series_to_tensor(series):
    if is_categorical(series):
        return torch.LongTensor(series.cat.codes.values.astype('int64'))
    else:       # numeric
        return torch.FloatTensor(series.values)

class PandasGraphBuilder(object):
    """Creates a heterogeneous graph from multiple pandas dataframes.
    Examples
    --------
    Let's say we have the following three pandas dataframes:
    User table ``users``:
    ===========  ===========  =======
    ``user_id``  ``country``  ``age``
    ===========  ===========  =======
    XYZZY        U.S.         25
    FOO          China        24
    BAR          China        23
    ===========  ===========  =======
    Game table ``games``:
    ===========  =========  ==============  ==================
    ``game_id``  ``title``  ``is_sandbox``  ``is_multiplayer``
    ===========  =========  ==============  ==================
    1            Minecraft  True            True
    2            Tetris 99  False           True
    ===========  =========  ==============  ==================
    Play relationship table ``plays``:
    ===========  ===========  =========
    ``user_id``  ``game_id``  ``hours``
    ===========  ===========  =========
    XYZZY        1            24
    FOO          1            20
    FOO          2            16
    BAR          2            28
    ===========  ===========  =========
    One could then create a bidirectional bipartite graph as follows:
    >>> builder = PandasGraphBuilder()
    >>> builder.add_entities(users, 'user_id', 'user')
    >>> builder.add_entities(games, 'game_id', 'game')
    >>> builder.add_binary_relations(plays, 'user_id', 'game_id', 'plays')
    >>> builder.add_binary_relations(plays, 'game_id', 'user_id', 'played-by')
    >>> g = builder.build()
    >>> g.number_of_nodes('user')
    3
    >>> g.number_of_edges('plays')
    4
    """
    def __init__(self):
        self.entity_tables = {}
        self.relation_tables = {}

        self.entity_pk_to_name = {}     # mapping from primary key name to entity name
        self.entity_pk = {}             # mapping from entity name to primary key
        self.entity_key_map = {}        # mapping from entity names to primary key values
        self.num_nodes_per_type = {}
        self.edges_per_relation = {}
        self.relation_name_to_etype = {}
        self.relation_src_key = {}      # mapping from relation name to source key
        self.relation_dst_key = {}      # mapping from relation name to destination key

    def add_entities(self, entity_table, primary_key, name):
        entities = entity_table[primary_key].astype('category')
        if not (entities.value_counts() == 1).all():
            raise ValueError('Different entity with the same primary key detected.')
        # preserve the category order in the original entity table
        entities = entities.cat.reorder_categories(entity_table[primary_key].values)

        self.entity_pk_to_name[primary_key] = name
        self.entity_pk[name] = primary_key
        #import ipdb;ipdb.set_trace()
        self.num_nodes_per_type[name] = entity_table[primary_key].nunique()
        self.entity_key_map[name] = entities
        self.entity_tables[name] = entity_table

    def add_binary_relations(self, relation_table, source_key, destination_key, name):
        src = relation_table[source_key].astype('category')
        src = src.cat.set_categories(
            self.entity_key_map[self.entity_pk_to_name[source_key]].cat.categories)
        dst = relation_table[destination_key].astype('category')
        dst = dst.cat.set_categories(
            self.entity_key_map[self.entity_pk_to_name[destination_key]].cat.categories)
        if src.isnull().any():
            raise ValueError(
                'Some source entities in relation %s do not exist in entity %s.' %
                (name, source_key))
        if dst.isnull().any():
            raise ValueError(
                'Some destination entities in relation %s do not exist in entity %s.' %
                (name, destination_key))

        srctype = self.entity_pk_to_name[source_key]
        dsttype = self.entity_pk_to_name[destination_key]
        etype = (srctype, name, dsttype)
        #import ipdb;ipdb.set_trace()
        self.relation_name_to_etype[name] = etype
        self.edges_per_relation[etype] = (src.cat.codes.values.astype('int64'), dst.cat.codes.values.astype('int64'))
        self.relation_tables[name] = relation_table
        self.relation_src_key[name] = source_key
        self.relation_dst_key[name] = destination_key

    def build(self):
        # Create heterograph
        graph = dgl.heterograph(self.edges_per_relation, self.num_nodes_per_type)
        return graph
    

In [293]:
def read_rat(filePath, primary_key):
    data = []
    with open(filePath, 'r') as f:
        for line in f:
            if line:
                lines = line.split("\t")
                user = int(lines[0])
                item = int(lines[1])
                score = float(lines[2])
                data.append({primary_key[0]:int(user), primary_key[1]:int(item), 'score':int(score)})
    data = sorted(data, key=lambda x: (x['user_id']))
    data = pd.DataFrame(data)
    return data



In [340]:
book_filepath = '/Users/yangrun/MyProjects/GA-DTCDR/Data/douban_book/ratings.dat'
movie_filepath = '/Users/yangrun/MyProjects/GA-DTCDR/Data/douban_movie/ratings.dat'

In [341]:
book = read_rat(book_filepath,['user_id', 'book_id'])
movie = read_rat(movie_filepath,['user_id', 'movie_id'])

In [342]:
tmp1 = set(book['user_id'])
tmp2 = set(movie['user_id'])
user_list = list(tmp1 & tmp2)
user_id_table = pd.DataFrame({'user_id':user_list})

In [343]:
book_data = book[book.user_id.isin(user_list)]
movie_data = movie[movie.user_id.isin(user_list)]
book_id_table = pd.DataFrame({'book_id':book_data['book_id'].unique()})
movie_id_table = pd.DataFrame({'movie_id':movie_data['movie_id'].unique()})

In [344]:
def gen_test_mask(data):
    data['train_mask'] = np.ones((len(data),), dtype=np.bool)
    data['test_mask'] = np.zeros((len(data),), dtype=np.bool)
    def train_test_split(df):
        if df.shape[0] > 1:
            df.iloc[-1, -1] = True
            df.iloc[-1, -2] = False
        return df
    data = data.groupby(['user_id'] ,group_keys=False).apply(train_test_split).sort_index()
    return data
    

In [345]:
builder = PandasGraphBuilder()
builder.add_entities(book_id_table, 'book_id', 'book')
builder.add_entities(movie_id_table, 'movie_id', 'movie')
builder.add_entities(user_id_table, 'user_id', 'user')
builder.add_binary_relations(book_data, 'user_id', 'book_id', 'rate')
builder.add_binary_relations(book_data, 'book_id', 'user_id', 'rated-by')
#builder.add_binary_relations(movie_data, 'user_id', 'movie_id', 'rate')
#builder.add_binary_relations(movie_data, 'movie_id', 'user_id', 'rated-by')
builder.add_binary_relations(movie_data, 'user_id', 'movie_id', 'view')
builder.add_binary_relations(movie_data, 'movie_id', 'user_id', 'view-by')
g = builder.build()

In [346]:
def gen_entity_dict(entity_table, primary_key):
    entities = entity_table[primary_key].astype('category')
    entities = entities.cat.reorder_categories(entity_table[primary_key].values)
    entity_dict = {k:v for v, k in entities.items()}
    return entity_dict

In [347]:
book_id_dict = gen_entity_dict(book_id_table, 'book_id')
movie_id_dict = gen_entity_dict(movie_id_table, 'movie_id')
user_id_dict = gen_entity_dict(user_id_table, 'user_id')

In [348]:
book_data['book_id'] = book_data['book_id'].map(book_id_dict)
movie_data['movie_id'] = movie_data['movie_id'].map(movie_id_dict)
book_data['user_id'] = book_data['user_id'].map(user_id_dict)
movie_data['user_id'] = movie_data['user_id'].map(user_id_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_data['book_id'] = book_data['book_id'].map(book_id_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data['movie_id'] = movie_data['movie_id'].map(movie_id_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_data['user_id'] = book_data['user_id'].map(user_id_dict)
A value is try

In [349]:
book_data = gen_test_mask(book_data)
movie_data = gen_test_mask(movie_data)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data['train_mask'] = np.ones((len(data),), dtype=np.bool)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['train_mask'] = np.ones((len(data),), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data['test_mask'] = np.zeros((len(data),), dtype=np.bool)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['test_mask'] = np.zeros((len(data),), dtype

In [304]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from stanfordcorenlp import StanfordCoreNLP
import pandas as pd
import tqdm as tqdm

MOVIE_FILE = '/Users/yangrun/Downloads/douban_datasettext/douban_dataset(text information)/movies_cleaned.txt'
BOOK_REVIEW_FILE = '/Users/yangrun/Downloads/douban_datasettext/douban_dataset(text information)/bookreviews_cleaned.txt'
BOOK_RATE = '/Users/yangrun/MyProjects/GA-DTCDR/Data/douban_book/ratings.dat'

USER_FILE = '/Users/yangrun/Downloads/douban_datasettext/douban_dataset(text information)/users_cleaned.txt'
nlp = StanfordCoreNLP('/Users/yangrun/Downloads/stanford-nlp/stanford-corenlp-4.2.2',lang='zh')
nltk.download('stopwords')

def is_chinese(uchar):
    """is this a chinese word?"""
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False

def is_number(uchar):
    """is this unicode a number?"""
    if uchar >= u'\u0030' and uchar <= u'\u0039':
        return True
    else:
        return False

def is_alphabet(uchar):
    """is this unicode an English word?"""
    if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
        return True
    else:
        return False
    
def format_str(content,lag=1):
    #print(content)
    content_str = ''
    if lag==0: #English
       for i in content:
           if is_alphabet(i):
               content_str = content_str+i
    if lag==1: #Chinese
        for i in content:
            if is_chinese(i):
                content_str = content_str+i
    if lag==2: #Number
        for i in content:
            if is_number(i):
                content_str = content_str+i        
    return content_str











[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yangrun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [305]:
user = pd.read_csv(USER_FILE,sep='\t')
user['self_statement'] = user['self_statement'].fillna('缺失')
user['living_place'] = user['living_place'].fillna('缺失')
user = user.loc[user.UID.isin(user_list)]
user['UID'] = user['UID'].astype('category')
user['UID']= user['UID'].cat.reorder_categories(user_id_table['user_id'].values)

In [306]:
user['UID'] = user['UID'].astype('category')
user = user.loc[user.UID.isin(user_list)]

In [307]:
res = []
for uid , text1, text2 in zip(user['UID'], user['self_statement'], user['living_place']):
    str_cleaned = ''
    str_cleaned=format_str(text2,1)+format_str(text1,1)+str_cleaned
    words= nlp.word_tokenize(str_cleaned)
    res.append([uid,words])

    
documents = [TaggedDocument(tags=[str(i)],words=doc) for i, doc in res]
user_model = Doc2Vec(documents, vector_size=16, window=2, min_count=1,negative=30, workers=6)
#model.build_vocab(documents)

In [308]:
user_model.train(documents,total_examples=user_model.corpus_count, epochs=20)
user_model.save("Doc2vec_douban_user_16.model")



In [309]:
movie_info = pd.read_csv(MOVIE_FILE,sep='\t')
movie_info['UID'] = movie_info['UID'].astype('category')
#tmp_dict = {v:int(k) for k, v in movie_id_dict.items()}
movie_info['UID'] = movie_info['UID'].map(movie_id_dict)
movie_info = movie_info.dropna(subset=['UID'])
movie_info['UID'] = movie_info['UID'].astype('int')

In [310]:
for i in [1,2,3,4,5,7,11]:
    tag = movie_info.columns[i]
    movie_info[tag] = movie_info[tag].fillna('缺失')
    movie_info[tag] = movie_info[tag].apply(lambda x :format_str(x))
    
    res = []
    #import pdb;pdb.set_trace()
for uid , text1, text2, text3, text4, text5, text6, text7 in tqdm.tqdm(list(zip(movie_info['UID'], movie_info['name'], movie_info['director'],movie_info['summary'], movie_info['writer'], movie_info['country'], movie_info['language'], movie_info['tag']))):
    str_cleaned = ''
    str_cleaned=format_str(text1)+format_str(text2)+format_str(text3)+ format_str(text4)+format_str(text5)+format_str(text6)+format_str(text7)+str_cleaned
    words= nlp.word_tokenize(str_cleaned)
    res.append([uid,words])
documents = [TaggedDocument(tags=[str(i)],words=doc) for i, doc in res]




100%|██████████| 9555/9555 [02:52<00:00, 55.53it/s]


In [311]:
for i in (set(movie_id_dict.keys()) - set(movie_info['UID'].values)):
    documents.append(TaggedDocument(tags=[str(i)], words=['缺失值']))

In [312]:
movie_model = Doc2Vec(documents, vector_size=16, window=2, min_count=1,negative=30, workers=6)
movie_model.train(documents,total_examples=movie_model.corpus_count, epochs=20)



In [313]:
movie_model.save("Doc2vec_douban_movie_16.model")

In [314]:
movie_feature = []
key2index = movie_model.docvecs.key_to_index
for key in movie_id_dict.keys():
    movie_feature.append(movie_model.docvecs.vectors[key2index[str(key)]])

  key2index = movie_model.docvecs.key_to_index
  movie_feature.append(movie_model.docvecs.vectors[key2index[str(key)]])


In [315]:
book_rated = set()
with open(BOOK_RATE , 'r') as f:
    for line in f:
        if line:
            lines = line.split("\t")
            item = int(lines[1])
            if item in book_id_dict.keys():
                book_rated.add(book_id_dict[item])

book_document = {}
with open(BOOK_REVIEW_FILE, 'r') as f:
    lines = f.readlines()[1:]
for line in lines:
    cur = line.split("\t")
    book_id = int(cur[1].strip('""'))
    label = str(cur[3].strip('""'))
    review = str(cur[4].strip('""'))
    if not label : label = review
    str_cleaned = ''
    str_cleaned = format_str(label)+str_cleaned
    if book_id in book_rated:
        book_document[book_id] = str_cleaned
        
#6777缺失值   
book_document[6777] = '缺失值'
documents = [TaggedDocument(tags=[str(i)],words=doc) for i, doc in book_document.items()]

In [317]:
book_model = Doc2Vec(documents, vector_size=16, window=2, min_count=1,negative=30, workers=6)
book_model.train(documents,total_examples=book_model.corpus_count, epochs=20)



In [366]:
book_model.save("Doc2vec_douban_book_16.model")

In [318]:
book_feature = []
for key in book_id_dict.keys():
    #print(key)
    key2index = book_model.docvecs.key_to_index
    book_feature.append(book_model.docvecs.vectors[key2index[str(key)]])
    

  key2index = book_model.docvecs.key_to_index
  book_feature.append(book_model.docvecs.vectors[key2index[str(key)]])


In [350]:
book_test = book_data[book_data['test_mask']==True]
movie_test = movie_data[movie_data['test_mask']==True]
#book_test['book_id'] = book_test['book_id'].map(book_id_dict)
#book_test['user_id'] = book_test['user_id'].map(user_id_dict)
#movie_test['movie_id'] = movie_test['movie_id'].map(movie_id_dict)
#movie_test['user_id'] = movie_test['user_id'].map(user_id_dict)

In [351]:
tmp1_test = set(book_test['user_id'])
tmp2_test = set(movie_test['user_id'])
user_list_test = list(tmp1_test & tmp2_test)
book_test = book_test[book_test.user_id.isin(user_list_test)]
movie_test = movie_test[movie_test.user_id.isin(user_list_test)]

In [352]:
#for etype in [('user','rate','book'), ('book', 'rated-by','user')]:
#    edge_mask = torch.ones(g.number_of_edges(etype))
#    for u,v in zip(book_test['user_id'], book_test['book_id']):
#        eidx = g.edge_ids(u,v,etype = etype)
mask = {}
edge_mask = torch.ones(g.number_of_edges(('user','rate','book')))
for u,v in zip(book_test['user_id'], book_test['book_id']):
    eidx = g.edge_ids(u,v,etype = ('user','rate','book'))
    edge_mask[eidx]=0
mask[('user','rate','book')] = edge_mask.bool()
inv_edge_mask = torch.ones(g.number_of_edges(('book', 'rated-by','user')))
for u,v in zip(book_test['book_id'], book_test['user_id']):
    eidx = g.edge_ids(u,v,etype = ('book', 'rated-by','user'))        
    inv_edge_mask[eidx]=0
mask[('book', 'rated-by','user')] = inv_edge_mask.bool()


In [353]:

#edge_mask = torch.ones(g.number_of_edges(('user','rate','movie')))
#for u,v in zip(movie_test['user_id'], movie_test['movie_id']):
#    eidx = g.edge_ids(u,v,etype = ('user','rate','movie'))
#    edge_mask[eidx]=0
#mask[('user','rate','movie')] = edge_mask.bool()
#
#inv_edge_mask = torch.ones(g.number_of_edges(('movie', 'rated-by','user')))
#for u,v in zip(movie_test['movie_id'], movie_test['user_id']):
#    eidx = g.edge_ids(u,v,etype = ('movie', 'rated-by','user'))        
#    inv_edge_mask[eidx]=0
#mask[('movie', 'rated-by','user')] = inv_edge_mask.bool()




edge_mask = torch.ones(g.number_of_edges(('user','view','movie')))
for u,v in zip(movie_test['user_id'], movie_test['movie_id']):
    eidx = g.edge_ids(u,v,etype = ('user','view','movie'))
    edge_mask[eidx]=0
mask[('user','view','movie')] = edge_mask.bool()

inv_edge_mask = torch.ones(g.number_of_edges(('movie', 'view-by','user')))
for u,v in zip(movie_test['movie_id'], movie_test['user_id']):
    eidx = g.edge_ids(u,v,etype = ('movie', 'view-by','user'))        
    inv_edge_mask[eidx]=0
mask[('movie', 'view-by','user')] = inv_edge_mask.bool()

In [355]:
dgl.edge_subgraph(g, mask, preserve_nodes=True)

Graph(num_nodes={'book': 6777, 'movie': 9555, 'user': 2106},
      num_edges={('book', 'rated-by', 'user'): 94016, ('movie', 'view-by', 'user'): 967979, ('user', 'rate', 'book'): 94016, ('user', 'view', 'movie'): 967979},
      metagraph=[('book', 'user', 'rated-by'), ('user', 'book', 'rate'), ('user', 'movie', 'view'), ('movie', 'user', 'view-by')])

In [356]:
g = dgl.edge_subgraph(g, mask, preserve_nodes=True)

In [357]:
book_train = book_data[book_data['test_mask']==False]
movie_train = movie_data[movie_data['test_mask']==False]

In [358]:
book_trainDict = zip(book_train['user_id'].values, book_train['book_id'].values)
movie_trainDict = zip(movie_train['user_id'].values, movie_train['movie_id'].values)

In [359]:
book_size = book_data['book_id'].nunique()
movie_size = movie_data['movie_id'].nunique()
user = []
item_a = []
item_b = []
for u, v in zip(book_test['user_id'], book_test['book_id']):
    tmp_user = u
    tmp_item = []
    #import ipdb;ipdb.set_trace()
    item_id = v
    tmp_item.append(item_id)
    neglist = set()
    neglist.add(item_id)
    for t in range(99):
        j = np.random.randint(book_size)
        while (u, j) in book_trainDict or j in neglist:
            j = np.random.randint(book_size)
        neglist.add(j)
        tmp_item.append(j)
    item_b.append(tmp_item)
    user.append(tmp_user)
    
for u, v in zip(movie_test['user_id'], movie_test['movie_id']):
    tmp_user = u
    tmp_item = []
    #import ipdb;ipdb.set_trace()
    item_id = v
    tmp_item.append(item_id)
    neglist = set()
    neglist.add(item_id)
    for t in range(99):
        j = np.random.randint(movie_size)
        while (u, j) in movie_trainDict or j in neglist:
            j = np.random.randint(book_size)
        neglist.add(j)
        tmp_item.append(j)
    item_a.append(tmp_item)
    

In [361]:
g.ndata['s2v'] = {'user':torch.tensor(user_model.docvecs.vectors),
                     'movie':torch.from_numpy(np.array(movie_feature)),
                     'book': torch.from_numpy(np.array(book_feature))}

  g.ndata['s2v'] = {'user':torch.tensor(user_model.docvecs.vectors),


In [362]:
dataset = {'train_graph':g,
          'test_user':user,
          'test_item_a':item_a,
          'test_item_b':item_b}
with open('./data_s2v_v2.pkl', 'wb') as f:
    pickle.dump(dataset, f)



In [338]:
a = torch.rand((100,16))
b = torch.rand((100,16))
torch.stack([a,b]).permute(1,0,2).shape

torch.Size([100, 2, 16])

In [363]:
{'a':1,'b':2} + {'c':2,'d':5}

TypeError: unsupported operand type(s) for +: 'dict' and 'dict'

In [364]:
dict = {'a': 1, 'b': 2, 'b': '3'}

SyntaxError: can't use starred expression here (<ipython-input-365-8441f717ac2c>, line 1)