In [12]:
import pandas as pd
import swifter
import json
from tqdm import tqdm
from os import path
import random
from nltk.tokenize import word_tokenize
import numpy as np
import csv
import importlib
import nltk
from config_nghia import model_name


In [13]:
# Resolve the model-specific config class from config_nghia by name
# (the class is expected to be called "<model_name>Config").
_config_attr = f"{model_name}Config"
try:
    config = getattr(importlib.import_module('config_nghia'), _config_attr)
except AttributeError:
    # No matching config class exists for the selected model.
    print(f"{model_name} not included!")
    exit()

In [14]:
# Show which config class was resolved for the selected model.
print(config)

<class 'config_nghia.NRMSConfig'>


In [15]:
# !pip install swifter
# nltk.download('punkt')

In [16]:
def parse_row(row,category2int,entities2int,word2int):
    """Encode one news row as integer ids.

    Args:
        row: record exposing ``news_id``, ``category``, ``subcategory``, ``title``,
            ``abstract``, ``title_entities`` and ``abstract_entities``; the two
            entity fields are JSON strings holding a list of entity dicts.
        category2int: mapping used for both category and subcategory (unknown -> 0).
        entities2int: mapping from WikidataId to entity id.
        word2int: mapping from lower-cased token to word id.

    Returns:
        pd.Series indexed by ``id, category, subcategory, title, abstract,
        title_entities, abstract_entities``. The last four fields are lists of
        length ``config.num_words_title`` / ``config.num_words_abstract``, padded
        with 0; tokens beyond that length are dropped.
    """
    # Map every lower-cased surface-form token to the id of its entity.
    # Title entities are processed first, so abstract entities win on conflicts.
    token2entity = {}
    for entities_json in (row.title_entities, row.abstract_entities):
        for entity in json.loads(entities_json):
            if entity['Confidence'] > config.entity_confidence_threshold and entity['WikidataId'] in entities2int:
                entity_id = entities2int[entity['WikidataId']]
                for token in ' '.join(entity['SurfaceForms']).lower().split():
                    token2entity[token] = entity_id

    def encode(text, length):
        """Return (word ids, entity ids) for the first ``length`` tokens of ``text``."""
        word_ids = [0] * length
        entity_ids = [0] * length
        # zip with range(length) bounds the positions, so no IndexError can occur.
        for position, token in zip(range(length), word_tokenize(text.lower())):
            if token in word2int:
                word_ids[position] = word2int[token]
                # An entity id is recorded only for tokens that are also in the vocabulary.
                if token in token2entity:
                    entity_ids[position] = token2entity[token]
        return word_ids, entity_ids

    title_words, title_entities = encode(row.title, config.num_words_title)
    abstract_words, abstract_entities = encode(row.abstract, config.num_words_abstract)

    return pd.Series(
        [
            row.news_id,
            category2int.get(row.category, 0),
            category2int.get(row.subcategory, 0),
            title_words,
            abstract_words,
            title_entities,
            abstract_entities,
        ],
        index=['id', 'category', 'subcategory', 'title', 'abstract', 'title_entities', 'abstract_entities'],
    )

In [17]:
def convert_dict_to_csv(dict_data , columns ,file_name ):
    """Write a mapping to a tab-separated file, one row per (key, value) item.

    Args:
        dict_data: mapping to save; rows follow the mapping's iteration order.
        columns: the two header names, for the key column and the value column.
        file_name: destination path.
    """
    rows = list(dict_data.items())
    table = pd.DataFrame(rows, columns=columns)
    table.to_csv(file_name, sep='\t', index=False)
                

In [18]:
# News ID
# Category
# SubCategory
# Title
# Abstract
# URL
# Title Entities (entities contained in the title of this news)
# Abstract Entities (entities contained in the abstract of this news)
def parse_news(sourse, target, category2int_path, word2int_path,
               entity2int_path, mode, max_rows=1000):
    '''Encode a news table as integer ids and save it as a TSV file.

    Args:
        sourse: path to the headerless, tab-separated news file whose first eight
            columns are news_id, category, subcategory, title, abstract, url,
            title_entities and abstract_entities.
        target: path where the encoded news table is written.
        category2int_path, word2int_path, entity2int_path: paths of the
            category / word / entity id tables. In 'train' mode they are
            written; in 'test' mode they are read.
        mode: 'train' builds the id tables from the data; 'test' reuses saved ones.
        max_rows: only the first ``max_rows`` rows are processed. The default of
            1000 keeps the subset this notebook has always used; pass ``None``
            to process the whole file.

    Returns:
        DataFrame with one encoded row per news item (see ``parse_row``).

    Raises:
        Exception: if ``mode`` is neither 'train' nor 'test'.
    '''
    # Reject a bad mode before doing any I/O.
    if mode not in ('train', 'test'):
        raise Exception('mode must be train or test')

    print('load from : ', sourse)
    news = pd.read_table(sourse,
                         header=None,
                         usecols=[0, 1, 2, 3, 4, 5, 6, 7],
                         names=['news_id', 'category', 'subcategory', 'title',
                                'abstract', 'url', 'title_entities', 'abstract_entities'])
    if max_rows is not None:
        news = news.iloc[:max_rows]

    # Missing entity lists become an empty JSON list, every other missing field a blank.
    # fillna returns a new frame, so nothing is written through a slice of the original.
    news = news.fillna({'title_entities': '[]', 'abstract_entities': '[]'}).fillna(' ')

    if mode == 'train':
        category2int = {}
        word2freq = {}
        entity2freq = {}

        for row in news.itertuples(index=False):
            # Categories and subcategories share one id space; ids start at 1 (0 = unknown).
            for name in (row.category, row.subcategory):
                if name not in category2int:
                    category2int[name] = len(category2int) + 1
            # Count token occurrences over titles and abstracts.
            for text in (row.title, row.abstract):
                for w in word_tokenize(text.lower()):
                    word2freq[w] = word2freq.get(w, 0) + 1
            # Weight each entity by (number of occurrences) * (linking confidence).
            for entities_json in (row.title_entities, row.abstract_entities):
                for e in json.loads(entities_json):
                    times = len(e['OccurrenceOffsets']) * e['Confidence']
                    entity2freq[e['WikidataId']] = entity2freq.get(e['WikidataId'], 0) + times

        # Keep only words / entities whose score is strictly above the threshold.
        word2int = {}
        for k, v in word2freq.items():
            if v > config.word_freq_threshold:
                word2int[k] = len(word2int) + 1
        entities2int = {}
        for k, v in entity2freq.items():
            if v > config.entity_freq_threshold:
                entities2int[k] = len(entities2int) + 1
    else:
        category2int = dict(pd.read_table(category2int_path).values.tolist())
        # na_filter=False keeps vocabulary entries that pandas would otherwise read as NaN.
        word2int = dict(pd.read_table(word2int_path, na_filter=False).values.tolist())
        entities2int = dict(pd.read_table(entity2int_path).values.tolist())

    # Keyword arguments keep the three mappings from being passed in the wrong order.
    parsed_news = news.swifter.apply(
        lambda row: parse_row(row,
                              category2int=category2int,
                              entities2int=entities2int,
                              word2int=word2int),
        axis=1)
    parsed_news.to_csv(target, sep='\t', index=False)

    if mode == 'train':
        # Save the id tables so 'test' mode can reuse them.
        convert_dict_to_csv(category2int, ['category', 'id'], category2int_path)
        convert_dict_to_csv(word2int, ['word', 'id'], word2int_path)
        convert_dict_to_csv(entities2int, ['entity', 'id'], entity2int_path)
        print('save to : ', target)

    return parsed_news
    

# Build the id tables and the encoded news table from the news.tsv under DATA/dev_small.
news_path = '/workspace/nabang1010/LBA_NLP/Recommendation_System/DATA/dev_small/news.tsv'
target = './save_process/news_processed.tsv'
category2int_path = './save_process/category2int.csv'
word2int_path = './save_process/word2int.csv'
entity2int_path = 'save_process/entity2int.csv'

temp = parse_news(
    sourse=news_path,
    target=target,
    category2int_path=category2int_path,
    word2int_path=word2int_path,
    entity2int_path=entity2int_path,
    mode='train',
)
temp.head()


load from :  /workspace/nabang1010/LBA_NLP/Recommendation_System/DATA/dev_small/news.tsv


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

save to :  ./save_process/news_processed.tsv


Unnamed: 0,id,category,subcategory,title,abstract,title_entities,abstract_entities
0,N55528,1,2,"[1, 2, 3, 4, 5, 6, 7, 5, 8, 6, 0, 0, 9, 0, 0, ...","[10, 1, 0, 5, 0, 5, 8, 11, 12, 1, 13, 14, 15, ...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,N18955,3,4,"[0, 19, 20, 0, 0, 21, 1, 0, 22, 23, 24, 25, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,N61837,5,6,"[1, 26, 19, 27, 22, 28, 29, 30, 1, 0, 19, 31, ...","[33, 0, 0, 0, 34, 35, 0, 19, 36, 37, 38, 1, 39...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,N53526,3,7,"[52, 44, 45, 53, 54, 18, 55, 22, 56, 57, 0, 58...","[52, 61, 62, 52, 44, 35, 63, 5, 8, 64, 45, 53,...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, ..."
4,N38324,3,4,"[56, 42, 71, 72, 19, 73, 74, 5, 75, 42, 35, 76...","[77, 78, 0, 5, 79, 80, 22, 35, 81, 82, 83, 84,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [19]:
# NOTE(review): the bare string literal below is an expression statement with no effect;
# it looks like a leftover path and can probably be removed.
'./save_process/word_embedding.csv'
# Load the word -> id table from word2int_path, indexed by word.
# na_filter=False stops pandas from converting any entry to NaN.
word2int = pd.read_table(word2int_path , na_filter = False , index_col= 'word')
word2int.head()

Unnamed: 0_level_0,id
word,Unnamed: 1_level_1
the,1
brands,2
queen,3
elizabeth,4
",",5


In [20]:
# Absolute path to the pretrained GloVe vectors (the file name suggests the 50-dimensional 6B set).
glove_path =  '/workspace/nabang1010/LBA_NLP/Recommendation_System/REPO/NRMS-Pytorch/word2vec/glove/glove.6B.50d.txt'
def generate_word_embedding(sourse , target , word2int_path  , emb_dim  = 50):
    '''Build the word-embedding matrix of the vocabulary from pretrained GloVe vectors.

    Row ``i`` of the saved matrix is the vector of the word whose id is ``i``.
    Ids in ``0..len(vocabulary)`` with no pretrained vector get a vector drawn
    from a standard normal distribution.

    Args:
        sourse: path to the GloVe text file (one ``word v1 ... vN`` line per word,
            space separated).
        target: path passed to ``np.save`` for the resulting matrix.
        word2int_path: path of the tab-separated table with ``word`` and ``id`` columns.
        emb_dim: number of values per vector in the GloVe file.

    Returns:
        None. The matrix is written to ``target`` and the missing rate is printed.
    '''
    word2int = pd.read_table(word2int_path , na_filter = False , index_col= 'word')
    vocab_size = len(word2int)

    # QUOTE_NONE: quote characters are tokens in the GloVe file, not quoting.
    source_embedding = pd.read_table(sourse ,
                                     index_col= 0 ,
                                     sep = ' ',
                                     header = None,
                                     quoting = csv.QUOTE_NONE ,
                                     names = range(emb_dim),
                                     )
    source_embedding = source_embedding.rename_axis('word')

    # Keep only vocabulary words that have a pretrained vector, then index them by id.
    merged = word2int.merge(source_embedding ,
                            how = 'inner' ,
                            right_on= 'word' ,
                            left_on = 'word'
                            )
    merged = merged.set_index('id')

    # Every id in 0..vocab_size without a pretrained vector gets a random one.
    missed_index = np.setdiff1d(np.arange(vocab_size + 1) , merged.index.values)
    missed_embedding = pd.DataFrame(
        data = np.random.normal(size = (len(missed_index) , emb_dim)),
        index = pd.Index(missed_index , name = 'id'))
    merged = pd.concat([merged , missed_embedding]).sort_index()

    # One missed id is subtracted so id 0 is not counted as a missing word
    # (assumes word ids start at 1, leaving 0 unused - TODO confirm for other vocab files).
    # An empty vocabulary reports a rate of 0 instead of dividing by zero.
    missed_rate = (len(missed_index) - 1) / vocab_size if vocab_size else 0.0
    print(
    f'Rate of word missed in pretrained embedding: {missed_rate:.4f}'
    )
    np.save(target , merged.values)
# Build and save the word-embedding matrix; np.save appends ".npy" to the target path.
generate_word_embedding(glove_path , './save_process/word_embedding' , word2int_path)
# df.head()

Rate of word missed in pretrained embedding: 0.0165


In [21]:
def transform_entity_embedding(source, target, entity2int_path):
    """Build the entity-embedding matrix for the entity id table.

    Row ``i`` of the saved matrix is the pretrained vector of the entity whose
    id is ``i``. Rows with no pretrained vector, including row 0, keep values
    drawn from a standard normal distribution.

    Args:
        source: path of the tab-separated embedding file; column 0 is the entity
            key and columns 1..100 are the vector.
        target: path passed to ``np.save`` for the resulting matrix.
        entity2int_path: path of the tab-separated table with ``entity`` and
            ``id`` columns.
    """
    entity_embedding = pd.read_table(source, header=None)
    # Collapse the vector columns into one list-valued column.
    # NOTE(review): the slice hard-codes 100 dimensions and must agree with
    # config.entity_embedding_dim, otherwise the row assignment below fails.
    entity_embedding['vector'] = entity_embedding.iloc[:, 1:101].values.tolist()
    entity_embedding = entity_embedding[[0, 'vector']].rename(columns={0: "entity"})

    entity2int = pd.read_table(entity2int_path)
    # Inner join: keep only entities present in both the embedding file and the id table.
    merged_df = pd.merge(entity_embedding, entity2int,
                         on='entity').sort_values('id')

    # Start from random vectors, then overwrite the rows that have a pretrained vector.
    entity_embedding_transformed = np.random.normal(
        size=(len(entity2int) + 1, config.entity_embedding_dim))
    for row in merged_df.itertuples(index=False):
        entity_embedding_transformed[row.id] = row.vector
    np.save(target, entity_embedding_transformed)
# Convert the entity vectors under DATA/dev_small into a matrix aligned with entity2int.csv.
# Note: this rebinds the notebook-level names `source`, `target` and `entity2int_path`.
source = '/workspace/nabang1010/LBA_NLP/Recommendation_System/DATA/dev_small/entity_embedding.vec'
target = './save_process/entity_embedding'
entity2int_path = './save_process/entity2int.csv'
transform_entity_embedding(source, target, entity2int_path)

In [22]:
# Impression ID. The ID of an impression.
# User ID. The anonymous ID of a user.
# Time. The impression time with format "MM/DD/YYYY HH:MM:SS AM/PM".
# History. The news click history (ID list of clicked news) of this user before this impression. The clicked news articles are ordered by time.
# Impressions. List of news displayed in this impression and user's click behaviors on them (1 for click and 0 for non-click). The orders of news in a impressions have been shuffled.



def parse_behavior(target , source , user2int_path, max_rows=1000):
    """Turn the impression log into balanced training samples and save them.

    Each impression is split into groups of one clicked item followed by
    ``config.negative_sampling_ratio`` randomly chosen non-clicked items.
    A group is dropped when the impression runs out of non-clicked items.

    Args:
        target: path where the processed table is written (tab separated, columns
            ``user_id``, ``history``, ``candidate_news``, ``clicked``).
        source: path to the headerless, tab-separated behaviors file with columns
            impression_id, user_id, time, history, impressions; impressions is a
            space-separated list of ``<news_id>-<label>`` entries.
        user2int_path: path where the user -> id table is written.
        max_rows: only the first ``max_rows`` rows are processed. The default of
            1000 keeps the subset this notebook has always used; pass ``None``
            to process the whole file.

    Returns:
        DataFrame with one row per balanced sample.
    """
    df = pd.read_table(source , header = None,
                       names = ['impression_id' , 'user_id' , 'time' , 'history' , 'impressions'])
    if max_rows is not None:
        df = df.iloc[:max_rows]
    # Work on an explicit copy so the assignments below never write through a slice.
    df = df.copy()

    df['history'] = df['history'].fillna(' ')
    df['impressions'] = df['impressions'].apply(lambda x: x.split())

    # Users get consecutive ids starting at 0, in order of first appearance.
    user2int = {userid : i for i , userid in enumerate(df['user_id'].unique())}
    pd.DataFrame(list(user2int.items()) , columns = ['user', 'id']).to_csv(user2int_path , sep = '\t' , index = False)
    df['user_id'] = df['user_id'].apply(lambda x : user2int[x])

    ratio = max(config.negative_sampling_ratio , 0)
    balanced = []
    for impressions in tqdm(df['impressions'] , total = len(df) , desc = 'balance data'):
        positive = [i for i in impressions if i.endswith('1')]
        negative = [i for i in impressions if i.endswith('0')]
        # random.shuffle works in place and returns None.
        random.shuffle(negative)
        # Each group needs `ratio` unused negatives; incomplete groups are dropped.
        if ratio == 0:
            n_groups = len(positive)
        else:
            n_groups = min(len(positive) , len(negative) // ratio)
        balanced.append([
            [positive[g]] + negative[g * ratio:(g + 1) * ratio]
            for g in range(n_groups)
        ])
    df['impressions'] = pd.Series(balanced , index = df.index , dtype = object)

    # One row per group; impressions that produced no group become NaN and are dropped.
    df = df.explode('impressions').dropna(subset = ['impressions']).reset_index(drop = True)

    # Split "<news_id>-<label>" entries into two parallel space-separated strings.
    df['candidate_news'] = df['impressions'].map(
        lambda group: ' '.join(e.split('-')[0] for e in group))
    df['clicked'] = df['impressions'].map(
        lambda group: ' '.join(e.split('-')[1] for e in group))

    df.to_csv(target , sep = '\t' , index = False ,
              columns= ['user_id' , 'history' , 'candidate_news' , 'clicked' ])

    return df
# Process the behaviors.tsv under DATA/dev_small and save the balanced samples.
# Note: this rebinds the notebook-level names `source`, `target`, `temp` and `df`.
behavior2int_path = './save_process/behavior2int.csv'
source = '/workspace/nabang1010/LBA_NLP/Recommendation_System/DATA/dev_small/behaviors.tsv'
target = './save_process/behaviors_processed.tsv'
temp = parse_behavior(target , source , behavior2int_path)
# temp.head()
# Read the saved file back to check what was actually written to disk.
df = pd.read_table(target)
df.head()


    

balance data: 1000it [00:00, 18812.08it/s]


Unnamed: 0,user_id,history,candidate_news,clicked
0,0,N55189 N46039 N51741 N53234 N11276 N264 N40716...,N31958 N34130 N48740,1 0 0
1,1,N58715 N32109 N51180 N33438 N54827 N28488 N611...,N23513 N31958 N46976,1 0 0
2,2,N56253 N1150 N55189 N16233 N61704 N51706 N5303...,N5940 N42844 N5472,1 0 0
3,3,N63554 N49153 N28678 N23232 N43369 N58518 N444...,N15347 N45057 N24802,1 0 0
4,4,N51692 N18285 N26015 N22679 N55556,N5940 N62365 N23513,1 0 0
