In [51]:
import pandas as pd
import swifter
import json
from tqdm import tqdm
from os import path
import random
from nltk.tokenize import word_tokenize
import numpy as np
import csv
import importlib
import nltk
from config_nghia import model_name


In [52]:
# Resolve the configuration class named "<model_name>Config" inside the
# config_nghia module; stop the session if the module has no such class.
try:
    _config_module = importlib.import_module('config_nghia')
    config = getattr(_config_module, f"{model_name}Config")
except AttributeError:
    print(f"{model_name} not included!")
    exit()

In [53]:
# Show which configuration class was resolved for the selected model.
print(config)

<class 'config_nghia.NRMSConfig'>


In [54]:
# !pip install swifter
# nltk.download('punkt')

In [55]:
def parse_row(row, category2int, entities2int, word2int):
    """Encode a single news row as integer ids.

    Reads ``num_words_title``, ``num_words_abstract`` and
    ``entity_confidence_threshold`` from the module-level ``config``.

    Args:
        row: record exposing the attributes ``news_id``, ``category``,
            ``subcategory``, ``title``, ``abstract``, ``title_entities`` and
            ``abstract_entities``. The two ``*_entities`` fields are JSON
            strings holding a list of objects with at least the keys
            ``Confidence``, ``WikidataId`` and ``SurfaceForms``.
        category2int: mapping used for both the category and the subcategory;
            names missing from it are encoded as 0.
        entities2int: mapping from ``WikidataId`` to entity id.
        word2int: mapping from lower-cased token to word id.

    Returns:
        pd.Series indexed by ``id``, ``category``, ``subcategory``, ``title``,
        ``abstract``, ``title_entities`` and ``abstract_entities``. The last
        four entries are lists of fixed length (``config.num_words_title`` for
        the title fields, ``config.num_words_abstract`` for the abstract
        fields) in which 0 marks an unknown token or an unused position.
    """
    # Map every single word of an accepted entity's surface forms to the
    # entity id. The words are lower-cased because the title/abstract tokens
    # they are compared with below are lower-cased as well; without this,
    # capitalised surface forms could never match.
    local_entity_map = {}
    for raw_entities in (row.title_entities, row.abstract_entities):
        for e in json.loads(raw_entities):
            if (e['Confidence'] > config.entity_confidence_threshold
                    and e['WikidataId'] in entities2int):
                entity_id = entities2int[e['WikidataId']]
                for x in ' '.join(e['SurfaceForms']).lower().split():
                    local_entity_map[x] = entity_id

    def encode(text, max_len):
        """Return (word ids, entity ids) for ``text``, both of length ``max_len``."""
        word_ids = [0] * max_len
        entity_ids = [0] * max_len
        # Tokens beyond max_len are dropped by slicing instead of relying on
        # an IndexError, so unrelated index errors are no longer hidden.
        for i, w in enumerate(word_tokenize(text.lower())[:max_len]):
            if w in word2int:
                word_ids[i] = word2int[w]
                # An entity id is only recorded for tokens that are also in
                # the word vocabulary.
                if w in local_entity_map:
                    entity_ids[i] = local_entity_map[w]
        return word_ids, entity_ids

    title_words, title_entity_ids = encode(row.title, config.num_words_title)
    abstract_words, abstract_entity_ids = encode(row.abstract,
                                                 config.num_words_abstract)

    new_row = [row.news_id,
               category2int.get(row.category, 0),
               category2int.get(row.subcategory, 0),
               title_words, abstract_words,
               title_entity_ids, abstract_entity_ids]
    return pd.Series(new_row,
                     index=['id', 'category', 'subcategory', 'title',
                            'abstract', 'title_entities', 'abstract_entities'])

In [75]:
def convert_dict_to_csv(dict_data, columns, file_name):
    """Write a mapping to ``file_name`` as a tab-separated two-column table.

    Args:
        dict_data: mapping whose (key, value) pairs become the rows.
        columns: the two header names, for the key and the value column.
        file_name: destination path; the row index is not written.
    """
    pairs = list(dict_data.items())
    table = pd.DataFrame(pairs, columns=columns)
    table.to_csv(file_name, sep='\t', index=False)
                

In [86]:
# Columns of news.tsv, in file order:
#   News ID
#   Category
#   SubCategory
#   Title
#   Abstract
#   URL
#   Title Entities (entities contained in the title of this news)
#   Abstract Entities (entities contained in the abstract of this news)
def parse_news(sourse, target, category2int_path, word2int_path,
               entity2int_path, mode, max_rows=50):
    """Encode a raw news table as integer ids and write it to ``target``.

    The input is a header-less, tab-separated file whose first eight columns
    are news_id, category, subcategory, title, abstract, url, title_entities
    and abstract_entities.

    Args:
        sourse: path to the raw news.tsv file.
        target: path the encoded table is written to (tab-separated).
        category2int_path: path of the category dictionary.
        word2int_path: path of the word dictionary.
        entity2int_path: path of the entity dictionary.
        mode: ``'train'`` builds the three dictionaries from the data and
            saves them to the ``*_path`` arguments; ``'test'`` loads them
            from those paths instead.
        max_rows: number of leading rows to process. Defaults to 50, which
            matches the previously hard-coded debug truncation; pass ``None``
            to process the whole file.

    Returns:
        The loaded raw news DataFrame (after truncation and NaN filling),
        not the encoded one.

    Raises:
        ValueError: if ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail before any file is read or written.
    if mode not in ('train', 'test'):
        raise ValueError('mode must be train or test')

    print('load from : ', sourse)
    news = pd.read_table(sourse,
                         header=None,
                         usecols=[0, 1, 2, 3, 4, 5, 6, 7],
                         names=['news_id', 'category', 'subcategory', 'title',
                                'abstract', 'url', 'title_entities',
                                'abstract_entities'])
    if max_rows is not None:
        news = news.iloc[:max_rows]
    # Missing entity lists become an empty JSON list, every other missing
    # value a single space. The result is re-assigned rather than filled in
    # place: a chained ``news[col].fillna(..., inplace=True)`` on a slice does
    # not reliably update the frame (it is a no-op under Copy-on-Write).
    news = news.fillna({'title_entities': '[]',
                        'abstract_entities': '[]'}).fillna(' ')

    if mode == 'train':
        category2int = {}
        word2freq = {}
        entity2freq = {}

        for row in news.itertuples(index=False):
            # Categories and subcategories share one id space, starting at 1.
            for name in (row.category, row.subcategory):
                if name not in category2int:
                    category2int[name] = len(category2int) + 1
            # Token frequencies over title and abstract.
            tokens = (word_tokenize(row.title.lower())
                      + word_tokenize(row.abstract.lower()))
            for w in tokens:
                word2freq[w] = word2freq.get(w, 0) + 1
            # Entity weight: number of occurrences scaled by the confidence.
            for raw_entities in (row.title_entities, row.abstract_entities):
                for e in json.loads(raw_entities):
                    times = len(e['OccurrenceOffsets']) * e['Confidence']
                    entity2freq[e['WikidataId']] = (
                        entity2freq.get(e['WikidataId'], 0) + times)

        # The frequencies only act as a filter: every word/entity above its
        # threshold gets the next free id (starting at 1, in first-seen
        # order), so no two entries share an id.
        frequent_words = (k for k, v in word2freq.items()
                          if v > config.word_freq_threshold)
        word2int = {k: i for i, k in enumerate(frequent_words, start=1)}
        frequent_entities = (k for k, v in entity2freq.items()
                             if v > config.entity_freq_threshold)
        entities2int = {k: i for i, k in enumerate(frequent_entities, start=1)}
    else:
        category2int = dict(pd.read_table(category2int_path).values.tolist())
        # na_filter=False keeps tokens such as "null" or "nan" as strings.
        word2int = dict(
            pd.read_table(word2int_path, na_filter=False).values.tolist())
        entities2int = dict(pd.read_table(entity2int_path).values.tolist())

    # The dictionaries are passed by keyword: parse_row takes entities2int
    # before word2int, and passing them positionally in the other order
    # silently swapped the two lookups.
    def encode_row(row):
        return parse_row(row,
                         category2int=category2int,
                         entities2int=entities2int,
                         word2int=word2int)

    # swifter speeds up the row-wise apply.
    parsed = news.swifter.apply(encode_row, axis=1)
    parsed.to_csv(target, sep='\t', index=False)

    if mode == 'train':
        # Save the dictionaries so that 'test' mode can reload them.
        convert_dict_to_csv(
            category2int, ['category', 'id'], category2int_path)
        convert_dict_to_csv(word2int, ['word', 'id'], word2int_path)
        convert_dict_to_csv(entities2int, ['entity', 'id'], entity2int_path)
        print('save to : ', target)
    return news
    

# def parse_news(sourse , target, category2int_path , word2int_path ,
#                entity2int_path , mode):
# Output locations for the three dictionaries and for the encoded news table.
category2int_path = './save_process/category2int.csv'
word2int_path = './save_process/word2int.csv'
entity2int_path = 'save_process/entity2int.csv'
target = './save_process/news_processed.tsv'
# NOTE(review): absolute, machine-specific path; the dictionaries are built in
# 'train' mode from a file under dev_small — confirm that this is intended.
news_path = '/workspace/nabang1010/LBA_NLP/Recommendation_System/DATA/dev_small/news.tsv'
# parse_news returns the raw (NaN-filled) news frame, not the encoded one.
temp = parse_news(news_path, target, category2int_path,
                  word2int_path, entity2int_path, 'train')
print(temp.shape)
temp.head()


load from :  /workspace/nabang1010/LBA_NLP/Recommendation_System/DATA/dev_small/news.tsv


Pandas Apply:   0%|          | 0/50 [00:00<?, ?it/s]

save to :  ./save_process/news_processed.tsv
(50, 8)


Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N18955,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [87]:
# NOTE(review): the bare string below is an expression with no effect; it
# looks like a leftover note of the embedding output path.
'./save_process/word_embedding.csv'
# Reload the saved word dictionary, indexed by word; na_filter=False keeps
# tokens such as "null" or "nan" as plain strings instead of NaN.
word2int = pd.read_table(word2int_path , na_filter = False , index_col= 'word')
word2int.head()

Unnamed: 0_level_0,id
word,Unnamed: 1_level_1
the,1
",",2
prince,3
and,4
by,5


In [100]:
# Location of the pretrained GloVe vectors (file name indicates 6B tokens, 50 dimensions).
# NOTE(review): absolute, machine-specific path.
glove_path =  '/workspace/nabang1010/LBA_NLP/Recommendation_System/REPO/NRMS-Pytorch/word2vec/glove/glove.6B.50d.txt'
def generate_word_embedding(sourse, target, word2int_path, emb_dim=50):
    """Build the embedding matrix of the vocabulary from a GloVe-style file.

    Row ``i`` of the result holds the vector of the word whose id is ``i``.
    Ids in ``0 .. len(vocabulary)`` that have no pretrained vector get a
    vector drawn from a standard normal distribution (unseeded, so these rows
    differ between runs).

    Args:
        sourse: path to the pretrained embedding text file; each line is a
            word followed by ``emb_dim`` space-separated numbers.
        target: path handed to ``np.save``. NumPy appends ``.npy`` when the
            name does not already end with it.
        word2int_path: path to the tab-separated word dictionary with the
            columns ``word`` and ``id``.
        emb_dim: dimensionality of the pretrained vectors.

    Returns:
        pd.DataFrame indexed by ``id`` (sorted) with ``emb_dim`` columns; the
        same values that are saved to ``target``.
    """
    word2int = pd.read_table(word2int_path, na_filter=False, index_col='word')
    # With fewer names than fields, the leading field (the word) becomes the
    # index. na_filter=False keeps tokens such as "null" or "nan" as strings,
    # consistent with how the vocabulary above is read.
    source_embedding = pd.read_table(sourse,
                                     index_col=0,
                                     sep=' ',
                                     header=None,
                                     quoting=csv.QUOTE_NONE,
                                     na_filter=False,
                                     names=range(emb_dim))
    source_embedding = source_embedding.rename_axis('word')

    # Keep only vocabulary words that have a pretrained vector, keyed by id.
    known = word2int.join(source_embedding, how='inner').set_index('id')

    # Every id from 0 to the vocabulary size needs a row; ids without a
    # pretrained vector are filled with random vectors.
    all_ids = np.arange(len(word2int) + 1)
    missed_index = np.setdiff1d(all_ids, known.index.values)
    missed_embedding = pd.DataFrame(
        data=np.random.normal(size=(len(missed_index), emb_dim)),
        index=pd.Index(missed_index, name='id'))
    merged = pd.concat([known, missed_embedding]).sort_index()

    # One missing id is discounted from the rate (presumably id 0, which is
    # not assigned to any word); an empty vocabulary reports a rate of 0.
    missed_rate = ((len(missed_index) - 1) / len(word2int)
                   if len(word2int) else 0.0)
    print(
        f'Rate of word missed in pretrained embedding: {missed_rate:.4f}'
    )
    np.save(target, merged.values)
    # Returned so that callers can inspect the matrix that was saved.
    return merged
# Build and save the embedding matrix for the vocabulary in word2int_path.
# NOTE(review): the matrix is written with np.save, which appends ".npy" to a
# name that lacks it, so the file on disk is a NumPy array, not a CSV.
df = generate_word_embedding(glove_path , './save_process/word_embedding.csv' , word2int_path)
# The preview relies on generate_word_embedding returning the embedding DataFrame.
df.head()

Rate of word missed in pretrained embedding: 0.0037313432835820895


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.233005,0.742352,0.716705,1.464897,1.441201,1.518781,-0.86283,-0.270792,0.535216,0.168893,...,1.370916,-1.104349,2.016391,-0.80599,-0.415807,-0.427005,-0.301433,0.12219,-0.015293,0.410114
1,0.418,0.24968,-0.41242,0.1217,0.34527,-0.044457,-0.49688,-0.17862,-0.00066,-0.6566,...,-0.29871,-0.15749,-0.34758,-0.045637,-0.44251,0.18785,0.002785,-0.18411,-0.11514,-0.78581
2,0.013441,0.23682,-0.16899,0.40951,0.63812,0.47709,-0.42852,-0.55641,-0.364,-0.23938,...,-0.080262,0.63003,0.32111,-0.46765,0.22786,0.36034,-0.37818,-0.56657,0.044691,0.30392
3,0.98846,1.4535,-0.53081,0.10509,0.84058,0.14018,0.066562,1.3341,-0.75813,-0.35223,...,0.34331,1.1836,-0.37197,-1.1069,0.000128,-0.18202,-1.3696,-1.497,0.40618,-0.42445
4,0.26818,0.14346,-0.27877,0.016257,0.11384,0.69923,-0.51332,-0.47368,-0.33075,-0.13834,...,-0.069043,0.36885,0.25168,-0.24517,0.25381,0.1367,-0.31178,-0.6321,-0.25028,-0.38097
