In [24]:
import re
import os
import gensim
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
import json
import time
from collections import Counter
from tqdm import tqdm

In [25]:
# Input files, given relative to the notebook's working directory.
trainfilename = '../../data/train.tsv'
validfilename = '../../data/valid.tsv'
testfilename = '../../data/personalized_test.tsv'
docsfilename = '../../data/news.tsv'
# NLTK English stop-word list as a set; not referenced by the other cells visible here.
stop_words = set(stopwords.words('english'))

In [26]:
# Sequence-length limits (in tokens / clicked items) and embedding width
# shared by the preprocessing functions below.
MAX_TITLE_LEN, MAX_BODY_LEN, MAX_CONTENT_LEN = 16, 100, 500
MAX_CLICK_LEN = 50
WORD_EMBEDDING_DIM = 300
# NOTE(review): WORD_FREQ_THRESHOLD, word2freq and word2index are not read by
# any cell visible in this notebook (read_news uses its own filer_num default).
WORD_FREQ_THRESHOLD = 3

word2freq, word2index = {}, {}

In [27]:
def word_tokenize(sent):
    """Lowercase a string and split it into word and punctuation tokens.

    A token is either a run of word characters or one of the single
    characters ``. , ! ? ; |``. Any input that is not a ``str`` produces an
    empty list.

    Note: this definition replaces the ``word_tokenize`` imported from
    ``nltk.tokenize`` at the top of the notebook.
    """
    if not isinstance(sent, str):
        return []
    return re.findall(r'[\w]+|[.,!?;|]', sent.lower())

In [28]:
def read_news(filename,filer_num=3):
    """Load the news table and build the lookup structures used downstream.

    The file is read as tab-separated text with a header row. Only the first
    five columns are used, in the order (doc id, category, <ignored>, title,
    snippet). Missing cells are replaced by a single space, which tokenizes
    to an empty list.

    Args:
        filename: Path to the tab-separated news file.
        filer_num: Minimum number of occurrences (counted over all titles and
            snippets) a token needs in order to receive a vocabulary id.

    Returns:
        A 4-tuple ``(news, news_index, category_dict, word_dict)`` where
        ``news`` maps doc id -> ``[category, title_tokens, snippet_tokens]``,
        ``news_index`` maps doc id -> 1-based row number (0 is left free),
        ``category_dict`` maps category -> id starting at 1, assigned in
        order of first appearance so the mapping is reproducible, and
        ``word_dict`` maps token -> id starting at 3 (0: pad; 1: <sos>;
        2: <eos>).
    """
    news_data = pd.read_csv(filename, sep='\t').fillna(value=" ")

    news = {}
    news_index = {}
    categories = []
    word_cnt = Counter()

    # itertuples avoids building one Series per row, which is what made the
    # original .loc[i, :] loop slow.
    rows = news_data.iloc[:, :5].itertuples(index=False, name=None)
    for index, (doc_id, vert, _, title, snipplet) in enumerate(
            tqdm(rows, total=len(news_data)), start=1):
        news_index[doc_id] = index

        title = word_tokenize(title.lower())
        snipplet = word_tokenize(snipplet.lower())
        categories.append(vert)
        news[doc_id] = [vert, title, snipplet]
        word_cnt.update(snipplet + title)

    # 0: pad; 1: <sos>; 2: <eos>  -> real tokens start at id 3.
    frequent = [token for token, count in word_cnt.items() if count >= filer_num]
    word_dict = {token: idx for idx, token in enumerate(frequent, start=3)}

    # dict.fromkeys de-duplicates while keeping first-appearance order; a
    # plain set() would order string categories differently from run to run.
    category_dict = {
        cat: idx for idx, cat in enumerate(dict.fromkeys(categories), start=1)
    }

    return news, news_index, category_dict, word_dict

In [29]:
# Parse the news file into the four lookup structures; %time reports the wall time of the call.
%time news,news_index,category_dict,word_dict = read_news(docsfilename)

100%|██████████| 113762/113762 [01:19<00:00, 1434.62it/s]


Wall time: 2min 12s


In [30]:
# Reserve the low ids for special tokens (read_news starts real tokens at 3).
# NOTE(review): 'unk' shares id 0 with padding (the encoders below leave unused
# positions at 0), and if the corpus itself contains the token 'unk' its
# original id is overwritten here — confirm this is intended.
word_dict.update({'unk': 0, '<sos>': 1, '<eos>': 2})

In [31]:
# Persist the lookup tables and the tokenized news with pickle.
# The ../../data2 directory must already exist; open() does not create it.
with open('../../data2/dict.pkl', 'wb') as f:
    pickle.dump([news_index,category_dict,word_dict], f)
with open('../../data2/news.pkl', 'wb') as f:
    pickle.dump(news, f)

## get inputs for user encoder

In [32]:
def get_rep_for_userencoder(news,news_index,category_dict,word_dict,
                            max_title_len=None,max_body_len=None):
    """Encode every news item as fixed-width id arrays for the user encoder.

    Args:
        news: doc id -> ``[category, title_tokens, body_tokens]``.
        news_index: doc id -> row index into the returned arrays. Rows are
            allocated for indices ``0 .. len(news)``.
        category_dict: category -> integer id.
        word_dict: token -> integer id. Tokens are looked up lowercased.
        max_title_len: Width of the title array; defaults to ``MAX_TITLE_LEN``.
        max_body_len: Width of the body array; defaults to ``MAX_BODY_LEN``.

    Returns:
        ``(news_vert, news_title, news_body)`` as int32 arrays of shape
        ``(len(news)+1,)``, ``(len(news)+1, max_title_len)`` and
        ``(len(news)+1, max_body_len)``. Unused positions stay 0.
    """
    if max_title_len is None:
        max_title_len = MAX_TITLE_LEN
    if max_body_len is None:
        max_body_len = MAX_BODY_LEN

    def fill_row(row, tokens):
        # Only the first len(row) raw tokens are considered (truncation
        # happens before the vocabulary filter, as in the original code);
        # out-of-vocabulary tokens are dropped and the kept ids packed left.
        pos = 0
        for token in tokens[:len(row)]:
            # Membership test and lookup use the same lowercased key, so a
            # mixed-case token can neither raise KeyError nor be skipped
            # while its lowercase form is in the vocabulary.
            idx = word_dict.get(token.lower())
            if idx is not None:
                row[pos] = idx
                pos += 1

    news_num = len(news) + 1
    news_title = np.zeros((news_num, max_title_len), dtype='int32')
    news_body = np.zeros((news_num, max_body_len), dtype='int32')
    news_vert = np.zeros((news_num), dtype='int32')

    for key, (vert, title, body) in news.items():
        doc_index = news_index[key]
        news_vert[doc_index] = category_dict[vert]
        # Indexing a 2-D array with one integer yields a view, so fill_row
        # writes straight into the output arrays.
        fill_row(news_title[doc_index], title)
        fill_row(news_body[doc_index], body)

    return news_vert, news_title, news_body

In [33]:
# Build the category / title / body id arrays for all news items (timed).
%time news_vert, news_title, news_body = get_rep_for_userencoder(news,news_index,category_dict,word_dict)

Wall time: 5.09 s


In [34]:
# Sanity check: each array has len(news) + 1 rows (row 0 is not assigned to any news item).
len(news_vert),len(news_title), len(news_body)

(113763, 113763, 113763)

In [35]:
# Write the user-encoder inputs as .npy files (../../data2 must already exist).
np.save('../../data2/news_vert.npy', news_vert)
np.save('../../data2/news_title.npy', news_title)
np.save('../../data2/news_body.npy', news_body)

## Get inputs/targets for the seq2seq model

In [36]:
def get_rep_for_seq2seq(news,news_index,word_dict,
                        max_content_len=None,max_title_len=None):
    """Encode news bodies and titles as source/target arrays for seq2seq.

    For each news item:
      * ``sources`` holds the in-vocabulary body ids followed by the end id 2;
      * ``target_inputs`` holds the start id 1 followed by the title ids;
      * ``target_outputs`` holds the title ids followed by the end id 2.
    Only the first ``max_len - 1`` raw tokens are considered for each
    sequence, so the start/end marker always fits. Unused positions stay 0.

    Args:
        news: doc id -> ``[category, title_tokens, body_tokens]``.
        news_index: doc id -> row index. Rows are allocated for indices
            ``0 .. len(news)``.
        word_dict: token -> integer id. Tokens are looked up lowercased.
        max_content_len: Width of ``sources``; defaults to ``MAX_CONTENT_LEN``.
        max_title_len: Width of both target arrays; defaults to
            ``MAX_TITLE_LEN``.

    Returns:
        ``(sources, target_inputs, target_outputs)`` as int32 arrays.
    """
    if max_content_len is None:
        max_content_len = MAX_CONTENT_LEN
    if max_title_len is None:
        max_title_len = MAX_TITLE_LEN
    sos_id, eos_id = 1, 2

    def encode(tokens, limit):
        # Look at the first `limit` raw tokens and keep those in the
        # vocabulary. Membership test and lookup share the lowercased key.
        ids = []
        for token in tokens[:max(limit, 0)]:
            idx = word_dict.get(token.lower())
            if idx is not None:
                ids.append(idx)
        return np.array(ids, dtype='int32')

    news_num = len(news) + 1
    sources = np.zeros((news_num, max_content_len), dtype='int32')
    target_inputs = np.zeros((news_num, max_title_len), dtype='int32')
    target_outputs = np.zeros((news_num, max_title_len), dtype='int32')

    for key in tqdm(news):
        _, title, body = news[key]
        doc_index = news_index[key]

        body_ids = encode(body, max_content_len - 1)
        sources[doc_index, :len(body_ids)] = body_ids
        sources[doc_index, len(body_ids)] = eos_id

        # The title is encoded once and written to both target arrays,
        # shifted by one position for the decoder input.
        title_ids = encode(title, max_title_len - 1)
        target_inputs[doc_index, 0] = sos_id
        target_inputs[doc_index, 1:len(title_ids) + 1] = title_ids
        target_outputs[doc_index, :len(title_ids)] = title_ids
        target_outputs[doc_index, len(title_ids)] = eos_id

    return sources, target_inputs, target_outputs

In [37]:
# Build the seq2seq source / decoder-input / decoder-target arrays (timed).
%time sources, target_inputs, target_outputs = get_rep_for_seq2seq(news,news_index,word_dict)

100%|██████████| 113762/113762 [00:18<00:00, 6098.27it/s]

Wall time: 18.7 s





In [38]:
# Write the seq2seq arrays as .npy files (../../data2 must already exist).
np.save('../../data2/sources.npy', sources)
np.save('../../data2/target_inputs.npy', target_inputs)
np.save('../../data2/target_outputs.npy', target_outputs)

## get embedding matrix

In [39]:
def load_matrix(embedding_path,word_dict):
    """Build the embedding matrix for ``word_dict`` from a GloVe text file.

    Row 0 starts as zeros and every other row is drawn from N(0, 0.1). Rows
    whose word appears in ``glove.840B.300d.txt`` (inside ``embedding_path``)
    are then overwritten with the vector from the file.

    Args:
        embedding_path: Directory containing ``glove.840B.300d.txt``.
        word_dict: token -> integer row index.

    Returns:
        ``(embedding_matrix, have_word)``: a float array with
        ``WORD_EMBEDDING_DIM`` columns and one row per id, and the list of
        words found in the file (one entry per matching line).
    """
    mu, sigma = 0, 0.1
    # Size by the largest id as well as the entry count: if ids are not dense
    # (e.g. an existing word was remapped to 0), len(word_dict) alone would
    # leave the highest id without a row.
    num_rows = max(1, len(word_dict), max(word_dict.values(), default=0) + 1)
    embedding_matrix = np.concatenate((
        np.zeros((1, WORD_EMBEDDING_DIM)),
        np.random.normal(mu, sigma, (num_rows - 1, WORD_EMBEDDING_DIM)),
    ))
    have_word = []
    # The file is read as bytes on purpose: bytes.split() only splits on
    # ASCII whitespace, so a token containing a non-ASCII space stays intact.
    with open(os.path.join(embedding_path,'glove.840B.300d.txt'),'rb') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            word = fields[0].decode()
            index = word_dict.get(word)
            if index is None:
                continue
            # NOTE(review): a word mapped to id 0 overwrites the zero row
            # here; confirm that is acceptable if row 0 is meant as padding.
            embedding_matrix[index] = np.array([float(x) for x in fields[1:]])
            have_word.append(word)
    return embedding_matrix,have_word

In [40]:
# Build the embedding matrix from the GloVe file under ../../data (timed).
%time embedding_matrix, have_word = load_matrix('../../data',word_dict)

Wall time: 46.4 s


In [41]:
# Vocabulary size vs. number of embedding-file lines whose word is in the vocabulary.
len(word_dict),len(have_word)

(141910, 100875)

In [42]:
# Write the embedding matrix as a .npy file (../../data2 must already exist).
np.save('../../data2/embedding_matrix.npy', embedding_matrix)

## Get train/valid/test examples from user logs

In [43]:
def Doc2ID(doclist,news2id):
    """Map document ids to their integer indices, dropping unknown ids.

    The order of ``doclist`` is preserved. Ids missing from ``news2id`` are
    skipped, so the result may be shorter than the input.
    """
    indices = []
    for doc in doclist:
        if doc in news2id:
            indices.append(news2id[doc])
    return indices

In [44]:
def PadDoc(doclist):
    """Return a click history of exactly ``MAX_CLICK_LEN`` entries.

    Shorter lists are left-padded with 0; longer lists keep only their last
    ``MAX_CLICK_LEN`` items.
    """
    missing = MAX_CLICK_LEN - len(doclist)
    if missing > 0:
        return [0] * missing + doclist[:MAX_CLICK_LEN]
    return doclist[-MAX_CLICK_LEN:]

In [45]:
def user2dict(users):
    """Assign each distinct user a 0-based integer id.

    Ids follow the iteration order of a ``set`` built from ``users``, so the
    particular id a user receives is not guaranteed to be stable across runs.
    """
    user_set = set(users)
    return {user: idx for idx, user in enumerate(user_set)}

In [46]:
def parse_train_user(filename,news_index):
    """Read the training log and build click histories plus pos/neg pairs.

    The file is tab-separated with ``ClicknewsID``, ``pos`` and ``neg``
    columns, each holding whitespace-separated news ids. Missing cells are
    treated as empty.

    Returns:
        ``(histories, samples)``. ``histories[i]`` is the padded list of news
        indices clicked by the user in row ``i``. Each sample is
        ``[row_index, [pos_index, neg_index], [1, 0]]``. For every row with
        at least one positive and one negative, both lists are shuffled with
        ``np.random`` and paired up to the shorter length, i.e. one negative
        per positive.
    """
    def to_indices(cell):
        return Doc2ID(cell.split(), news_index)

    df = pd.read_csv(filename, sep='\t').fillna(value=" ")

    df['ClicknewsID'] = df['ClicknewsID'].apply(lambda cell: PadDoc(to_indices(cell)))
    df['pos'] = df['pos'].apply(to_indices)
    df['neg'] = df['neg'].apply(to_indices)

    pos_neg_lists = []
    per_user = zip(df['pos'].values.tolist(), df['neg'].values.tolist())
    for userindex, (pos_list, neg_list) in tqdm(enumerate(per_user)):
        if not pos_list or not neg_list:
            continue
        # sampling 1 negative sample for 1 pos sample: zip stops at the
        # shorter of the two shuffled lists.
        np.random.shuffle(pos_list)
        np.random.shuffle(neg_list)
        pos_neg_lists.extend(
            [userindex, [pos, neg], [1, 0]] for pos, neg in zip(pos_list, neg_list)
        )

    return df['ClicknewsID'].values.tolist(), pos_neg_lists

In [47]:
# Build training click histories and 1:1 positive/negative samples (timed).
%time TrainUsers, TrainSamples = parse_train_user(trainfilename, news_index)

400000it [00:06, 63572.94it/s] 


Wall time: 33.7 s


In [48]:
# Persist the training histories and samples with pickle (../../data2 must already exist).
with open('../../data2/TrainUsers.pkl', 'wb') as f:
    pickle.dump(TrainUsers, f)
with open('../../data2/TrainSamples.pkl', 'wb') as f:
    pickle.dump(TrainSamples, f)

In [49]:
def parse_valid_user(filename,news_index):
    """Read the validation log and build click histories plus candidate lists.

    The file is tab-separated with ``ClicknewsID``, ``pos`` and ``neg``
    columns, each holding whitespace-separated news ids. Missing cells are
    treated as empty.

    Returns:
        ``(histories, samples)``. ``histories[i]`` is the padded list of news
        indices clicked by the user in row ``i``. For every row with at least
        one positive and one negative there is one sample
        ``[row_index, positives + negatives, labels]`` with label 1 for each
        positive and 0 for each negative.
    """
    def to_indices(cell):
        return Doc2ID(cell.split(), news_index)

    df = pd.read_csv(filename, sep='\t').fillna(value=" ")

    df['ClicknewsID'] = df['ClicknewsID'].apply(lambda cell: PadDoc(to_indices(cell)))
    df['pos'] = df['pos'].apply(to_indices)
    df['neg'] = df['neg'].apply(to_indices)

    pos_neg_lists = []
    per_user = zip(df['pos'].values.tolist(), df['neg'].values.tolist())
    for userindex, (pos_list, neg_list) in enumerate(per_user):
        if not pos_list or not neg_list:
            continue
        labels = [1] * len(pos_list) + [0] * len(neg_list)
        pos_neg_lists.append([userindex, pos_list + neg_list, labels])

    return df['ClicknewsID'].values.tolist(), pos_neg_lists

In [50]:
# Build validation click histories and labelled candidate lists (timed).
%time ValidUsers, ValidSamples = parse_valid_user(validfilename,news_index)

Wall time: 12.4 s


In [51]:
# Persist the validation histories and samples with pickle (../../data2 must already exist).
with open('../../data2/ValidUsers.pkl', 'wb') as f:
    pickle.dump(ValidUsers, f)
with open('../../data2/ValidSamples.pkl', 'wb') as f:
    pickle.dump(ValidSamples, f)

In [52]:
def parse_test_user(filename,news_index):
    """Read the personalized test file into click histories and title samples.

    The file is tab-separated with ``clicknewsID`` and ``posnewID`` columns
    (comma-separated news ids) and a ``rewrite_titles`` column (titles
    separated by ``;;``, aligned position by position with ``posnewID``).

    Each news id is paired with its rewritten title *before* unknown ids are
    dropped. Filtering the ids first would shift every later title onto the
    wrong news item as soon as one id is missing from ``news_index``.

    Args:
        filename: Path to the tab-separated test file.
        news_index: doc id -> integer news index.

    Returns:
        ``(histories, samples)``. ``histories[i]`` is the padded list of news
        indices clicked by the user in row ``i``. Each sample is
        ``[row_index, news_index_of_positive, lowercased_rewrite_title]``.
        Pairs whose id is unknown or whose title is blank are skipped.
    """
    df = pd.read_csv(filename, sep='\t')

    click_histories = [
        PadDoc(Doc2ID(clicks.split(','), news_index))
        for clicks in df['clicknewsID']
    ]

    pos_lists = []
    for userindex, (pos_field, titles_field) in enumerate(
            zip(df['posnewID'], df['rewrite_titles'])):
        titles = [title.lower() for title in titles_field.split(';;')]
        for news_id, rewrite_title in zip(pos_field.split(','), titles):
            if news_id not in news_index:
                continue
            if rewrite_title.strip() == '':
                continue
            pos_lists.append([userindex, news_index[news_id], rewrite_title])

    return click_histories, pos_lists

In [53]:
# Build test click histories and (user row, news index, rewritten title) samples (timed).
%time TestUsers, TestSamples = parse_test_user(testfilename,news_index)

Wall time: 60.8 ms


In [54]:
# Persist the test histories and samples with pickle (../../data2 must already exist).
with open('../../data2/TestUsers.pkl', 'wb') as f:
    pickle.dump(TestUsers, f)
with open('../../data2/TestSamples.pkl', 'wb') as f:
    pickle.dump(TestSamples, f)

In [55]:
# Peek at the first test sample: [user row index, news index, lowercased rewritten title].
TestSamples[0]

[0,
 14111,
 "legal battle looms over trump epa's rule change of obama's clean power plan rule"]