In [1]:
import os, json, time, re
import random, collections, cPickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cPickle as pickle

%matplotlib inline  

In [2]:
data_path = '/Users/matt.meng/Downloads'
file_name = 'small_articles.json'

In [8]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True
    
def process_raw_data(data_path, file_name):
    start_time = time.time()
    with open(os.path.join(data_path, file_name), 'r') as input:
        counter = 0
        title_df = pd.DataFrame(columns=['title', 'pageView'])
        for line in input:
            json_doc = json.loads(line)
            #publisher_id = json_doc['pv_publisherId']
            expected_keys = ['pv_title', 'pv_url', 'pv_pageViews']
            if not all([key in json_doc.keys() for key in expected_keys]):
                continue
            title, url, pageView = json_doc['pv_title'], json_doc['pv_url'], json_doc['pv_pageViews']
            if not isEnglish(title):
                continue
            title_df.loc[url] = pd.Series({'title' : title, 'pageView' : pageView})
    title_df.index.name = 'url'
    print 'finished processing all the data using {:.2f} seconds'.format(time.time() - start_time) 
    return title_df

def basic_tokenizer(line, normalize_digits=True):
    line = line.replace("'s", '')
    line = re.sub(r"\'ve", " have ", line)
    line = re.sub(r"can't", "cannot ", line)
    line = re.sub(r"n't", " not ", line)
    line = re.sub(r"I'm", "I am", line)
    line = re.sub(r" m ", " am ", line)
    line = re.sub(r"\'re", " are ", line)
    line = re.sub(r"\'d", " would ", line)
    line = re.sub(r"\'ll", " will ", line)
    line = re.sub(r"\?", " ? ", line)
    line = re.sub(r"!", " ! ", line)
    line = re.sub(r":", " : ", line)
    
    line = re.sub('[,."#%\'()*+/;<=>@\[\]^_{|}~`’”“′‘\\\]', ' ', line)
    line = re.sub('[\n\t ]+', ' ', line)
    words = []
    #_WORD_SPLIT = re.compile(b"([.,!?\"'-<>:;)(])")
    _DIGIT_RE = re.compile(r"\d")
    for token in line.strip().lower().split():
        if not token:
            continue
        if normalize_digits:
            token = re.sub(_DIGIT_RE, b'#', token)
        words.append(token)
    return len(words), ' '.join(words)

_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

'''
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
'''

TOKEN_DICT = {}
REVERSE_TOKEN_DICT = {}
for i in xrange(len(_START_VOCAB)):
    TOKEN_DICT[_START_VOCAB[i]] = i
    REVERSE_TOKEN_DICT[i] = _START_VOCAB[i]


def create_vocab_dict(data, column_name, token_freq_threshold=5, UKN_frac_threshold=0.3):
    vocab_dict = {}
    all_titles = []
    selected_titles = []
    selected_title_urls = []
    selected_title_pageView = []
    bad_titles = []
    for title, url, pageView in zip(data[column_name], data.index, data['pageView']):
        words = []
        for token in title.split(' '):
            words.append(token)
            if token not in vocab_dict:
                vocab_dict[token] = 0
            vocab_dict[token] += 1
        all_titles.append((words, url, pageView))
    print 'total {} tokens are ideintified...'.format(len(vocab_dict))

    token_dict, reverse_token_dict = TOKEN_DICT.copy(), REVERSE_TOKEN_DICT.copy()
    UKN_index = len(token_dict) - 1
    unique_counts = 0
    sorted_pairs = sorted(vocab_dict.items(), key=lambda x : x[1], reverse=True)
    for i, pair in enumerate(sorted_pairs):
        if pair[1] >= token_freq_threshold:
            unique_counts += 1
            token_dict[pair[0]] = i + 1 + UKN_index
            reverse_token_dict[(i + 1 + UKN_index)] = pair[0]
        else:
            token_dict[pair[0]] = UKN_index
    print 'total {} unique tokens are included in the token dictionary...'.format(unique_counts)            
    
    for i in xrange(len(all_titles)):
        indexed_title = map(token_dict.get, all_titles[i][0])
        UKN_count = sum([elem == UKN_index for elem in indexed_title])
        if (1.*UKN_count/len(indexed_title)) < UKN_frac_threshold:
            selected_titles.append(indexed_title)
            selected_title_urls.append(all_titles[i][1])
            selected_title_pageView.append(all_titles[i][2])
        else:
            bad_titles.append((indexed_title, all_titles[i][1], all_titles[i][2]))
    print 'total {} titles are included...'.format(len(selected_titles))            
    return token_dict, reverse_token_dict, selected_titles, selected_title_urls, selected_title_pageView, bad_titles

In [4]:
title_df = process_raw_data(data_path, file_name)

print title_df.shape
title_df.head()

finished processing all the data using 119.83 seconds
(20913, 2)


Unnamed: 0_level_0,title,pageView
url,Unnamed: 1_level_1,Unnamed: 2_level_1
http://www.nydailynews.com/news/national/transgender-navy-seal-slams-trump-banning-servicemembers-article-1.3358836,Transgender Navy SEAL slams Trump for banning ...,410271
http://www.nydailynews.com/new-york/delta-pilot-flight-attendant-fight-delays-takeoff-laguardia-article-1.3352751,"Delta pilot, flight attendant fight delays tak...",234898
http://www.nydailynews.com/news/national/fbi-suspect-custody-cruise-ship-death-article-1.3361442,FBI has suspect in custody in cruise ship death,151344
http://www.nydailynews.com/life-style/u-s-navy-aircraft-carrier-sets-sail-no-urinals-article-1.3352063,New U.S. Navy aircraft carrier sets sail with ...,24271
https://www.cbssports.com/mlb/news/seven-bold-predictions-for-the-2017-mlb-trade-deadline/,Seven bold predictions for the 2017 MLB trade ...,100887


In [5]:
file_name = '/Users/matt.meng/Downloads/title_data.csv'
title_df.to_csv(file_name, index=True)
data = pd.read_csv(file_name, index_col='url')

In [6]:
print data.shape
data.head()

(20913, 2)


Unnamed: 0_level_0,title,pageView
url,Unnamed: 1_level_1,Unnamed: 2_level_1
http://www.nydailynews.com/news/national/transgender-navy-seal-slams-trump-banning-servicemembers-article-1.3358836,Transgender Navy SEAL slams Trump for banning ...,410271
http://www.nydailynews.com/new-york/delta-pilot-flight-attendant-fight-delays-takeoff-laguardia-article-1.3352751,"Delta pilot, flight attendant fight delays tak...",234898
http://www.nydailynews.com/news/national/fbi-suspect-custody-cruise-ship-death-article-1.3361442,FBI has suspect in custody in cruise ship death,151344
http://www.nydailynews.com/life-style/u-s-navy-aircraft-carrier-sets-sail-no-urinals-article-1.3352063,New U.S. Navy aircraft carrier sets sail with ...,24271
https://www.cbssports.com/mlb/news/seven-bold-predictions-for-the-2017-mlb-trade-deadline/,Seven bold predictions for the 2017 MLB trade ...,100887


In [7]:
# tokenize the titles
data['title_word_counts'], data['processed_title'] = zip(*data['title'].map(basic_tokenizer))
# sort by the title word counts and filter them
sorted_data = data.sort_values(by=['title_word_counts', 'pageView'], ascending=[True, False])
index = (sorted_data['title_word_counts'] >= 4) & (sorted_data['title_word_counts'] <= 15)
filtered_data = sorted_data.loc[index, :]

In [9]:
token_dict, reverse_token_dict, titles, selected_title_urls, selected_title_pageView, bad_titles = create_vocab_dict(filtered_data, 'processed_title')

total 22594 tokens are ideintified...
total 5595 unique tokens are included in the token dictionary...
total 17185 titles are included...


In [11]:
print len(titles), len(selected_title_urls), len(selected_title_pageView)

17185 17185 17185


In [12]:
#bad_titles

In [15]:
pickle_file = 'processed_title_data.pkl'

content = {'url' : selected_title_urls, 
           'titles' : titles, 
           'pageViw' : selected_title_pageView,
           'token_dict' : token_dict,
           'reverse_token_dict' : reverse_token_dict}

with open(pickle_file, 'wb') as handle:
    cPickle.dump(content, handle, protocol=cPickle.HIGHEST_PROTOCOL)