In [101]:
import ast
import spacy


from bs4 import BeautifulSoup
import pickle
from multiprocessing import Pool, Manager
import time
import pandas as pd
import numpy as np
import re, sys

import csv

import tqdm # optional
import spacy


In [121]:

DATA_DIR = '/home/user/Dropbox'
input_articles_csv_path = DATA_DIR+'/Data/input_articles.tsv'

def token_rules(doc):
    for tok in doc:
        if (tok.is_stop == False) and (tok.is_punct == False) and (tok.pos_ != 'NUM')  and tok.text != ' ':
            tok.tag_ = u'exclude'
        elif (tok.pos_ == 'NUM' and tok.ent_type_ == 'DATE'):
            tok.text = u'date'

def custom_pipeline(nlp):
    return (nlp.tagger, token_rules, nlp.parser)


def load_data(path, test_frac=0.3, n_articles=None):
    sent_tokens_list = []
    nlp = spacy.load('en', create_pipeline=custom_pipeline)
    return pd.DataFrame(
        iter_file_sent_tokens(
            path=path,
            nlp=nlp,
            n_articles=n_articles,
        )
    )


def iter_file_sent_tokens(path, nlp, n_articles=None):
    """Iterate over the review file and yield sentence tokens.
    
    :param path: Path to the reviews file.
    :type path: str
    :param nlp: Spacy tokenizer with parsing.
    :type nlp: spacy.en.English
    :param n_reviews: Maximum number of reviews to iter over. If
        None we iter over all reviews.
    :type n_reviews: int or NoneType
    
    :returns: Iterator over a dict. See sample output.
    :rtype: iter(dict)
    
    Sample output::
    
        {'review_id': 'iamid', 'sent_num': 0, 'tokens': ['Every', 'villain', 'is', 'lemons']}
    """

    for article in \
        nlp.pipe(
            iter_article_text(path=path, n_articles=n_articles),
            batch_size=2500,
            n_threads=4,
        ):
        for sent_num, sent in enumerate(article.sents):
            yield {
                'sent_num': sent_num,
                'tokens': sent
            }


def iter_article_dict(path, n_articles=None):
    """Iterate over the review file loading the json data into a dict."""
    with open(path, "rt", encoding='utf-8') as infile:
        reader = csv.DictReader(infile, delimiter='\t')
        for i, line in enumerate(reader):
            if n_articles and n_articles == i:
                return
            yield line


def iter_article_text(path, n_articles=None):
    """Iterate over the review text in the review file."""
    for article in iter_article_dict(path=path, n_articles=n_articles):
        html_content = article['content']
#         summary = article['summary']
        title = [article['title']]

        soup = BeautifulSoup(html_content, 'html.parser')
        pis = soup.findAll('p')

        # turn p tags to list of strings
        content = []
        for match in pis:
            text = match.get_text()
            content.append(text)

        # combine all texts to one list of strings for parsing
        content = title + content
        text = '.'.join(content).replace('$','dollar ').replace('%',' percent')
        yield text
        
df = load_data(
    path=input_articles_csv_path,
    test_frac=0,
    n_articles=100,
)
print (len(df.index))
print (df.head(2))

2779
   sent_num                                             tokens
0         0  (Today, 's, Top, Idea, :, Long, Startek, (, SR...
1         1  (Long, SRT, (, Startek, ), by, Omar, A., Samal...


In [111]:
c=0
with open(input_articles_csv_path, "rt", encoding='utf-8') as infile:
    reader = csv.DictReader(infile, delimiter='\t')
    for row in reader:
        print('\r{}'.format(row))
        c += 1
        if c == 10:
            break

OrderedDict([('article_id', '4189694'), ('publisher_id', ',105512,'), ('category_id', ',17933,'), ('created_at_ts', '2018-07-25 05:00:00'), ('words_count', '182'), ('title', "Today's Top Idea: Long Startek (SRT)"), ('summary', 'NULL'), ('content', '<p><strong>PRO+ Top Idea</strong></p> <p><a href="http://seekingalpha.com/article/4189690-startek-aegis-merger-complete-indiscriminate-selling-due-russell-deletion-leaves-shares">Long SRT (Startek)</a> by Omar A. Samalot; market cap $271M</p> <ul> <li>Business process outsourcer Startek trades at a distressed valuation due to indiscriminate selling following removal from the Russell 2000; given the post closing market cap, rejoining the index next June is almost certain.</li> <li>With the merger complete, Startek/Aegis is a globally diversified powerhouse with &gt;$700M in revenue and a much lower risk profile.</li> <li>Strategic relationship with Amazon is an overlooked catalyst and could drive significant revenue growth over the next year

In [85]:
reader = pd.read_csv(input_articles_csv_path, encoding = 'utf-8', sep='\t', iterator=True, nrows=10, chunksize=1)

row = reader.get_chunk(1)
print(row['article_id'])

for row in reader:
    print(row['article_id'])

0    4189694
Name: article_id, dtype: int64
1    4190093
Name: article_id, dtype: int64
2    4190856
Name: article_id, dtype: int64
3    4191092
Name: article_id, dtype: int64
4    4191188
Name: article_id, dtype: int64
5    4191692
Name: article_id, dtype: int64
6    4192142
Name: article_id, dtype: int64
7    4192430
Name: article_id, dtype: int64
8    4192717
Name: article_id, dtype: int64
9    4193328
Name: article_id, dtype: int64


In [57]:
def token_rules(doc):
    for tok in doc:
        if (tok.is_stop == False) and (tok.is_punct == False) and (tok.pos_ != 'NUM')  and tok.text != ' ':
            tok.tag_ = u'exclude'
        elif (tok.pos_ == 'NUM' and tok.ent_type_ == 'DATE'):
            tok.text = u'date'

def custom_pipeline(nlp):
    return (nlp.tagger, token_rules, nlp.parser)

nlp = spacy.load('en', create_pipeline=custom_pipeline)

def parsePipeSents(doc):
    
    sentences = list()

    html_content = doc[0]
    summary = doc[1]
    title = [doc[2]]

    soup = BeautifulSoup(html_content, 'html.parser')
    pis = soup.findAll('p')

    # turn p tags to list of strings
    content = []
    for match in pis:
        text = match.get_text()
        content.append(text)

    # combine all texts to one list of strings for parsing
    content = title + summary + content
    text = '.'.join(content).replace('$','dollar ').replace('%',' percent')
    
    parsed = nlp.pipe(text, batch_size=100, n_threads=4)

    return ' '.join(parsed.sents)

def loadInputCSV(path):
    news_df = pd.read_csv(path, encoding = 'utf-8', sep='\t', nrows = 100)

    content = news_df['content'].apply(nan_to_str).tolist()
    summary = [ast.literal_eval(x) for x in news_df['summary'].apply(nan_to_list)]
    news_df['created_at_ts'] = pd.to_datetime(news_df['created_at_ts']).astype(np.int64) // 10 ** 9
    
    sentences = [parsePipeSents(doc) for doc in zip(content, summary, news_df.title.tolist())]

    print ("\nParesed {} sentences".format(len(sentences)))

    assert len(sentences) == len(news_df.index)
    #Concatenating all available text
    news_df['full_text'] = np.asarray(sentences)

    return news_df


In [57]:
def deserialize(filename):
    with open(filename, 'rb') as handle:
#     with tf.gfile.Open(filename, 'rb') as handle:
        return pickle.load(handle)

def parseSents(args):
    sentences = list()
    doc, q = args

    html_content = doc[0]
    summary = doc[1]
    title = [doc[2]]

    soup = BeautifulSoup(html_content, 'html.parser')
    pis = soup.findAll('p')

    # turn p tags to list of strings
    content = []
    for match in pis:
        text = match.get_text()
        content.append(text)

    # combine all texts to one list of strings for parsing
    content = title + summary + content
    text = '.'.join(content).replace('$','dollar ').replace('%',' percent')
    
    parsed = nlp(text)
    for sent in parsed.sents:
        current_sen = []
        for tok in sent:
            if (tok.is_stop == False) and (tok.is_punct == False) and (tok.pos_ != 'NUM')  and tok.text != ' ':
                current_sen.append(tok.lemma_.lower())
            elif (tok.pos_ == 'NUM' and tok.ent_type_ == 'DATE'):
                current_sen.append('date')

        sentences.append(current_sen)

    flat_sentences = [item for sublist in sentences for item in sublist]

    return ' '.join(flat_sentences)


def nan_to_list(value):
    return '[]' if type(value) == float else value

def nan_to_cat(value):
    return ',-1,' if type(value) == float else value

def nan_to_str(value):
    return '' if type(value) == float else value

def load_input_csv(path):
    news_df = pd.read_csv(path, encoding = 'utf-8', sep='\t', nrows = 100)

    content = news_df['content'].apply(nan_to_str).tolist()
    summary = [ast.literal_eval(x) for x in news_df['summary'].apply(nan_to_list)]
    news_df['created_at_ts'] = pd.to_datetime(news_df['created_at_ts']).astype(np.int64) // 10 ** 9

    t0 = time.time()
    p = Pool()
    m = Manager()
    q = m.Queue()

    args = [(i, q) for idx, i in enumerate(zip(content, summary, news_df.title.tolist()))]

    result = p.map_async(parseSents, args, chunksize=1)

    while not result.ready():
        remaining = result._number_left * result._chunksize
        t = time.time() - t0
        sys.stderr.write('\rRemaining: {0:} Elapsed: {1:7.3f}'.format(remaining,  t))
        sys.stderr.flush()
        time.sleep(1)

    sentences = result.get()

    print ("\nParesed {} sentences".format(len(sentences)))

    assert len(sentences) == len(news_df.index)
    #Concatenating all available text
    news_df['full_text'] = np.asarray(sentences)
    # news_df['full_text'] = (news_df['title'].apply(nan_to_str) + ". " + \
    #                         news_df['summary'].apply(nan_to_str) + ". " + \
    #                         news_df['content'].apply(nan_to_str)
    #                    ).apply(clean_and_filter_first_sentences)

    return news_df

In [74]:
DATA_DIR = '/home/user/Dropbox'
input_articles_csv_path = DATA_DIR+'/Data/input_articles.tsv'
input_word_embeddings_path = DATA_DIR+'/articles_word2vec/w2v_model'

df = load_input_csv(input_articles_csv_path)

Remaining: 5 Elapsed:  21.16647


Paresed 100 sentences


Process ForkPoolWorker-21:
Process ForkPoolWorker-19:
Process ForkPoolWorker-25:
Process ForkPoolWorker-24:
Process ForkPoolWorker-20:
Process ForkPoolWorker-26:
Traceback (most recent call last):
Process ForkPoolWorker-22:
Process ForkPoolWorker-23:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/user/anaconda3/envs/chameleon/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/user/anaconda3/envs/chameleon/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/user/anaconda3/envs/chameleon/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/user/anaconda3/envs/chameleon/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()


In [58]:
DATA_DIR = '/home/user/Dropbox'
input_articles_csv_path = DATA_DIR+'/Data/input_articles.tsv'
input_word_embeddings_path = DATA_DIR+'/articles_word2vec/w2v_model'

df = loadInputCSV(input_articles_csv_path)

AttributeError: 'generator' object has no attribute 'sents'

In [122]:
input_articles_csv_path = '/home/user/Dropbox/Data/processed_articles.pickle'

df = deserialize(input_articles_csv_path)

In [54]:
DATA_DIR = '/home/user/Dropbox'
input_articles_csv_path = DATA_DIR + '/pickles/acr_label_encoders.pickle'

acr_label_encoders = deserialize(input_articles_csv_path)

In [55]:
type(acr_label_encoders['publisher_id'])

sklearn.preprocessing.label.LabelEncoder

In [56]:
type(acr_label_encoders['category_id'])

sklearn.preprocessing.label.LabelEncoder

In [123]:
df.head()

Unnamed: 0,id_encoded,categoryid_encoded,publisherid_encoded,created_at_ts,text_length,text_int
0,0,476,32,1532494800,47,"[69, 1853, 3082, 631, 47, 14720, 20110, 1, 1, ..."
1,1,476,32,1532581200,49,"[69, 1853, 3082, 631, 47, 1222, 1, 6035, 1, 1,..."
2,2,490,32,1532667600,47,"[69, 1853, 3082, 631, 171, 572, 4042, 34059, 1..."
3,3,513,48,1532763000,3653,"[1977, 2528, 2744, 2152, 570, 1012, 1, 17339, ..."
4,4,513,32,1532849400,405,"[984, 640, 1091, 3845, 121, 6849, 11456, 51, 2..."


In [125]:
df.categoryid_encoded.describe()

count    6683.000000
mean      432.359270
std       116.092787
min         0.000000
25%       375.000000
50%       461.000000
75%       521.000000
max       540.000000
Name: categoryid_encoded, dtype: float64

In [124]:
df.categoryid_encoded.value_counts()

526    763
537    519
363    357
490    239
491    216
519    188
461    166
0      165
415    162
375    155
416    150
477    147
521    143
476    141
460    120
451    112
433    101
432     99
517     99
384     99
540     95
353     94
444     92
399     86
513     79
339     64
445     62
340     60
338     58
472     54
      ... 
187      1
259      1
191      1
501      1
527      1
195      1
482      1
7        1
443      1
167      1
389      1
35       1
31       1
365      1
283      1
279      1
377      1
381      1
275      1
393      1
267      1
397      1
27       1
23       1
409      1
19       1
15       1
271      1
425      1
171      1
Name: categoryid_encoded, Length: 541, dtype: int64

In [37]:
l = df.category_id.loc[1].split(',')
list(filter(None,l))

['17933']

In [40]:
def order_str(string):
    
    string = string.split(',')
    string = list(filter(None, string))
    string = sorted(string)
    return ','.join(string)

category = df.category_id.apply(nan_to_cat)
category = category.apply(order_str)

category.value_counts()

1936          19
17933         15
-1            12
17862         12
17994          9
17810          7
17896          7
17829          5
17913          5
17933,2227     2
17810,1936     1
2227           1
17862,2227     1
17994,1936     1
17862,1936     1
17913,1936     1
17896,1936     1
Name: category_id, dtype: int64