#Sentence extractor for New York Times

In [101]:
import math
import os
from time import sleep

import numpy as np
import pandas as pd
import requests
import ujson as json
from joblib import Parallel, delayed

##Search terms

In [46]:
# Read one search term per line. The context manager closes the file, and a
# real list (instead of a lazy map object) can be iterated more than once.
with open('search_terms.txt') as st_file:
    search_terms = [line.strip() for line in st_file]

##NYTimes API keys

In [87]:
# One API key for each of the cores
# NOTE(review): these credentials are committed in plain text. Consider loading
# them from the environment or an untracked file, and rotating any live key.
api_keys = [
    "3439a9084efa80c4f5fb1d290dfc1b44:11:70233981", # my api key
    "a5c709f3168b829711241b243457e9d6:13:70235641",
    "ba47374fd391c9bc5fd3ca51ff953a44:14:70229228",
    "4557e02788189abb3642a33bca7469ff:11:69136863",
    "87d7b22c0feec4f3112d80b71d0b500a:1:69642501",
    "2b3d39fd4c7836168a2a370c25ad6232:16:70235576",
    "d7655429355ab2df4621a10c01d04865:8:69135199",
#     "730e30f5220059551e666430644fbf87:11:69642501",  # Inactive
    "1944df13b86dd83e4a8c4ea82e767975:2:65092848",
]

In [96]:
def next_multiple(n, m):
    """Return the smallest multiple of *n* that is greater than or equal to *m*.

    Example: ``next_multiple(4, 17)`` is ``20``.
    """
    remainder = m % n
    if remainder:
        # Not aligned yet: step forward by what is missing to the next multiple.
        return m + n - remainder
    return m

def chunks(l, n_chunks):
    """Yield consecutive slices of *l*, each at most ceil(len(l) / n_chunks) long.

    Fewer than *n_chunks* slices may be produced (9 items split 8 ways gives
    5 slices of size 2), and an empty *l* yields nothing.

    Raises:
        ValueError: if *n_chunks* is smaller than 1 (raised on first iteration,
            because this is a generator).
    """
    if n_chunks < 1:
        raise ValueError('n_chunks must be a positive integer')

    size = len(l)
    if size == 0:
        # A zero step would make range() raise, so stop before computing it.
        return

    # Ceiling division: slice length needed to cover *l* in n_chunks pieces.
    n = -(-size // n_chunks)
    for i in range(0, size, n):
        yield l[i:i + n]

In [88]:
# Give every API key an entry up front: chunks() may yield fewer slices than
# there are keys, and the downloader looks this mapping up for every key.
search_terms_by_api_key = {key: [] for key in api_keys}
search_terms_by_api_key.update(
    zip(api_keys, chunks(list(search_terms), len(api_keys)))
)

##Downloader

In [None]:
def format_query(term):
    """Return the query string to send for *term*.

    Currently a pass-through: the term is returned unchanged.
    """
    query = term
    return query

In [None]:
def process_response(response):
    """Return the processed form of *response*.

    Currently a pass-through: the response is returned unchanged.
    """
    processed = response
    return processed

In [None]:
def search(q, start_date, end_date, sort, page, api_key):
    """Run one Article Search request and return the decoded JSON body.

    Args:
        q: query string.
        start_date: first publication date to include, sent as ``begin_date``.
        end_date: last publication date to include.
        sort: sort order understood by the API (this file uses 'oldest'
            and 'newest').
        page: zero-based result page.
        api_key: key used to authenticate the request.

    Returns:
        The parsed JSON response.
    """
    base_url = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
    payload = {
        'q': q,
        'begin_date': start_date,
        'end_date': end_date,
        'sort': sort,
        'page': page,
        # The request URLs built elsewhere in this file use the hyphenated
        # 'api-key' parameter, so send the key under that name.
        'api-key': api_key,
    }
    response = requests.get(base_url, params=payload)
    return response.json()

In [None]:
def get_documents(term, begin_date='19990101', end_date='20141231', sort='oldest', page=0, api_key='sample-key'):
    """Fetch the article records matching *term*, following the pagination.

    The first request reads the total hit count; the remaining pages are then
    requested one by one (iteratively, not recursively) and merged.

    Args:
        term: search term, passed through ``format_query``.
        begin_date: first publication date to include (YYYYMMDD).
        end_date: last publication date to include (YYYYMMDD).
        sort: sort order sent to the API.
        page: zero-based page to start from.
        api_key: key used to authenticate the requests.

    Returns:
        A list of record dicts without duplicated ``_id`` values. The list is
        empty when the API reports an error or no hits.
    """
    # TODO: return a DataFrame instead of a list.
    results_per_page = 10
    # Pages 0..100 can be requested, i.e. at most 1010 records per query.
    max_pages = 101

    q = format_query(term)
    response = search(q, begin_date, end_date, sort, page, api_key)

    if response.get('status') != 'OK':
        print('Error')
        return []

    total_results = response['response']['meta']['hits']
    if total_results == 0:
        print('No results found.')
        return []

    documents = list(response['response']['docs'])

    n_pages = -(-total_results // results_per_page)  # ceiling division
    if n_pages > max_pages:
        print('Warning: %d results found, only the first %d can be retrieved; '
              'narrow the date range to get the rest.'
              % (total_results, max_pages * results_per_page))
        n_pages = max_pages

    for next_page in range(page + 1, n_pages):
        sleep(0.1)  # Max. 10 requests per second
        page_response = search(q, begin_date, end_date, sort, next_page, api_key)
        if page_response.get('status') != 'OK':
            print('Error')
            break
        documents.extend(page_response['response']['docs'])

    # Remove possible duplicates, keeping the first occurrence of each id.
    seen_ids = set()
    unique_documents = []
    for doc in documents:
        doc_id = doc.get('_id')
        if doc_id is not None and doc_id in seen_ids:
            continue
        seen_ids.add(doc_id)
        unique_documents.append(doc)

    print('>>>', len(unique_documents), 'results found.')
    return unique_documents

In [None]:
def download_documents_for_term(term, begin_date='19990101', end_date='20141231', sort='oldest', page=0, api_key='sample-key'):
    """Download every reachable record for *term* and return them as a list.

    Args:
        term: search term, passed through ``format_query``.
        begin_date: first publication date to include (YYYYMMDD).
        end_date: last publication date to include (YYYYMMDD).
        sort: sort order sent to the API.
        page: zero-based page to start from.
        api_key: key used to authenticate the requests.

    Returns:
        The list of record dicts gathered from all requested pages. The list
        is empty when the API reports an error or no hits.
    """
    q = format_query(term)
    response = search(q, begin_date, end_date, sort, page, api_key)

    if response.get('status') != 'OK':
        print('Error')
        return []

    total_results = response['response']['meta']['hits']
    if total_results == 0:
        print('No results found.')
        return []

    docs = list(response['response']['docs'])

    # 10 records per page; pages 0..100 are the most that can be requested.
    n_pages = min((total_results + 9) // 10, 101)
    for next_page in range(page + 1, n_pages):
        next_response = search(q, begin_date, end_date, sort, next_page, api_key)
        if next_response.get('status') != 'OK':
            print('Error')
            break
        docs.extend(next_response['response']['docs'])

    print('>>>', len(docs), 'results found.')
    return docs

In [None]:
def download_documents(api_key, terms):
    """Download the documents for every term in *terms* using *api_key*."""
    for term in terms:
        # download_documents_for_term takes the term first. The key must go by
        # keyword, otherwise it would be bound to begin_date.
        download_documents_for_term(term, api_key=api_key)
In [None]:
def downloader(api_keys, search_terms_by_api_key):
    """Download the terms assigned to each API key, one parallel job per key.

    Args:
        api_keys: keys to use; each one is handled by its own job.
        search_terms_by_api_key: mapping from API key to its list of terms.
            Keys missing from the mapping are treated as having no terms.
    """
    if not api_keys:
        # Nothing to schedule.
        return

    Parallel(n_jobs=len(api_keys))(
        delayed(download_documents)(api_key, search_terms_by_api_key.get(api_key, []))
        for api_key in api_keys
    )

In [None]:
# Start the parallel download for all configured API keys.
downloader(api_keys, search_terms_by_api_key)

In [92]:
# Count the loaded search terms (the notebook output follows below).
len(list(search_terms))

421

In [35]:
# Ad-hoc request against the Article Search API to inspect the raw response.
from urllib.request import urlopen
URL='http://api.nytimes.com/svc/search/v2/articlesearch.json?q=new+startups&begin_date=20130101&end_date=20130201&sort=newest&api-key=sample-key'
# The response is left open here; its body is read by json.load(r) in the next cell.
r=urlopen(URL)

In [36]:
import ujson as json
# Parse the body of the response opened in the previous cell.
d=json.load(r)

In [38]:
# Top-level keys of the parsed response (the notebook output follows below).
d.keys()

dict_keys(['status', 'response', 'copyright'])

In [None]:
#########
#
# NY Times - API requests automated script
#
# CulturePlex lab.
#
# @versae, @mavillard
#
# This cell combines, in a single place, the download of the search results and the article text extraction.
#
#########

import io
import os
import pandas as pd
import ujson as json
from dateutil import parser
from newspaper import Article
from pandas.io.json import json_normalize
from time import sleep
from urllib.request import urlopen

# API Keys
# NOTE(review): a credential is hard-coded below; consider reading it from the
# environment or an untracked file instead.

# api_key = '730e30f5220059551e666430644fbf87:11:69642501'
# api_key = 'd7655429355ab2df4621a10c01d04865:8:69135199'
api_key = '3439a9084efa80c4f5fb1d290dfc1b44:11:70233981'

# Tag lists used for the requests: each inner list is one query, and its tags
# are joined with '+' to build the search string.

tag_lists = [['startup']]

# This variable is used to create the appropriate path for the articles of the files

# current_tags = ''

# Date range for the requests (YYYYMMDD)

begin_date = '20130101'
# end_date = '20131231'
end_date = '20130201'

# URL template for the requests and the paths where the files are saved

base_url = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
entries_url = base_url + '?q={q}&sort={sort}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}&page={page}'

data_path = os.path.join('data', 'nytimes_files')
json_path = os.path.join(data_path, 'json_original')
csv_path = os.path.join(data_path, 'csv_url')
txt_path = os.path.join(data_path, 'txt_article')

# Variables needed for the article extraction

txt_article_temp_data_path = ''

# Accumulates the rows of every downloaded page for the current tag list
data_total = pd.DataFrame()

# Columns kept from each API record, plus the flag column that is added to them
wanted_columns = ['_id', 'web_url']
new_column = 'url_works'

# Renaming applied to the kept columns
change_columns_name = {'_id': 'id', 'web_url': 'url'}
index_column = 'id'

# Column order of the CSV that is written for each tag list
ordered_columns = ['url_works', 'url']

##
# get_url_works(row, attempts=3)
#
# Function that checks if the article url works and, in that case, downloads
# the article and stores its text.
##

def get_url_works(row, attempts=3):
    """Download the article behind ``row['url']`` and report whether it worked.

    On success the article is written to ``<txt_path>/<tags>/<id>.txt`` as the
    url, a blank line, the title, a blank line and the body text.

    Args:
        row: record with ``url`` and ``id`` entries.
        attempts: how many times to try before giving up.

    Returns:
        A ``pd.Series`` holding 1 under ``new_column`` if a valid article body
        was stored, 0 otherwise.
    """
    works = 0
    url = row['url']

    # `tags` is the module-level loop variable of the main request loop.
    tag_path = os.path.join(txt_path, '_'.join(tags))
    if not os.path.exists(tag_path):
        os.makedirs(tag_path)

    if url.startswith(('http://www.nytimes.com/', 'https://www.nytimes.com/')):
        # Ask for the whole article on a single page.
        url += '?pagewanted=all'

    while attempts > 0:
        try:
            # The option is spelled `memoize_articles`; the previous spelling
            # `memorize_articles` matched no configuration attribute.
            a = Article(url, fetch_images=False, memoize_articles=False)
            a.download()
            a.parse()

            if a.is_valid_body():
                works = 1

                path = os.path.join(tag_path, row['id'] + '.txt')
                with io.open(path, 'w', encoding='utf-8') as outfile:
                    outfile.write(a.url)
                    outfile.write(u'\n\n')
                    outfile.write(a.title)
                    outfile.write(u'\n\n')
                    outfile.write(a.text)

            # Downloaded and parsed without errors: no retry needed.
            attempts = 0

        except Exception:
            # Best effort: any failure just consumes one attempt.
            attempts -= 1

    return pd.Series({new_column: works})

# We create the JSON, csv and txt article paths if they don't exist.

for output_path in (json_path, csv_path, txt_path):
    if not os.path.exists(output_path):
        os.makedirs(output_path)

# This is the main loop to request to the NY Times API for every tags in the tags list

for tags in tag_lists:

    # Initialization variables

    perform_requests = True
    collected_pages = []  # one DataFrame per downloaded page

    iteration = 0
    page = 0
    start_date = begin_date

    # We prepare the necessary paths. The raw JSON pages go under json_path;
    # the article texts are written under txt_path by get_url_works.

    tag_dir = '_'.join(tags)
    q = '+'.join(tags)
    full_data_path = os.path.join(json_path, tag_dir)
    print(full_data_path)
    if not os.path.exists(full_data_path):
        os.makedirs(full_data_path)

    # And now, we start with the requests

    while perform_requests:
        # 'oldest' ordering is required: when the pagination is exhausted the
        # date of the last record becomes the start date of the next window.
        request_url = entries_url.format(q=q, sort='oldest', begin_date=start_date,
                                         end_date=end_date, api_key=api_key, page=page)
        print(request_url)
        response = urlopen(request_url)
        try:
            # We load the response in the data variable. This variable contains the JSON result.
            data = json.load(response)
        finally:
            response.close()
        sleep(0.1)  # Max. 10 request per second

        if data['status'] != 'OK':
            print('\tFinishing with errors for tags: ' + q)
            print('\tPrinting the response...')
            print('\t\t' + str(data))
            break

        docs = data['response']['docs']
        if len(docs) == 0:
            print('\tFinishing, tags: ' + tag_dir)
            break

        # We need to control the page, because the pagination ends when reach to 100.
        # So, we need to change the begin date to start a new pagination from 0.

        page += 1
        if page > 100:
            last_date = parser.parse(docs[-1]['pub_date'])
            begin_date_aux = last_date.strftime('%Y%m%d')

            if begin_date_aux == start_date:
                # A whole pagination window falls on a single day: restarting
                # from the same date would request the same pages forever.
                print('\tStopping, too many results on ' + start_date + ' to paginate further')
                perform_requests = False

            start_date = begin_date_aux
            page = 0
            iteration += 1

        # We save the content in the JSON file.

        filename = tag_dir + '_' + str((iteration * 100) + page + iteration) + '.json'
        print('\tSaving file ' + filename)

        with open(os.path.join(full_data_path, filename), 'w') as outfile:
            json.dump(data, outfile)

        # Once the file is saved, we focus on the article extraction.

        data_aux = json_normalize(data['response'], 'docs')[wanted_columns]
        data_aux = data_aux.rename(columns=change_columns_name)
        data_aux[new_column] = 0

        url_works = data_aux.apply(get_url_works, axis=1)
        data_aux.update(url_works)

        collected_pages.append(data_aux)

    # And we finalize saving the csv files that links for every folder.

    if not collected_pages:
        print('\tNo articles collected for tags: ' + tag_dir)
        continue

    print('\tTreating the big dataFrame')
    data_total = pd.concat(collected_pages)

    print('\tShape before remove bad rows: ' + str(data_total.shape))
    # If doing this after saving to CSV, instead of that you need this: data.text.notnull()
    data_total = data_total[data_total.url != ''].copy()
    print('\tShape after remove bad rows: ' + str(data_total.shape))

    # Restarting a pagination window from the last seen date fetches some
    # records again, so duplicates are expected here.
    print('\tShape before remove duplicated rows: ' + str(data_total.shape))
    data_total.drop_duplicates(['id'], inplace=True)
    print('\tShape after remove duplicated rows: ' + str(data_total.shape))

    data_total.set_index('id', inplace=True)

    print('\tSaving to CSV')

    data_total = data_total.reindex(columns=ordered_columns)
    data_total.to_csv(os.path.join(csv_path, tag_dir + '.csv'), encoding='utf-8')

    data_total = None

print('Finished!')

In [None]:
#########
#
# NY Times - API requests automated script
#
# CulturePlex lab.
#
# @versae, @josemazo and @gabmunrio
#
# This cell contains all the logic to get the readability, diversity and sentiment from the texts.
#
#########

import datetime
import os

import numpy as np
import pandas as pd
import ujson as json

from pandas.io.json import json_normalize
from pattern.metrics import readability
from pattern.metrics import ttr as diversity
from pattern.en import sentiment

# Paths used to read the downloaded files and to save the results

json_data_path = 'data/files_new_york_times/json_original/'
article_data_path = 'data/files_new_york_times/txt_article/'

csv_data_path = 'data/files_new_york_times/'
text_data_path = 'data/files_new_york_times/data/'

# Names of the columns filled in by check_article_and_proceed

exist_article_column = 'exist_article'
article_text_column = 'text'
text_file_path_column = 'file'

# Directory names listed here are treated as topics (is_company = 0); any
# other directory name is treated as a company (is_company = 1).
no_company_topics = ['entrepreneur', 'startup', 'new_venture', 'manager', 'executive', 'founder']
media = 'nyt'

# State shared with check_article_and_proceed; the main loop reassigns these
# for every directory it processes.
is_company = 0
search_words = []
dir_name = ''

first = True
data = None

# Columns to reorder the different csv files
# NOTE: 'subjetivity' is the spelling used for the column in the CSV output.

sentiment_columns = ['readability', 'diversity', 'polarity', 'subjetivity']

change_columns_name = {'web_url': 'url'}

ordered_columns = ['media', 'search_words', 'is_company', 'pub_date',
                   'readability', 'diversity', 'polarity', 'subjetivity',
                   'text', 'url', 'file']

ordered_no_text_columns = ['media', 'search_words', 'is_company', 'pub_date',
                           'readability', 'diversity', 'polarity', 'subjetivity',
                           'url', 'file']

##
# check_article_and_proceed(row)
#
# Function that checks if the article exists and, if it does, proceeds to treat it.
# In the treatment, we calculate the values for readability, diversity and sentiment.
#
##

def check_article_and_proceed(row):
    """Compute the text metrics of the article that belongs to *row*.

    Looks for ``<article_data_path><dir_name>/<_id>.txt``. When the file
    exists, its body (everything after the first four lines) is saved under
    ``text_data_path`` and scored for readability, diversity and sentiment.

    Reads the module-level ``dir_name``, ``is_company``, ``search_words`` and
    ``media`` values set by the main loop.

    Args:
        row: record with ``_id`` and ``pub_date`` entries.

    Returns:
        A ``pd.Series`` with the existence flag, the article text, the
        relative path of the saved text file and the four metric values.
        Missing articles get an empty text, an empty path and zeros.
    """
    import io  # local import: this cell's import block does not provide io

    file_name = article_data_path + dir_name + '/' + row['_id'] + '.txt'
    exist = os.path.isfile(file_name)

    save_file_name = ''
    saved_path = ''
    text = ''

    read = 0
    diver = 0
    pol = 0
    sub = 0

    if exist:
        company_topic = 'company' if is_company else 'topic'

        dt = datetime.datetime.strptime(row['pub_date'], '%Y-%m-%dT%H:%M:%SZ')
        dt = dt.strftime('%Y%m%dT%H%M%S')

        save_file_name = '_'.join([media, company_topic, search_words[0], dt, row['_id']]) + '.txt'
        saved_path = 'data/' + save_file_name

        with io.open(file_name, 'r', encoding='utf-8') as file_in:
            file_lines = file_in.read().splitlines(True)

        # The first four lines are skipped; get_url_works writes the url, a
        # blank line, the title and a blank line before the body.
        text = ''.join(file_lines[4:])

        with io.open(text_data_path + save_file_name, 'w', encoding='utf-8') as file_out:
            file_out.write(text)

        # Each metric is best effort: a failure leaves its value at zero.
        try:
            read = readability(text)
        except Exception:
            read = 0

        try:
            diver = diversity(text)
        except Exception:
            diver = 0

        try:
            sent = sentiment(text)
        except Exception:
            sent = (0, 0)

        pol = sent[0]
        sub = sent[1]

    return pd.Series({
        exist_article_column: exist,
        article_text_column: text,
        text_file_path_column: saved_path,
        sentiment_columns[0]: read,
        sentiment_columns[1]: diver,
        sentiment_columns[2]: pol,
        sentiment_columns[3]: sub
    })

# We create the text_data_path if it does not exist.

if not os.path.exists(text_data_path):
    os.makedirs(text_data_path)

# Main loop to create the final csv file. We are going to have two csv files, one with the
# associated text of every row, and another without that text.

collected_frames = []  # one DataFrame per JSON file, concatenated once at the end

# next(..., default) gives an empty listing when the directory does not exist.
for dir_name in next(os.walk(json_data_path), (None, [], []))[1]:
    json_topic_path = json_data_path + dir_name + '/'

    # dir_name, is_company and search_words are read by check_article_and_proceed.
    is_company = 0 if dir_name in no_company_topics else 1
    search_words = dir_name.split('_')

    for file_name in next(os.walk(json_topic_path), (None, [], []))[2]:
        json_topic_file = json_topic_path + file_name

        with open(json_topic_file) as json_in:
            json_file = json.load(json_in)
        data_aux = json_normalize(json_file['response'], 'docs')

        if data_aux.empty:
            # No records in this file, nothing to score.
            continue

        data_aux.rename(columns=change_columns_name, inplace=True)

        data_aux[exist_article_column] = False
        data_aux[article_text_column] = ''
        data_aux[text_file_path_column] = ''
        for metric_column in sentiment_columns:
            data_aux[metric_column] = 0

        data_aux[ordered_columns[0]] = media
        data_aux[ordered_columns[1]] = ' '.join(search_words)
        data_aux[ordered_columns[2]] = is_company

        data_exist = data_aux[['_id', 'pub_date']].apply(check_article_and_proceed, axis=1)
        data_aux.update(data_exist)

        # Keep only the rows whose article file exists and has some text.
        # (`== True` is an element-wise comparison on the column.)
        has_article = (data_aux[exist_article_column] == True)
        has_text = (data_aux[article_text_column] != '')
        data_aux = data_aux[has_article & has_text]

        collected_frames.append(data_aux.reindex(columns=ordered_columns))

    print('Finished with ' + dir_name)

if collected_frames:
    data = pd.concat(collected_frames, ignore_index=True)

    data.to_csv(csv_data_path + 'new_york_times' + '.csv', index=False, encoding='utf-8')

    data[ordered_no_text_columns].to_csv(csv_data_path + 'new_york_times_no_text' + '.csv', index=False, encoding='utf-8')
else:
    print('No articles found under ' + json_data_path)

print('FINISHED!')

In [None]:
# Load the CSV without the article text to inspect it in the notebook.
pd.read_csv('data/files_new_york_times/new_york_times_no_text.csv', encoding='utf-8')

In [None]:
dataFrame = pd.read_csv('data/files_new_york_times/csv_url/startup.csv', encoding='utf-8')
# Number of rows whose 'url_works' flag equals 1.
count = int((dataFrame['url_works'] == 1).sum())
print(count)