In [1]:
import requests
import requests_ftp
import requests_cache
import lxml
import itertools
import pandas as pd
import re
import numpy as np
import string
from bs4 import BeautifulSoup
from collections import Counter
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from scipy.misc import imread
# Use matplotlib's 'ggplot' style for the figures drawn after this call.
plt.style.use('ggplot')
# Install a requests_cache cache named '../bloomberg': requests made through
# `requests` afterwards are cached, so re-running the scraping cells can
# reuse earlier responses instead of downloading the pages again.
requests_cache.install_cache('../bloomberg')

In [71]:
def motley_page_links(page):
    """
    Given a page number of the fool.com search results for "apple",
    it returns every link found in the results list of that page.

    Input: a page number
    Output: a list with the href of each link on the given page
    (an empty list when the page has no results list)
    Raises: requests.HTTPError if the search page request fails
    """
    response = requests.get(
        'https://www.fool.com/search/solr.aspx?page={}&q=apple&sort=date&source=isesitbut0000001'.format(page))
    response.raise_for_status()
    parsed_html = BeautifulSoup(response.text, 'lxml')

    result_lists = parsed_html.find_all(name = 'dl',
                                        attrs = {'class' : 'results'})
    # A page without a <dl class="results"> element has nothing to
    # collect; report "no links" instead of failing with an IndexError.
    if not result_lists:
        return []

    # Only the first results list is read, as before.
    return [link['href'] for link in result_lists[0].find_all('a', href = True)]

In [72]:
def motley_all_links(no_pages = 1):
    """
    Collects the links of the search result pages 1 to "no_pages",
    in page order.

    Input: number of pages (default = 1)
    Output: a list with links from the pages
    """
    last_page = no_pages + 1
    return [link
            for page_number in range(1, last_page)
            for link in motley_page_links(page_number)]

In [73]:
def motley_article_info(url):
    """
    Given an article url, it returns title, date, content
    and url of that article.

    The date is the stripped text of the first "publication-date" div,
    the title is the text of the first h1, and the content is the text
    of every paragraph in the first "full_article" div joined by spaces.

    Input: article url
    Output: a dictionary with 'title', 'date',
    'article', and 'url' as keys.
    """
    page = requests.get(url)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')

    body_blocks = soup.find_all(name = 'div', attrs = {'class' : 'full_article'})
    date_blocks = soup.find_all(name = 'div', attrs = {'class' : 'publication-date'})
    headings = soup.find_all('h1')

    published = date_blocks[0].text.strip()
    headline = headings[0].text
    paragraphs = body_blocks[0].find_all('p')

    return {'title'   : headline,
            'date'    : published,
            'article' : ' '.join(paragraph.text for paragraph in paragraphs),
            'url'     : url}

In [77]:
def motley_df(no_pages):
    """
    Creates a DataFrame with one row per article linked from the
    search result pages 1 to "no_pages".

    Links whose article cannot be downloaded or parsed are skipped
    with a warning. Each remaining row is indexed by the position of
    its link in the full link list, so skipped links leave gaps.

    Input: number of pages
    Output: DataFrame with 4 columns: article, date,
    title, and url.
    """
    import warnings  # local import: only used to report skipped links

    columns = ['article', 'date', 'title', 'url']

    #get all links in the specified number of pages
    links = motley_all_links(no_pages)

    #collect one record per article and build the dataframe once,
    #instead of appending row by row
    records, positions = [], []
    for i, link in enumerate(links):
        try:
            record = motley_article_info(link)
        except Exception as error:
            # Best effort: one bad article must not abort the crawl, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            # %r keeps the message safe for non-ascii urls on Python 2.
            warnings.warn('skipping %r: %r' % (link, error))
            continue
        records.append(record)
        positions.append(i)

    if not records:
        return pd.DataFrame(columns = columns)
    return pd.DataFrame(records, index = positions, columns = columns)

In [79]:
#df = motley_df(28)

In [82]:
#convert_to_csv(df, "motleyfool.csv")

check current directory for motleyfool.csv


In [149]:
def get_page_links(page):
    """
    Given a page number, it returns all article links.

    Input: a page number
    Output: a list with links on the given page
    Raises: requests.HTTPError if the search page request fails
    """
    response = requests.get(
        'https://www.bloomberg.com/search?query=apple&endTime=2017-06-02T14:25:01.383Z&page={}'.format(page))
    response.raise_for_status()
    parsed_html = BeautifulSoup(response.text, 'lxml')
    stories = parsed_html.find_all(name = 'article',
                                   attrs = {'class' : 'search-result-story type-article'})
    links = []
    for story in stories:
        anchors = story.find_all('a', href = True)
        # The second anchor of a story is used as its link; stories with
        # fewer than two anchors are skipped explicitly rather than by
        # swallowing every exception.
        if len(anchors) > 1:
            links.append(anchors[1]['href'])

    return links

In [46]:
def get_all_links(no_pages = 1):
    """
    Gathers the article links of pages 1 to "no_pages" into a
    single list, keeping the page order.

    Input: number of pages (default = 1)
    Output: a list with links from the pages
    """
    page_numbers = range(1, (no_pages + 1))
    links_per_page = (get_page_links(number) for number in page_numbers)
    return list(itertools.chain.from_iterable(links_per_page))

In [137]:
def get_article_info(url):
    """
    Given an article url, it returns title, date, content
    and url of that article.

    Input: article url
    Output: a dictionary with 'title', 'date',
    'article', and 'url' as keys.
    """
    page = requests.get(url)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')

    # Everything is read from the first "transporter-item current" div.
    current_item = soup.find_all(name = 'div',
                                 attrs = {'class' : 'transporter-item current'})[0]

    body_copy = current_item.find_all(name = 'div', attrs = {'class' : 'body-copy'})[0]
    paragraph_texts = [paragraph.text for paragraph in body_copy.find_all(name = 'p')]

    headline = current_item.find_all(name = 'h1', attrs = {'class' : 'lede-text-only__hed'})[0]
    timestamp = current_item.find_all(name = 'time', attrs = {'class' : 'article-timestamp'})[0]

    # Only the last 30 characters of the timestamp text are kept.
    return {'title'   : headline.text,
            'date'    : timestamp.text[-30:],
            'article' : ' '.join(paragraph_texts),
            'url'     : url}

In [138]:
def create_df(no_pages):
    """
    Creates a DataFrame with one row per article linked from the
    search result pages 1 to "no_pages".

    Links whose article cannot be downloaded or parsed are skipped
    with a warning. Each remaining row is indexed by the position of
    its link in the full link list, so skipped links leave gaps.

    Input: number of pages
    Output: DataFrame with 4 columns: article, date,
    title, and url.
    """
    import warnings  # local import: only used to report skipped links

    columns = ['article', 'date', 'title', 'url']

    #get all links in the specified number of pages
    links = get_all_links(no_pages)

    #collect one record per article and build the dataframe once,
    #instead of appending row by row
    records, positions = [], []
    for i, link in enumerate(links):
        try:
            record = get_article_info(link)
        except Exception as error:
            # Best effort: one bad article must not abort the crawl, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            # %r keeps the message safe for non-ascii urls on Python 2.
            warnings.warn('skipping %r: %r' % (link, error))
            continue
        records.append(record)
        positions.append(i)

    if not records:
        return pd.DataFrame(columns = columns)
    return pd.DataFrame(records, index = positions, columns = columns)

In [81]:
def convert_to_csv(df, name):
    """
    Writes "df" to the csv file "name" (utf-8 encoded, without the
    index column) and prints a reminder of where it was written.

    Input: a DataFrame, the name/path of the csv file
    Output: None
    """
    df.to_csv(name, index=False, encoding='utf-8')
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('check current directory for {}'.format(name))

In [150]:
#df = create_df(33)

In [151]:
#convert_to_csv(df, "bloomberg.csv")

check current directory for "bloomberg.csv"


In [155]:
# NOTE: `df` is only created by the commented-out `df = create_df(33)` /
# `df = motley_df(28)` cells; one of them has to be run before this cell.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(df['date'].values)

[u'   \nJune 1, 2017, 8:24 PM EDT\n' u'   \nJune 1, 2017, 6:00 AM EDT\n'
 u'   \nMay 31, 2017, 3:21 PM EDT\n' u'  \nJune 1, 2017, 10:00 AM EDT\n'
 u'   \nJune 1, 2017, 6:00 PM EDT\n' u'   \nJune 1, 2017, 4:02 PM EDT\n'
 u'   \nJune 1, 2017, 5:55 AM EDT\n' u'   \nJune 1, 2017, 6:00 AM EDT\n'
 u'   \nMay 31, 2017, 6:00 AM EDT\n' u'   \nJune 1, 2017, 6:02 AM EDT\n'
 u'  \nMay 31, 2017, 12:04 PM EDT\n' u'   \nJune 1, 2017, 7:44 PM EDT\n'
 u'   \nMay 31, 2017, 6:00 AM EDT\n' u'   \nJune 1, 2017, 4:41 PM EDT\n'
 u'   \nMay 31, 2017, 4:20 PM EDT\n' u'   \nJune 1, 2017, 4:14 AM EDT\n'
 u'   \nJune 2, 2017, 9:27 AM EDT\n' u'   \nJune 1, 2017, 2:32 PM EDT\n'
 u'   \nJune 1, 2017, 3:50 PM EDT\n' u'   \nMay 31, 2017, 7:47 AM EDT\n'
 u'  \nJune 1, 2017, 12:01 AM EDT\n' u'   \nJune 1, 2017, 4:30 PM EDT\n'
 u'   \nJune 1, 2017, 7:12 AM EDT\n' u'  \nMay 30, 2017, 12:13 PM EDT\n'
 u'   \nMay 30, 2017, 8:24 PM EDT\n' u'   \nMay 31, 2017, 9:02 AM EDT\n'
 u'   \nMay 31, 2017, 1:58 PM EDT\n' u'   \nMay 31,