In [1]:
import scipy
import pandas as pd
import numpy as np
import math
import pymongo
import MySQLdb as sql
import _mysql
import random
import csv
import time
import re
import matplotlib.pyplot as plt; import matplotlib.pylab as pylab
# Plot configuration for the notebook. The inline-backend magic is left disabled.
#%matplotlib inline
# NOTE(review): `display.mpl_style` is not available in newer pandas releases —
# confirm the installed pandas version still accepts this option.
pd.options.display.mpl_style = 'default'
# Default size applied to every figure (width, height).
pylab.rcParams['figure.figsize'] = 12, 6
from dateutil import parser
import Quandl
from pymongo import MongoClient
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import urllib2

## Get A BeautifulSoup Object

In [2]:
class BloombergSearch:
    """Scrape Bloomberg search results and the articles they link to.

    The search URL is built once from ``search_term``. The term is appended
    as-is (it is not URL-quoted), so callers must pass a term that is already
    safe to embed in a query string.
    """

    # Column order of the DataFrame produced by make_info().
    _COLUMNS = ['title', 'url', 'author', 'date', 'text']

    def __init__(self, search_term):
        """Remember the search term and build the first-page search URL."""
        self.search_term = search_term
        self.url_page1 = ('http://www.bloomberg.com/search?query=' + str(self.search_term))

    def get_search_soup(self):
        """Return the parsed first page of search results."""
        return self.get_soup(self.url_page1)

    def get_soup(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup object.

        Network errors raised by ``urllib2.urlopen`` propagate to the caller.
        """
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()
        # No parser is named here, so bs4 chooses one of the installed parsers.
        return BeautifulSoup(page)

    def get_search_page_links(self, num_pages):
        """Collect article URLs from result pages 1..``num_pages``.

        Every ``<h1>`` whose first anchor carries an ``href`` counts as a
        result. Links containing ``'video'`` are skipped; links without
        ``'http'`` in them are prefixed with the Bloomberg host.

        Returns a list of URL strings (empty when nothing matched).
        """
        article_list = []
        for page_number in range(1, num_pages + 1):
            page_soup = self.get_soup(self.url_page1 + '&page=' + str(page_number))
            for result in page_soup.find_all('h1'):
                try:
                    href = result.a['href']
                except (TypeError, KeyError):
                    # Heading without an anchor (``result.a`` is None) or an
                    # anchor without an href: not a search result.
                    continue
                if 'video' in href:
                    continue
                if 'http' in href:
                    article_list.append(href)
                else:
                    article_list.append('http://www.bloomberg.com/' + href)
        return article_list

    def get_post_body(self, article_soup):
        """Return the article text, or None when no paragraph was found.

        Each paragraph inside a ``div.article-body__content`` is UTF-8 encoded
        and preceded by a blank line.
        """
        paragraphs = []
        for container in article_soup.find_all('div', class_="article-body__content"):
            for paragraph in container.find_all('p'):
                paragraphs.append('\n\n' + str(paragraph.text.encode('utf-8')))
        if not paragraphs:
            return None
        return ''.join(paragraphs)

    def get_post_date(self, article_soup):
        """Return the ``datetime`` attribute of ``time.published-at``, or None."""
        result = article_soup.find('time', class_="published-at")
        try:
            return result['datetime']
        except (TypeError, KeyError):
            # Tag not present (None) or present without a datetime attribute.
            return None

    def get_post_author(self, article_soup):
        """Return the stripped text of the first ``a.author-link``, or None."""
        return self._stripped_text(article_soup.find('a', class_="author-link"))

    def get_post_title(self, article_soup):
        """Return the stripped text of the page ``<title>``, or None."""
        return self._stripped_text(article_soup.find('title'))

    def _stripped_text(self, tag):
        """Return ``tag.text`` without surrounding whitespace; None for a missing tag."""
        if tag is None:
            return None
        return tag.text.strip()

    def make_info(self, pages=1):
        """Scrape every article linked from the first ``pages`` result pages.

        Returns a DataFrame with the columns ``title``, ``url``, ``author``,
        ``date`` and ``text``, one row per article. When the search yields no
        links the frame is empty but still has those columns.
        """
        records = []
        for url in self.get_search_page_links(pages):
            article_soup = self.get_soup(url)
            records.append([
                self.get_post_title(article_soup),
                url,
                self.get_post_author(article_soup),
                self.get_post_date(article_soup),
                self.get_post_body(article_soup),
            ])
        # Build the frame once instead of appending row by row.
        return pd.DataFrame(records, columns=self._COLUMNS)
    


In [3]:
class ReutersSearch:
    """Scrape Reuters search results and the articles they link to.

    The search URL is built once from ``search_term``. The term is appended
    as-is (it is not URL-quoted), so callers must pass a term that is already
    safe to embed in a query string.
    """

    # Column order of the DataFrame produced by make_info().
    _COLUMNS = ['title', 'url', 'author', 'date', 'text']

    # Splits a byline on the standalone word "by", in any letter case, so that
    # "By Abby Smith" yields "Abby Smith" rather than being cut inside "Abby".
    _BYLINE_SPLIT = re.compile(r'\bby\b', re.IGNORECASE)

    def __init__(self, search_term):
        """Remember the search term and build the search URL."""
        self.search_term = search_term
        self.url_page1 = ('http://www.reuters.com/search/news?blob=' + str(self.search_term))

    def get_search_soup(self):
        """Return the parsed first page of search results."""
        return self.get_soup(self.url_page1)

    def get_soup(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup object.

        Network errors raised by ``urllib2.urlopen`` propagate to the caller.
        """
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()
        # No parser is named here, so bs4 chooses one of the installed parsers.
        return BeautifulSoup(page)

    def get_search_page_links(self, num_pages=1):
        """Collect article URLs from the first page of search results.

        ``num_pages`` is accepted for signature parity with the other scrapers
        but is not used: the results page has no next-page link, only a
        'LOAD MORE RESULTS' element, so only the first page is read.

        Links containing ``'video'`` are skipped; links without ``'http'`` in
        them are prefixed with the Reuters host.

        Returns a list of URL strings (empty when nothing matched).
        """
        article_list = []
        page_soup = self.get_soup(self.url_page1)
        for result in page_soup.find_all('h3', class_='search-result-title'):
            try:
                href = result.a['href']
            except (TypeError, KeyError):
                # Heading without an anchor (``result.a`` is None) or an
                # anchor without an href: not a search result.
                continue
            if 'video' in href:
                continue
            if 'http' in href:
                article_list.append(href)
            else:
                article_list.append('http://www.reuters.com/' + href)
        return article_list

    def get_post_body(self, article_soup):
        """Return the article text, or None when no paragraph was found.

        Each paragraph inside ``div#storytext`` is UTF-8 encoded and preceded
        by a blank line.
        """
        paragraphs = []
        # The keyword must be ``id``: only ``class_`` has a trailing-underscore
        # alias, and ``id_`` would filter on an attribute literally named "id_".
        for container in article_soup.find_all('div', id="storytext"):
            for paragraph in container.find_all('p'):
                paragraphs.append('\n\n' + str(paragraph.text.encode('utf-8')))
        if not paragraphs:
            return None
        return ''.join(paragraphs)

    def get_post_date(self, article_soup):
        """Return the text of the date-stamp span, or None when it is absent."""
        # NOTE(review): 'cnnDateStamp' looks like a selector carried over from
        # a CNN scraper — confirm the class name Reuters actually uses.
        result = article_soup.find('span', class_="cnnDateStamp")
        if result is None:
            return None
        return result.text

    def get_post_author(self, article_soup):
        """Return the author named after "by" in ``span.byline``, or None.

        None is returned when the span is missing or holds no standalone "by".
        """
        byline = article_soup.find('span', class_='byline')
        if byline is None:
            return None
        parts = self._BYLINE_SPLIT.split(byline.text)
        if len(parts) < 2:
            return None
        return parts[1].strip()

    def get_post_title(self, article_soup):
        """Return the stripped text of ``h1.article_title``, or None."""
        result = article_soup.find('h1', class_='article_title')
        if result is None:
            return None
        return result.text.strip()

    def make_info(self, pages=1):
        """Scrape every article linked from the search results.

        Returns a DataFrame with the columns ``title``, ``url``, ``author``,
        ``date`` and ``text``, one row per article. When the search yields no
        links the frame is empty but still has those columns.
        """
        records = []
        for url in self.get_search_page_links(pages):
            article_soup = self.get_soup(url)
            records.append([
                self.get_post_title(article_soup),
                url,
                self.get_post_author(article_soup),
                self.get_post_date(article_soup),
                self.get_post_body(article_soup),
            ])
        # Build the frame once instead of appending row by row.
        return pd.DataFrame(records, columns=self._COLUMNS)

In [4]:
# Build a Reuters scraper for the search term 'apple'.
# NOTE(review): despite the name `CNN`, this variable holds a ReutersSearch instance.
CNN = ReutersSearch('apple')

In [247]:
# Collect the article links from the search results. ReutersSearch only reads
# the first results page, so the page-count argument has no effect here.
temp = CNN.get_search_page_links(1)

In [248]:
# Display the collected article links.
temp

['http://blogs.reuters.com/alison-frankel/2016/02/23/in-unsealed-filing-doj-claims-apple-misleading-on-decryption-opposition/',
 'http://www.reuters.com/article/idUSL2N1630LS',
 'http://www.reuters.com/article/idUSL2N163090',
 'http://www.reuters.com/article/idUSKCN0VX159',
 'http://www.reuters.com/article/idUSL2N1622IH',
 'http://www.reuters.com/article/idUSL8N16252Z',
 'http://www.reuters.com/article/idUSKCN0VW0BM',
 'http://www.reuters.com/article/idUSL3N162580',
 'http://www.reuters.com/article/idUSL2N1620IJ',
 'http://www.reuters.com/article/idUSL3N1613U4']