### Problem Statement

Build a Python script that fetches the headlines from https://www.hindustantimes.com/
and saves them to a CSV file.


In [1]:
"""
    Important modules
"""

from bs4 import BeautifulSoup # For html parsing
from requests import get # For retrieving web page
import pandas as pd # To save into csv

In [2]:
# Request headers sent with every page request: a browser-like User-Agent
# so the request resembles one made by a regular user
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    )
}

### 1. Check robots.txt
The site's robots.txt file specifies which parts of the site automated clients are allowed to fetch, so we check it before scraping.

In [3]:
def check_allowance(response):
    """
        Decides from a robots.txt response whether scraping the homepage is allowed.

        Goes line by line through the robots.txt text and looks for a group of rules
        that applies to every user agent ('User-agent: *') and contains 'Disallow: /'.
        That combination disallows everything, so it would be wrong to scrape the homepage.

        Inputs:
            - response: Any object with a .text attribute holding the robots.txt content
                        (e.g. the returned value of a requests.get() call)

        Output: False if a 'User-agent: *' group contains 'Disallow: /', True otherwise
                (including when the text is empty or has no such group)

        Note: only a blanket 'Disallow: /' is detected; path-specific rules and
        'Allow' overrides are not evaluated.
    """
    in_wildcard_group = False   # True while the current group applies to 'User-agent: *'
    previous_was_agent = False  # Consecutive User-agent lines belong to the same group

    # splitlines() handles both '\n' and '\r\n' line endings
    for raw_line in response.text.splitlines():
        # Drop trailing comments and surrounding whitespace
        line = raw_line.split('#', 1)[0].strip()
        if not line:
            continue

        field, _, value = line.partition(':')
        field = field.strip().lower()
        value = value.strip()

        if field == 'user-agent':
            if not previous_was_agent:
                # A User-agent line that follows rules starts a new group
                in_wildcard_group = False
            if value == '*':
                in_wildcard_group = True
            previous_was_agent = True
            continue

        previous_was_agent = False

        if in_wildcard_group and field == 'disallow' and value == '/':
            return False

    return True

In [4]:
# Homepage to scrape; the trailing slash matters because 'robots.txt' is appended to it directly
url = 'https://www.hindustantimes.com/'

In [5]:
# Fetch robots.txt; without a timeout requests waits indefinitely on an unresponsive server
robot_response = get(url + 'robots.txt', headers=headers, timeout=10)
print("Status for scraping " + url + " : " + str(check_allowance(robot_response)))

Status for scraping https://www.hindustantimes.com/ : True


### 2. Get the webpage

In [6]:
# Get the html code of homepage; without a timeout requests waits indefinitely
# on an unresponsive server
response = get(url, headers=headers, timeout=10)

# Check the response status (200 means the page was returned successfully)
print(response.status_code)

200


In [7]:
# Preview the beginning of the html code, but only when the request succeeded (HTTP 200)
preview_length = 1000
if response.status_code == 200:
    print(response.text[:preview_length])

<!DOCTYPE html>
<html lang="en">
<head>
<!-- don't make any changes here -->
<!-- don't make any changes end here -->
<title>Hindustan Times: Latest News, Breaking News and Today’s News Headlines | Hindustan Times</title>
	<meta name="Description" content="Hindustan Times is India’s No.1 English News website where users can find Latest News, Breaking News, Today’s News Headlines, Trending News and updates from India and the World. Also watch latest photos and videos based on current affairs." />

<meta name="keywords" content="news, breaking news, Union budget news, latest news, news today, daily news, english news, election news, news updates, headlines, India news, politics news, sports news, entertainment news, business news, live cricket score, cricket news, education news" />
<!-- section pages updated /PortalConfig/Common/jpt/meta/meta-page.jpt-->
	<script type="application/ld+json">
				{ 
 					"@context" : "https://schema.org", 
 					"@type": "WebPage", 
 					"

### 3. Parse the page for content

"""
    List of html tags and classes that contain headlines:
    - div.newtop-block h1
    - div.newtop-block h2
    - div.media-conversations-section div.new-assembly-elections div.headingfour
    - div.media-conversations-section div.new-video-news div.headingfour
    - section.post-election-container div.headingfour
    - section.post-election-container div.headingfive
    - div.editor-pick-section div.headingfour
    - div.ipl-hm-container div.headingfour
    - div.ipl-hm-container div.headingfive
    - div.photos-section div.headingfive
    - div.random-news-section h3
    - div.random-news-section div.para-txt
    - div.opinion-section div.headingfour
    - div.opinion-section div.headingfive
"""

In [8]:
def _section_headlines(container, tag, class_=None):
    """
        Returns the stripped, non-empty text of every `tag` element inside `container`.

        Inputs:
            - container: A parsed html element to search in, or None if the section is missing
            - tag: Name of the html tag holding a headline
            - class_: Optional css class the tag must have; None means no class filter

        Output: List of strings; empty if `container` is None or nothing matched
    """
    if container is None:
        return []

    if class_:
        matches = container.find_all(tag, class_=class_)
    else:
        matches = container.find_all(tag)

    # Strip first, then filter, so whitespace-only elements are dropped as well
    texts = (item.text.strip() for item in matches)
    return [text for text in texts if text]


def brute_get_all_headlines(response):
    """
        Gets headlines using hard-coded parsing calls.

        Inputs:
            - response: The returned value of a requests.get() call (its .text is parsed as html)

        Output: Pandas Series object named 'Headlines' containing the unique headlines
                in the order they were first found. Sections that are missing from the
                page are skipped instead of raising an error.
    """

    html_soup = BeautifulSoup(response.text, 'html.parser')

    out_list = []

    # The top block
    newtop_block = html_soup.find('div', class_='newtop-block')

    out_list.extend(_section_headlines(newtop_block, 'h1'))
    out_list.extend(_section_headlines(newtop_block, 'h2'))

    # Media conversations block (two nested sub-blocks)
    media_convo = html_soup.find('div', class_='media-conversations-section')
    new_ass = None
    new_vid = None
    if media_convo is not None:
        new_ass = media_convo.find('div', class_='new-assembly-elections')
        new_vid = media_convo.find('div', class_='new-video-news')

    out_list.extend(_section_headlines(new_ass, 'div', 'headingfour'))
    out_list.extend(_section_headlines(new_vid, 'div', 'headingfour'))

    # Post election block
    post_ele = html_soup.find('section', class_='post-election-container')

    out_list.extend(_section_headlines(post_ele, 'div', 'headingfour'))
    out_list.extend(_section_headlines(post_ele, 'div', 'headingfive'))

    # Editors pick
    edit_pick = html_soup.find('div', class_='editor-pick-section')

    out_list.extend(_section_headlines(edit_pick, 'div', 'headingfour'))

    # 'ipl-hm-container' block
    hm_cont = html_soup.find('div', class_='ipl-hm-container')

    out_list.extend(_section_headlines(hm_cont, 'div', 'headingfour'))
    out_list.extend(_section_headlines(hm_cont, 'div', 'headingfive'))

    # Photos section: headlines are div elements with class 'headingfive'
    # (searching for a tag *named* 'headingfive' never matches anything)
    photo_sect = html_soup.find('div', class_='photos-section')

    out_list.extend(_section_headlines(photo_sect, 'div', 'headingfive'))

    # Random news
    rand_news = html_soup.find('div', class_='random-news-section')

    out_list.extend(_section_headlines(rand_news, 'h3'))
    out_list.extend(_section_headlines(rand_news, 'div', 'para-txt'))

    # Opinion section
    op_sect = html_soup.find('div', class_='opinion-section')

    out_list.extend(_section_headlines(op_sect, 'div', 'headingfour'))
    out_list.extend(_section_headlines(op_sect, 'div', 'headingfive'))

    # Remove any duplicate items; dict.fromkeys keeps the first-seen order,
    # so the result is the same on every run (a set gives an arbitrary order)
    out_list = list(dict.fromkeys(out_list))

    # Create pandas Series
    series = pd.Series(out_list, index=range(len(out_list)), name='Headlines')
    return series

In [9]:
# Where the headlines live on the homepage. Each entry is a space-separated
# hierarchy of 'tag.class' (or plain 'tag') steps, from the outermost container
# down to the element whose text is the headline. The list is passed to
# looped_get_all_headlines as its elements_to_parse argument.
elements_to_parse = [
    'div.newtop-block h1',
    'div.newtop-block h2',
    'div.media-conversations-section div.new-assembly-elections div.headingfour',
    'div.media-conversations-section div.new-video-news div.headingfour',
    'section.post-election-container div.headingfour',
    'section.post-election-container div.headingfive',
    'div.editor-pick-section div.headingfour',
    'div.ipl-hm-container div.headingfour',
    'div.ipl-hm-container div.headingfive',
    'div.photos-section div.headingfive',
    'div.random-news-section h3',
    'div.random-news-section div.para-txt',
    'div.opinion-section div.headingfour',
    'div.opinion-section div.headingfive'
]

def _split_tag_and_class(tag_and_class):
    """
        Splits one 'tag.class' step into the arguments for a BeautifulSoup search.

        Output: Tuple (tag, kwargs) where kwargs is {'class_': <class>} when the step
                names a class and {} otherwise, so a plain 'tag' matches regardless of
                which classes the element carries.
    """
    tag, _, class_ = tag_and_class.partition('.')
    kwargs = {'class_': class_} if class_ else {}
    return tag, kwargs


def looped_get_all_headlines(response, elements_to_parse):
    """
        Gets headlines using looped parsing calls.
        Inputs:
            - response: The returned value of a requests.get() call
            - elements_to_parse: A list of html tags in the form 'tag.class inner-tag.class' that need to be parsed

        Output: Pandas Series object named 'Headlines' containing all unique headlines,
                sorted and indexed from 0

        For every step except the last, only the first matching element is followed.
        If any step of a hierarchy is missing from the page, that entry is skipped.
    """

    # Parser
    html_soup = BeautifulSoup(response.text, 'html.parser')

    out_list = []

    for element in elements_to_parse:
        tag_hierarchy = element.split() # Get the individual tags in the hierarchy

        if not tag_hierarchy:
            # Blank entry, nothing to look for
            continue

        html_block = html_soup # Start each search from the whole document

        for tag_and_class in tag_hierarchy[:-1]:
            # Keep pointing the parser to the next element in hierarchy until the second-last element
            tag, kwargs = _split_tag_and_class(tag_and_class)

            # Find the first occurrence of the tag (None if it is not there)
            html_block = html_block.find(tag, **kwargs)

            if html_block is None:
                break

        if html_block is None:
            # A container is missing: skip this entry rather than searching a wider
            # scope, which would pick up text that is not inside the named container
            continue

        # Get the last element in tag hierarchy
        tag, kwargs = _split_tag_and_class(tag_hierarchy[-1])

        # Add the text contained in each element to the output list;
        # strip first so whitespace-only elements are dropped too
        texts = (item.text.strip() for item in html_block.find_all(tag, **kwargs))
        out_list.extend(text for text in texts if text)

    # Remove duplicate elements (order does not matter here, the Series is sorted below)
    out_list = list(set(out_list))

    # Convert to Series object, sort and reindex
    series = pd.Series(out_list, name='Headlines')
    series = series.sort_values()
    series.index = range(len(series))
    return series

### 4. View and save headlines

In [10]:
# Extract the headlines from the homepage response using the selector list
headlines = looped_get_all_headlines(response, elements_to_parse)

In [11]:
# View the first few entries of the headlines Series
headlines.head()

0    10 yrs since 26/11, gaps still exist in securi...
1    11,000 new polling booths in MP, highest among...
2    2018 assembly polls fight against casteism, dy...
3    25 killed in suicide bomb attack in market in ...
4       26/11 case: Bizarre trial and a saga of delays
Name: Headlines, dtype: object

In [12]:
# Save the headlines Series to HTHeadlines.csv (a relative path, so the file is
# written to the current working directory)
headlines.to_csv('HTHeadlines.csv')