In [40]:
import requests,bs4,string,csv,traceback,os
import pandas as pd
import re 
from datetime import datetime, timedelta
import logging
import logging.config


def scrape_financial_times():
    config=pd.read_csv('../Config/config.cfg',sep='=',index_col=0)
    logging.config.fileConfig(fname='../Config/logging.conf', disable_existing_loggers=False)
    lastExtractDate=datetime.strptime(config.Value['FinancialTimes_LastDatePulled'],"%d/%m/%Y")
    logger=logging.getLogger('scrape_financial_times')
    logger.info('Started Scrapping the data')
    data_file=config.Value['NewsDataDir']+'/financial_times.csv'
    csv.register_dialect('myDialect',delimiter = '|',escapechar='\\',lineterminator = '\n',quoting=csv.QUOTE_NONE,skipinitialspace=True)
    pageno=1
    condition=True
    while condition:
        if pageno==1:
            weblink='https://www.ft.com/oil'
        else:
            weblink='https://www.ft.com/oil?page='+str(pageno)
        fhand=requests.get(weblink)
#         print(fhand.read())
        if (fhand.text=='Maximum page number exceeded'):
            break
        soup=bs4.BeautifulSoup(fhand.text,'html.parser')
        li_lst=soup.findAll('li',{'class':'o-teaser-collection__item o-grid-row'})
        if li_lst is None:
            li_lst=soup.findAll('li',{'class':'o-teaser-collection__item o-grid-row '})
        else:
            li_lst.extend(soup.findAll('li',{'class':'o-teaser-collection__item o-grid-row '}))
        with open(data_file, 'a') as f:
#             logger.info('Opened file '+data_file)
            writer = csv.writer(f, dialect='myDialect')
            for li in li_lst:
                try:
                    if li and li.find('time',{'data-o-component':'o-date'}):
                        date_object = datetime.strptime(li.find('time',{'data-o-component':'o-date'}).text, "%A, %d %B, %Y")
                        if date_object<=lastExtractDate:
                            condition=False
                            break
                        writer.writerow([date_object.strftime("%d/%m/%Y"),li.find('div',{'class':'o-teaser__heading'}).text.encode('utf8','ignore').decode('utf-8', 'replace')])
                except AttributeError as ae:
                    logger.exception('Error:'+str(ae)+'|Link:'+weblink+'Tag:'+str(li))
                except UnicodeEncodeError as ue:
                    logger.exception('Error:'+str(ue)+'|Link:'+weblink+'Tag:'+str(li))
                except:
                    logger.exception('Error:|Link:'+weblink+'Tag:'+str(li))
        f.close()
        fhand.close()
        logger.info('Scrapping Completed for '+weblink)
        pageno=pageno+1
    config.Value['FinancialTimes_LastDatePulled']=datetime.strftime(datetime.now()- timedelta(1),"%d/%m/%Y")
    config.to_csv('../Config/config.cfg',sep='=')
    logger.info('FinancialTimes:completed Successfully')

def scrape_oilprice():
    config=pd.read_csv('../Config/config.cfg',sep='=',index_col=0)
    logging.config.fileConfig(fname='../Config/logging.conf', disable_existing_loggers=False)
    lastExtractDate=datetime.strptime(config.Value['OilPrice_LastDatePulled'],"%d/%m/%Y")
    logger=logging.getLogger('scrape_oilprice')
    logger.info('Started Scrapping the data')
    data_file=config.Value['NewsDataDir']+'/oilprice.csv'
    csv.register_dialect('myDialect',delimiter = '|',escapechar='\\',lineterminator = '\n',quoting=csv.QUOTE_NONE,skipinitialspace=True)
    pageno=1
    condition=True
    while condition:
        if pageno==1:
            weblink='https://oilprice.com/Energy/Crude-Oil/'
        else:
            weblink='https://oilprice.com/Energy/Crude-Oil/Page-'+str(pageno)+".html"
        fhand=requests.get(weblink)
        soup=bs4.BeautifulSoup(fhand.text,'html.parser')
        article_lst=soup.findAll('div',{'class':'categoryArticle__content'})
        with open(data_file, 'a') as f:
#             logger.info('Opened file '+data_file)
            writer = csv.writer(f, dialect='myDialect')
            for article in article_lst:
                try:
                    if article:
                        date_object = datetime.strptime(article.find('p',{'class':'categoryArticle__meta'}).text.split('at')[0].strip(), "%b %d, %Y")
                        if date_object<=lastExtractDate:
                            condition=False
                            break
                        writer.writerow([date_object.strftime("%d/%m/%Y"),article.find('h2',{'class':'categoryArticle__title'}).text.strip().encode('utf8','ignore').decode('utf-8', 'replace')])
                except AttributeError as ae:
                    logger.exception('Error:'+str(ae)+'|Link:'+weblink+'Tag:'+str(li))
                except UnicodeEncodeError as ue:
                    logger.exception('Error:'+str(ue)+'|Link:'+weblink+'Tag:'+str(li))
                except:
                    logger.exception('Error:|Link:'+weblink+'Tag:'+str(li))
        f.close()
        fhand.close()
        logger.info('Scrapping Completed for '+weblink)
        pageno=pageno+1
        max_page_no=int(soup.find('span',{'class':'num_pages'}).text.strip('/'))
        if (pageno>max_page_no):
            break
    config.Value['OilPrice_LastDatePulled']=datetime.strftime(datetime.now()- timedelta(1),"%d/%m/%Y")
    config.to_csv('../Config/config.cfg',sep='=')
    logger.info('OilPrice:completed Successfully')
    
def scrape_moneycontrol():
    config=pd.read_csv('../Config/config.cfg',sep='=',index_col=0)
    logging.config.fileConfig(fname='../Config/logging.conf', disable_existing_loggers=False)
    lastExtractDate=datetime.strptime(config.Value['MoneyControl_LastDatePulled'],"%d/%m/%Y")
    logger=logging.getLogger('scrape_moneycontrol')
    logger.info('Started Scrapping the data')
    data_file=config.Value['NewsDataDir']+'/moneycontrol.csv'
    csv.register_dialect('myDialect',delimiter = '|',escapechar='\\',lineterminator = '\n',quoting=csv.QUOTE_NONE,skipinitialspace=True)
    pageno=1
    condition=True
    while condition:
        if pageno==1:
            weblink='https://www.moneycontrol.com/news/commodities-news-94.html/'
        else:
            weblink='https://www.moneycontrol.com/news/commodities-news-94.html/page-'+str(pageno)+'/'
        fhand=requests.get(weblink)
        soup=bs4.BeautifulSoup(fhand.text,'html.parser')
        article_lst=soup.find('ul', {'id': 'cagetory'}).find_all('li', {'id':re.compile('newslist-\d+')})
        with open(data_file, 'a') as f:
#             logger.info('Opened file '+data_file)
            writer = csv.writer(f, dialect='myDialect')
            for article in article_lst:
                try:
                    if article:
                        if re.search('oil|crude|petrol|diesel',article.find('a').get('title').lower()):
                            date_object = datetime.strptime(' '.join(article.find('span').text.split()[:3]).strip(), "%B %d, %Y")
                            if date_object<=lastExtractDate:
                                condition=False
                                break
                            writer.writerow([date_object.strftime("%d/%m/%Y"),article.find('a').get('title').strip().encode('utf8','ignore').decode('utf-8', 'replace')])
                        else:
                            logger.debug('No Crude oile news found')
                except AttributeError as ae:
                    logger.exception('Error:'+str(ae)+'|Link:'+weblink+'Tag:'+str(li))
                except UnicodeEncodeError as ue:
                    logger.exception('Error:'+str(ue)+'|Link:'+weblink+'Tag:'+str(li))
                except:
                    logger.exception('Error:|Link:'+weblink+'Tag:'+str(li))
        f.close()
        fhand.close()
        logger.info('Scrapping Completed for '+weblink)
        pageno=pageno+1
        for last_anchor in soup.find('div', {'class': 'pagenation'}).find_all('a', {'class':'last'}):
            if last_anchor.text.strip(' »')=='Last':
                max_page_no=int(last_anchor['data-page'])
                break
        if (pageno>max_page_no):
            break
    config.Value['MoneyControl_LastDatePulled']=datetime.strftime(datetime.now()- timedelta(1),"%d/%m/%Y")
    config.to_csv('../Config/config.cfg',sep='=')
    logger.info('MoneyControl:completed Successfully')
def main():
    try:
        scrape_financial_times()
    except:
        logging.exception('Error While scriping Financial Times')
    try:
        scrape_oilprice()
    except:
        logging.exception('Error While scriping Oil price')
    try:
        scrape_moneycontrol()
    except:
        logging.exception('Error While scriping MoneyControl')
logging.shutdown()

In [136]:
main()

In [104]:
import pandas as pd
from datetime import datetime
to_datetime = lambda d: datetime.strptime(d, "%d/%m/%Y")

In [65]:
data=pd.read_csv('../Data/NewsData/financial_times.csv',sep='|',names=['Date', 'HeadLines'],converters={'Date': to_datetime},encoding = "ISO-8859-1")
word_count_file=pd.read_csv()

In [113]:
data['Agg_Col']=data.Date.apply(lambda dt:datetime.strftime(get_relevance_date(dt),"%d/%m/%Y"))

In [135]:
data.groupby('Agg_Col').sum().head(29)

Unnamed: 0_level_0,HeadLines
Agg_Col,Unnamed: 1_level_1
01/02/2016,Premier Oil urges action on North SeaS&P cuts ...
01/02/2017,Oil market waits for Opec-shale tug of war to ...
01/02/2018,The remarkable revival of US oil productionBuo...
01/02/2019,Russias support for Venezuela has deep rootsT...
01/03/2016,Shell faces test of environmental recordWhat p...
01/03/2017,Exxon chief forecasts higher production growth...
01/03/2018,We need to talk about Igor: the rise of Russ...
01/03/2019,Sultan Al Jaber: changing the mindset of an ol...
01/04/2016,Japans biggest bank to grow fixed incomeTrump...
01/04/2019,Saudi Aramco leaves questions for potential in...


In [None]:
file=open('../Data/NewsData/financial_times.csv','r')

In [None]:
for line in file.readlines():
    print(line.split('|'))
file.close()

In [26]:
x='Iran\xe2\x80\x99s deepening malaise laid'
print(x.encode('utf8').decode('utf8'))

Iranâs deepening malaise laid
