In [2]:
import os
from datetime import date
from time import sleep

import pandas as pd
import requests
from tqdm import tqdm

In [3]:
# Load the raw newsroom list and give its four columns stable names.
newsrooms_raw = pd.read_csv('../data/raw/newsrooms.csv')
newsrooms_raw.columns = ['site', 'monthly_visits', 'country', 'country_of_pub']

# Clean in a single chain: drop incomplete rows, keep the first row per site,
# order by country of publication then monthly visits (both descending), and
# renumber the rows from zero.
newsrooms = (
    newsrooms_raw
    .dropna()
    .drop_duplicates(subset='site')
    .sort_values(by=['country_of_pub', 'monthly_visits'], ascending=False)
    .reset_index(drop=True)
)

In [4]:
# One frame per country of publication, each re-indexed from zero.
UK, USA, India, SA = (
    newsrooms[newsrooms['country_of_pub'] == country_label].reset_index(drop=True)
    for country_label in ('UK', 'USA', 'India', 'South Africa')
)

In [5]:
# Flat list of site names, concatenated in the order SA, USA, UK, India.
all_sites = [
    site_name
    for country_frame in (SA, USA, UK, India)
    for site_name in country_frame['site'].unique().flatten()
]

In [6]:
# Total number of site entries gathered from the four per-country frames.
len(all_sites)

203

In [7]:
# Combined row count of the four per-country frames, for comparison with len(all_sites).
len(UK) + len(USA) + len(India) + len(SA)

203

In [8]:
year = 2020

def get_three_month_ranges(year):
    """Return the four calendar quarters of ``year`` as (start, end) date pairs.

    Parameters
    ----------
    year : int
        Calendar year the quarters belong to.

    Returns
    -------
    list of tuple
        Four ``(start, end)`` tuples in chronological order. ``start`` and
        ``end`` are ``[day, month, year]`` lists, e.g.
        ``([1, 4, 2021], [30, 6, 2021])`` for the second quarter of 2021.
    """
    # (first month, last month, last day of the last month) for each quarter.
    # No quarter ends in February, so leap years do not affect the result.
    quarter_bounds = ((1, 3, 31), (4, 6, 30), (7, 9, 30), (10, 12, 31))

    return [
        ([1, first_month, year], [last_day, last_month, year])
        for first_month, last_month, last_day in quarter_bounds
    ]

In [9]:
# Collect the quarterly ranges of every year from 2010 through 2021, in order.
month_ranges = []
for calendar_year in range(2010, 2022):
    month_ranges.extend(get_three_month_ranges(calendar_year))

In [40]:
# Peek at the single range selected by the slice 45:46.
month_ranges[45:46]

[([1, 4, 2021], [30, 6, 2021])]

In [36]:
# Position of 'vogue.in' within all_sites (wrapped in a list for display).
[all_sites.index('vogue.in')]

[193]

In [37]:
# Show the entries of all_sites at positions 193 through 202.
all_sites[193:203]

['vogue.in',
 'outlookindia.com',
 'mensxp.com',
 'deccanchronicle.com',
 'sumanasa.com',
 'swarajyamag.com',
 'greaterkashmir.com',
 'thebetterindia.com',
 'freepressjournal.in',
 'economictimes.indiatimes.com']

In [19]:
# Number of (start, end) ranges accumulated in month_ranges.
len(month_ranges)

48

In [18]:

# Dry run of the date formatting on the first range only.
for date_range in month_ranges[0:1]:
    print(date_range)
    range_start, range_end = date_range
    # Each endpoint is [day, month, year]; the strings are year-month-day.
    from_date = f'{range_start[2]}-{range_start[1]}-{range_start[0]}'
    to_date = f'{range_end[2]}-{range_end[1]}-{range_end[0]}'
    year = range_start[2]
    # End month divided by three; true division, so the result is a float.
    quarter = range_end[1]/3
    print(quarter)
    

([1, 1, 2010], [31, 3, 2010])
1.0


In [41]:
%%time

# Scrape baseline headlines: one API request per (date range, site) pair, with
# each non-empty result written to its own CSV under ../data/raw_baseline/.
headlines_dict = {}

sites = all_sites[194:203]
# sites = list(USA['site'].unique().flatten())[1:]

url = "https://google-news.p.rapidapi.com/v1/source_search"
# url = "https://google-news1.p.rapidapi.com/search"

# The RapidAPI key is a credential, so it is read from the RAPIDAPI_KEY
# environment variable rather than stored in the notebook. A missing variable
# raises KeyError here, before any request is sent.
headers = {
    'x-rapidapi-key': os.environ['RAPIDAPI_KEY'],
    'x-rapidapi-host': "google-news.p.rapidapi.com"
}

# Upper bound in seconds for a single API call, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT_SECONDS = 30

request_count = 0
for i in tqdm(month_ranges[45:46]):
    # Each range is ([day, month, year], [day, month, year]); the API
    # parameters are built as year-month-day strings (not zero-padded).
    from_date = f'{i[0][2]}-{i[0][1]}-{i[0][0]}'
    to_date = f'{i[1][2]}-{i[1][1]}-{i[1][0]}'
    year = i[0][2]
    # End month divided by three. True division keeps this a float (e.g. 2.0),
    # and that exact text is embedded in the output file name below.
    quarter = i[1][1]/3

    for site in sites:

        print("Scraping" ,site, "starting from", from_date, "to", to_date)

        # baseline news
        querystring = { "lang":"en","from":from_date,"to":to_date,"source":site}

        # women's news
        # querystring = {"q": "women OR woman OR girl OR female OR lady OR ladies OR she OR her OR herself OR aunt OR grandmother OR mother OR sister", 
                      # "lang":"en","from":from_date,"to":to_date,"source":site}

        http_response = requests.get(url, headers=headers, params=querystring,
                                     timeout=REQUEST_TIMEOUT_SECONDS)
        request_count += 1

        # Raise on a 4xx/5xx status instead of failing later with a KeyError
        # on a payload that has no 'articles' entry.
        http_response.raise_for_status()
        response = http_response.json()

        articles = response['articles']

        # The list-valued columns have one item per article; the scalar
        # scrape_date and site values are broadcast to every row.
        headlines_dict = {
            "url": [article['link'] for article in articles],
            "headline": [article['title'] for article in articles],
            "time": [article['published'] for article in articles],
            "scrape_date": date.today().strftime("%d/%m/%Y"),
            "site": site,
        }

        headlines_df = pd.DataFrame.from_dict(headlines_dict)
        number_of_articles = len(headlines_df)

        # Save results to csv; sites with no articles produce no file.
        if number_of_articles!=0:
            headlines_df.to_csv(f'../data/raw_baseline/{number_of_articles}_{site}_{quarter}_{year}_articles.csv')
            print("Saved data from " + str(number_of_articles) + " articles of "  + str(site) + " to .csv!")
        else:
            print("No data from " + str(site) + " during this timeframe!")

        print(request_count, "requests fired")


  0%|          | 0/1 [00:00<?, ?it/s]Scraping outlookindia.com starting from 2021-4-1 to 2021-6-30
Saved data from 100 articles of outlookindia.com to .csv!
1 requests fired
Scraping mensxp.com starting from 2021-4-1 to 2021-6-30
Saved data from 100 articles of mensxp.com to .csv!
2 requests fired
Scraping deccanchronicle.com starting from 2021-4-1 to 2021-6-30
Saved data from 100 articles of deccanchronicle.com to .csv!
3 requests fired
Scraping sumanasa.com starting from 2021-4-1 to 2021-6-30
No data from sumanasa.com during this timeframe!
4 requests fired
Scraping swarajyamag.com starting from 2021-4-1 to 2021-6-30
Saved data from 100 articles of swarajyamag.com to .csv!
5 requests fired
Scraping greaterkashmir.com starting from 2021-4-1 to 2021-6-30
Saved data from 100 articles of greaterkashmir.com to .csv!
6 requests fired
Scraping thebetterindia.com starting from 2021-4-1 to 2021-6-30
Saved data from 100 articles of thebetterindia.com to .csv!
7 requests fired
Scraping freepres

In [154]:
# response['articles']

In [156]:
# Inspect the current value of to_date (assigned inside the date-range loops).
to_date

'2005-2-28'

In [18]:
# One-off request against the source_search endpoint: keyword query for a
# single site over a fixed two-month window.
url = "https://google-news.p.rapidapi.com/v1/source_search"

querystring = {"q": "women OR woman OR girl OR female OR lady OR ladies OR she OR her OR herself OR wife OR mom OR mother OR sister", 
                       "lang":"en","from":"2005-1-1","to":"2005-2-28","source":"economictimes.indiatimes.com"}

# "women OR woman OR girl OR female OR lady OR ladies OR she OR her OR herself OR aunt OR grandmother OR mother OR sister OR daughter OR wife OR mom OR mum OR girlfriend OR mrs OR niece"        

# The RapidAPI key is a credential, so it is read from the RAPIDAPI_KEY
# environment variable rather than stored in the notebook. A missing variable
# raises KeyError here, before the request is sent.
headers = {
    'x-rapidapi-key': os.environ['RAPIDAPI_KEY'],
    'x-rapidapi-host': "google-news.p.rapidapi.com"
    }

# Upper bound in seconds for the call, so a stalled connection cannot hang the cell.
REQUEST_TIMEOUT_SECONDS = 30

http_response = requests.get(url, headers=headers, params=querystring,
                             timeout=REQUEST_TIMEOUT_SECONDS)
# Raise on a 4xx/5xx status rather than handing an error payload to later cells.
http_response.raise_for_status()
response = http_response.json()

# print(response.text)