In [1]:
import requests
from time import sleep
from datetime import date
import pandas as pd

In [2]:
newsrooms = pd.read_csv('../data/raw/newsrooms.csv')
newsrooms.columns = ['site', 'monthly_visits', 'country', 'country_of_pub']
newsrooms.dropna(inplace=True)
newsrooms = newsrooms.drop_duplicates(subset = 'site')
newsrooms = newsrooms.sort_values(by=['country_of_pub', 'monthly_visits'], ascending=False).reset_index().drop('index', axis=1)

In [3]:
UK = newsrooms[newsrooms['country_of_pub']=='UK'].reset_index().drop('index', axis=1)
USA = newsrooms[newsrooms['country_of_pub']=='USA'].reset_index().drop('index', axis=1)
India = newsrooms[newsrooms['country_of_pub']=='India'].reset_index().drop('index', axis=1)
SA = newsrooms[newsrooms['country_of_pub']=='South Africa'].reset_index().drop('index', axis=1)

In [4]:
all_sites = list(SA.site.unique().flatten()) + list(USA.site.unique().flatten()) + list(UK.site.unique().flatten()) + list(India.site.unique().flatten())

In [5]:
len(all_sites)

203

In [6]:
len(UK) + len(USA) + len(India) + len(SA)

203

In [7]:
year = 2020

def get_two_month_ranges(year):
    month_ranges = [[[1, 1, year], [31, 1, year]], 
                    [[1, 2, year], [28, 2, year]],
                    [[1, 3, year], [31, 3, year]],
                    [[1, 4, year], [30, 4, year]],
                    [[1, 5, year], [31, 5, year]],
                    [[1, 6, year], [30, 6, year]],
                    [[1, 7, year], [31, 7, year]],
                    [[1, 8, year], [31, 8, year]],
                    [[1, 9, year], [30, 9, year]],
                    [[1, 10, year], [31, 10, year]],
                    [[1, 11, year], [30, 11, year]],
                    [[1, 12, year], [31, 12, year]]]

    two_month_ranges = []
    for i in range(len(month_ranges)-1):
        if i%2==0:
            two_month_ranges.append((month_ranges[i][0], month_ranges[i+1][1]))

    return two_month_ranges

In [8]:
month_ranges = []
for i in range(2020,2022):
    month_ranges = month_ranges + get_two_month_ranges(i)

In [9]:
import tqdm

In [10]:
month_ranges

[([1, 1, 2020], [28, 2, 2020]),
 ([1, 3, 2020], [30, 4, 2020]),
 ([1, 5, 2020], [30, 6, 2020]),
 ([1, 7, 2020], [31, 8, 2020]),
 ([1, 9, 2020], [31, 10, 2020]),
 ([1, 11, 2020], [31, 12, 2020]),
 ([1, 1, 2021], [28, 2, 2021]),
 ([1, 3, 2021], [30, 4, 2021]),
 ([1, 5, 2021], [30, 6, 2021]),
 ([1, 7, 2021], [31, 8, 2021]),
 ([1, 9, 2021], [31, 10, 2021]),
 ([1, 11, 2021], [31, 12, 2021])]

In [17]:
%%time

from tqdm import tqdm
headlines_dict = {}

sites = all_sites
# sites = list(USA['site'].unique().flatten())[1:]

url = "https://google-news.p.rapidapi.com/v1/source_search"
# url = "https://google-news1.p.rapidapi.com/search"

request_count = 0
for i in tqdm(month_ranges):
    from_date = f'{i[0][2]}-{i[0][1]}-{i[0][0]}'
    to_date = f'{i[1][2]}-{i[1][1]}-{i[1][0]}'
    year = i[0][2]
    bimester = i[1][1]/2

    for site in sites:

        print("Scraping" ,site, "starting from", from_date, "to", to_date)

        urls = []
        headlines = []
        times = []
        scrape_dates = []
        websites = []

        # baseline news

#         querystring = { "lang":"en","from":from_date,"to":to_date,"source":site}

        # women's news

        querystring = {"q": "women OR woman OR girl OR female OR lady OR ladies OR she OR her OR herself OR aunt OR grandmother OR mother OR sister", 
                       "lang":"en","from":from_date,"to":to_date,"source":site}

        headers = {

            # Free API
            'x-rapidapi-key': "bd5010b7eemsh00ab5684fa6a7efp15b791jsnd00e2d098829",
            'x-rapidapi-host': "google-news.p.rapidapi.com"
            }

        response = requests.request("GET", url, headers=headers, params=querystring).json()

        request_count += 1

        for j in range(len(response['articles'])):
            headlines.append(response['articles'][j]['title'])
            times.append(response['articles'][j]['published'])
            urls.append(response['articles'][j]['link'])
            websites.append(response['articles'][j]['source']['href'])

        headlines_dict = {"url": urls, "headline": headlines, "time": times, "scrape_date": date.today().strftime("%d/%m/%Y"), "site": site}

        headlines_df = pd.DataFrame.from_dict(headlines_dict)
        number_of_articles = len(headlines_df)
        # save results to csv
        if number_of_articles!=0:
            
            headlines_df.to_csv(f'../data/raw_temporal/{number_of_articles}_{site}_{bimester}_{year}_articles.csv')
        # print final statement
            print("Saved data from " + str(number_of_articles) + " articles of "  + str(site) + " to .csv!")
        else:
            print("No data from " + str(site) + " during this timeframe!")
            
        print(request_count, "requests fired")


  0%|                                                                                           | 0/12 [00:00<?, ?it/s]

Scraping News24.com starting from 2020-1-1 to 2020-2-28
Saved data from 100 articles of News24.com to .csv!
1 requests fired
Scraping Gumtree.co.za starting from 2020-1-1 to 2020-2-28
No data from Gumtree.co.za during this timeframe!
2 requests fired
Scraping Cars.co.za starting from 2020-1-1 to 2020-2-28
No data from Cars.co.za during this timeframe!
3 requests fired
Scraping BusinessTech.co.za starting from 2020-1-1 to 2020-2-28
Saved data from 1 articles of BusinessTech.co.za to .csv!
4 requests fired
Scraping Iol.co.za starting from 2020-1-1 to 2020-2-28
Saved data from 100 articles of Iol.co.za to .csv!
5 requests fired
Scraping Maroelamedia.co.za starting from 2020-1-1 to 2020-2-28
No data from Maroelamedia.co.za during this timeframe!
6 requests fired
Scraping Timeslive.co.za starting from 2020-1-1 to 2020-2-28
Saved data from 100 articles of Timeslive.co.za to .csv!
7 requests fired
Scraping Ewn.co.za starting from 2020-1-1 to 2020-2-28
Saved data from 59 articles of Ewn.co.za 

  8%|██████▋                                                                         | 1/12 [06:53<1:15:45, 413.21s/it]

Saved data from 64 articles of economictimes.indiatimes.com to .csv!
203 requests fired
Scraping News24.com starting from 2020-3-1 to 2020-4-30
Saved data from 92 articles of News24.com to .csv!
204 requests fired
Scraping Gumtree.co.za starting from 2020-3-1 to 2020-4-30
No data from Gumtree.co.za during this timeframe!
205 requests fired
Scraping Cars.co.za starting from 2020-3-1 to 2020-4-30
No data from Cars.co.za during this timeframe!
206 requests fired
Scraping BusinessTech.co.za starting from 2020-3-1 to 2020-4-30
No data from BusinessTech.co.za during this timeframe!
207 requests fired
Scraping Iol.co.za starting from 2020-3-1 to 2020-4-30
Saved data from 100 articles of Iol.co.za to .csv!
208 requests fired
Scraping Maroelamedia.co.za starting from 2020-3-1 to 2020-4-30
No data from Maroelamedia.co.za during this timeframe!
209 requests fired
Scraping Timeslive.co.za starting from 2020-3-1 to 2020-4-30
Saved data from 100 articles of Timeslive.co.za to .csv!
210 requests fire

 17%|█████████████▎                                                                  | 2/12 [13:27<1:07:54, 407.46s/it]

Saved data from 100 articles of economictimes.indiatimes.com to .csv!
406 requests fired
Scraping News24.com starting from 2020-5-1 to 2020-6-30
Saved data from 100 articles of News24.com to .csv!
407 requests fired
Scraping Gumtree.co.za starting from 2020-5-1 to 2020-6-30
No data from Gumtree.co.za during this timeframe!
408 requests fired
Scraping Cars.co.za starting from 2020-5-1 to 2020-6-30
No data from Cars.co.za during this timeframe!
409 requests fired
Scraping BusinessTech.co.za starting from 2020-5-1 to 2020-6-30
No data from BusinessTech.co.za during this timeframe!
410 requests fired
Scraping Iol.co.za starting from 2020-5-1 to 2020-6-30
Saved data from 100 articles of Iol.co.za to .csv!
411 requests fired
Scraping Maroelamedia.co.za starting from 2020-5-1 to 2020-6-30
No data from Maroelamedia.co.za during this timeframe!
412 requests fired
Scraping Timeslive.co.za starting from 2020-5-1 to 2020-6-30
Saved data from 100 articles of Timeslive.co.za to .csv!
413 requests fi

 25%|████████████████████                                                            | 3/12 [19:54<1:00:12, 401.37s/it]

Saved data from 67 articles of economictimes.indiatimes.com to .csv!
609 requests fired
Scraping News24.com starting from 2020-7-1 to 2020-8-31
Saved data from 100 articles of News24.com to .csv!
610 requests fired
Scraping Gumtree.co.za starting from 2020-7-1 to 2020-8-31
No data from Gumtree.co.za during this timeframe!
611 requests fired
Scraping Cars.co.za starting from 2020-7-1 to 2020-8-31
No data from Cars.co.za during this timeframe!
612 requests fired
Scraping BusinessTech.co.za starting from 2020-7-1 to 2020-8-31
Saved data from 2 articles of BusinessTech.co.za to .csv!
613 requests fired
Scraping Iol.co.za starting from 2020-7-1 to 2020-8-31
Saved data from 100 articles of Iol.co.za to .csv!
614 requests fired
Scraping Maroelamedia.co.za starting from 2020-7-1 to 2020-8-31
No data from Maroelamedia.co.za during this timeframe!
615 requests fired
Scraping Timeslive.co.za starting from 2020-7-1 to 2020-8-31
Saved data from 100 articles of Timeslive.co.za to .csv!
616 requests 

 33%|███████████████████████████▎                                                      | 4/12 [26:19<52:52, 396.62s/it]

Saved data from 79 articles of economictimes.indiatimes.com to .csv!
812 requests fired
Scraping News24.com starting from 2020-9-1 to 2020-10-31
Saved data from 100 articles of News24.com to .csv!
813 requests fired
Scraping Gumtree.co.za starting from 2020-9-1 to 2020-10-31
No data from Gumtree.co.za during this timeframe!
814 requests fired
Scraping Cars.co.za starting from 2020-9-1 to 2020-10-31
No data from Cars.co.za during this timeframe!
815 requests fired
Scraping BusinessTech.co.za starting from 2020-9-1 to 2020-10-31
No data from BusinessTech.co.za during this timeframe!
816 requests fired
Scraping Iol.co.za starting from 2020-9-1 to 2020-10-31
Saved data from 100 articles of Iol.co.za to .csv!
817 requests fired
Scraping Maroelamedia.co.za starting from 2020-9-1 to 2020-10-31
No data from Maroelamedia.co.za during this timeframe!
818 requests fired
Scraping Timeslive.co.za starting from 2020-9-1 to 2020-10-31
Saved data from 100 articles of Timeslive.co.za to .csv!
819 reque

 42%|██████████████████████████████████▏                                               | 5/12 [32:52<46:07, 395.37s/it]

Saved data from 100 articles of economictimes.indiatimes.com to .csv!
1015 requests fired
Scraping News24.com starting from 2020-11-1 to 2020-12-31
Saved data from 100 articles of News24.com to .csv!
1016 requests fired
Scraping Gumtree.co.za starting from 2020-11-1 to 2020-12-31
No data from Gumtree.co.za during this timeframe!
1017 requests fired
Scraping Cars.co.za starting from 2020-11-1 to 2020-12-31
No data from Cars.co.za during this timeframe!
1018 requests fired
Scraping BusinessTech.co.za starting from 2020-11-1 to 2020-12-31
No data from BusinessTech.co.za during this timeframe!
1019 requests fired
Scraping Iol.co.za starting from 2020-11-1 to 2020-12-31
Saved data from 100 articles of Iol.co.za to .csv!
1020 requests fired
Scraping Maroelamedia.co.za starting from 2020-11-1 to 2020-12-31
No data from Maroelamedia.co.za during this timeframe!
1021 requests fired
Scraping Timeslive.co.za starting from 2020-11-1 to 2020-12-31
Saved data from 100 articles of Timeslive.co.za to 

 50%|█████████████████████████████████████████                                         | 6/12 [39:35<39:46, 397.70s/it]

Saved data from 100 articles of economictimes.indiatimes.com to .csv!
1218 requests fired
Scraping News24.com starting from 2021-1-1 to 2021-2-28
Saved data from 100 articles of News24.com to .csv!
1219 requests fired
Scraping Gumtree.co.za starting from 2021-1-1 to 2021-2-28
No data from Gumtree.co.za during this timeframe!
1220 requests fired
Scraping Cars.co.za starting from 2021-1-1 to 2021-2-28
No data from Cars.co.za during this timeframe!
1221 requests fired
Scraping BusinessTech.co.za starting from 2021-1-1 to 2021-2-28
No data from BusinessTech.co.za during this timeframe!
1222 requests fired
Scraping Iol.co.za starting from 2021-1-1 to 2021-2-28
Saved data from 100 articles of Iol.co.za to .csv!
1223 requests fired
Scraping Maroelamedia.co.za starting from 2021-1-1 to 2021-2-28
No data from Maroelamedia.co.za during this timeframe!
1224 requests fired
Scraping Timeslive.co.za starting from 2021-1-1 to 2021-2-28
Saved data from 100 articles of Timeslive.co.za to .csv!
1225 req

 58%|███████████████████████████████████████████████▊                                  | 7/12 [46:19<33:17, 399.46s/it]

Saved data from 100 articles of economictimes.indiatimes.com to .csv!
1421 requests fired
Scraping News24.com starting from 2021-3-1 to 2021-4-30
Saved data from 100 articles of News24.com to .csv!
1422 requests fired
Scraping Gumtree.co.za starting from 2021-3-1 to 2021-4-30
No data from Gumtree.co.za during this timeframe!
1423 requests fired
Scraping Cars.co.za starting from 2021-3-1 to 2021-4-30
No data from Cars.co.za during this timeframe!
1424 requests fired
Scraping BusinessTech.co.za starting from 2021-3-1 to 2021-4-30
No data from BusinessTech.co.za during this timeframe!
1425 requests fired
Scraping Iol.co.za starting from 2021-3-1 to 2021-4-30
Saved data from 100 articles of Iol.co.za to .csv!
1426 requests fired
Scraping Maroelamedia.co.za starting from 2021-3-1 to 2021-4-30
No data from Maroelamedia.co.za during this timeframe!
1427 requests fired
Scraping Timeslive.co.za starting from 2021-3-1 to 2021-4-30
Saved data from 100 articles of Timeslive.co.za to .csv!
1428 req

 67%|██████████████████████████████████████████████████████▋                           | 8/12 [53:14<26:56, 404.12s/it]

Saved data from 100 articles of economictimes.indiatimes.com to .csv!
1624 requests fired
Scraping News24.com starting from 2021-5-1 to 2021-6-30
Saved data from 100 articles of News24.com to .csv!
1625 requests fired
Scraping Gumtree.co.za starting from 2021-5-1 to 2021-6-30
No data from Gumtree.co.za during this timeframe!
1626 requests fired
Scraping Cars.co.za starting from 2021-5-1 to 2021-6-30
No data from Cars.co.za during this timeframe!
1627 requests fired
Scraping BusinessTech.co.za starting from 2021-5-1 to 2021-6-30
No data from BusinessTech.co.za during this timeframe!
1628 requests fired
Scraping Iol.co.za starting from 2021-5-1 to 2021-6-30
Saved data from 100 articles of Iol.co.za to .csv!
1629 requests fired
Scraping Maroelamedia.co.za starting from 2021-5-1 to 2021-6-30
No data from Maroelamedia.co.za during this timeframe!
1630 requests fired
Scraping Timeslive.co.za starting from 2021-5-1 to 2021-6-30
Saved data from 100 articles of Timeslive.co.za to .csv!
1631 req

 75%|█████████████████████████████████████████████████████████████▌                    | 9/12 [59:37<19:53, 397.83s/it]

Saved data from 77 articles of economictimes.indiatimes.com to .csv!
1827 requests fired
Scraping News24.com starting from 2021-7-1 to 2021-8-31
No data from News24.com during this timeframe!
1828 requests fired
Scraping Gumtree.co.za starting from 2021-7-1 to 2021-8-31
No data from Gumtree.co.za during this timeframe!
1829 requests fired
Scraping Cars.co.za starting from 2021-7-1 to 2021-8-31
No data from Cars.co.za during this timeframe!
1830 requests fired
Scraping BusinessTech.co.za starting from 2021-7-1 to 2021-8-31
No data from BusinessTech.co.za during this timeframe!
1831 requests fired
Scraping Iol.co.za starting from 2021-7-1 to 2021-8-31
No data from Iol.co.za during this timeframe!
1832 requests fired
Scraping Maroelamedia.co.za starting from 2021-7-1 to 2021-8-31
No data from Maroelamedia.co.za during this timeframe!
1833 requests fired
Scraping Timeslive.co.za starting from 2021-7-1 to 2021-8-31
No data from Timeslive.co.za during this timeframe!
1834 requests fired
Scra

 83%|█████████████████████████████████████████████████████████████████▊             | 10/12 [1:05:17<12:41, 380.53s/it]

No data from economictimes.indiatimes.com during this timeframe!
2030 requests fired
Scraping News24.com starting from 2021-9-1 to 2021-10-31
No data from News24.com during this timeframe!
2031 requests fired
Scraping Gumtree.co.za starting from 2021-9-1 to 2021-10-31
No data from Gumtree.co.za during this timeframe!
2032 requests fired
Scraping Cars.co.za starting from 2021-9-1 to 2021-10-31
No data from Cars.co.za during this timeframe!
2033 requests fired
Scraping BusinessTech.co.za starting from 2021-9-1 to 2021-10-31
No data from BusinessTech.co.za during this timeframe!
2034 requests fired
Scraping Iol.co.za starting from 2021-9-1 to 2021-10-31
No data from Iol.co.za during this timeframe!
2035 requests fired
Scraping Maroelamedia.co.za starting from 2021-9-1 to 2021-10-31
No data from Maroelamedia.co.za during this timeframe!
2036 requests fired
Scraping Timeslive.co.za starting from 2021-9-1 to 2021-10-31
No data from Timeslive.co.za during this timeframe!
2037 requests fired
S

 92%|████████████████████████████████████████████████████████████████████████▍      | 11/12 [1:11:01<06:09, 369.62s/it]

No data from economictimes.indiatimes.com during this timeframe!
2233 requests fired
Scraping News24.com starting from 2021-11-1 to 2021-12-31
No data from News24.com during this timeframe!
2234 requests fired
Scraping Gumtree.co.za starting from 2021-11-1 to 2021-12-31
No data from Gumtree.co.za during this timeframe!
2235 requests fired
Scraping Cars.co.za starting from 2021-11-1 to 2021-12-31
No data from Cars.co.za during this timeframe!
2236 requests fired
Scraping BusinessTech.co.za starting from 2021-11-1 to 2021-12-31
No data from BusinessTech.co.za during this timeframe!
2237 requests fired
Scraping Iol.co.za starting from 2021-11-1 to 2021-12-31
No data from Iol.co.za during this timeframe!
2238 requests fired
Scraping Maroelamedia.co.za starting from 2021-11-1 to 2021-12-31
No data from Maroelamedia.co.za during this timeframe!
2239 requests fired
Scraping Timeslive.co.za starting from 2021-11-1 to 2021-12-31
No data from Timeslive.co.za during this timeframe!
2240 requests 

100%|███████████████████████████████████████████████████████████████████████████████| 12/12 [1:16:45<00:00, 383.79s/it]

No data from economictimes.indiatimes.com during this timeframe!
2436 requests fired
Wall time: 1h 16min 45s





In [16]:
response

{'message': 'You have exceeded the rate limit per hour for your plan, BASIC, by the API provider'}

In [154]:
# response['articles']

In [18]:
# url = "https://google-news.p.rapidapi.com/v1/source_search"

# querystring = {"q": "women OR woman OR girl OR female OR lady OR ladies OR she OR her OR herself OR wife OR mom OR mother OR sister", 
#                        "lang":"en","from":"2005-1-1","to":"2005-2-28","source":"economictimes.indiatimes.com"}

# # "women OR woman OR girl OR female OR lady OR ladies OR she OR her OR herself OR aunt OR grandmother OR mother OR sister OR daughter OR wife OR mom OR mum OR girlfriend OR mrs OR niece"        
# headers = {
#     'x-rapidapi-key': "bd5010b7eemsh00ab5684fa6a7efp15b791jsnd00e2d098829",
#     'x-rapidapi-host': "google-news.p.rapidapi.com"
#     }

# response = requests.request("GET", url, headers=headers, params=querystring).json()

# # print(response.text)

In [19]:
# response

{'feed': {'title': '"allinurl:economictimes.indiatimes.com women OR woman OR girl OR female OR lady OR ladies OR she OR her OR herself OR wife OR mom OR mother OR sister after:2005-01-01 before:2005-02-28" - Google News',
  'updated': 'Fri, 14 May 2021 21:19:29 GMT',
  'link': 'https://news.google.com/search?q=allinurl:economictimes.indiatimes.com+women+OR+woman+OR+girl+OR+female+OR+lady+OR+ladies+OR+she+OR+her+OR+herself+OR+wife+OR+mom+OR+mother+OR+sister+after:2005-01-01+before:2005-02-28&ceid=US:en&hl=en-US&gl=US',
  'language': 'en-US',
  'subtitle': 'Google News',
  'rights': '2021 Google Inc.'},
 'articles': []}