In [None]:
import pandas as pd
import json
import requests
import bs4
import re

Create a function that receives as parameters the query we want to search, the index of the result at which the search should start, and the number of results to request. As the API only lets you download 100 results at a time, we need to perform 3 calls in order to get the first 300 results (where the most meaningful content is).

In [None]:
def requester(query, start, results):
    """Fetch one page of organic search results from the Zenserp API and
    accumulate it in the module-level ``df``.

    Parameters
    ----------
    query : str
        Search expression, sent as the ``q`` parameter.
    start : int or str
        Offset of the first result to request (``start`` parameter).
        An offset of 0 (or "0") marks the first page and resets ``df``.
    results : int
        Number of results to request (``num`` parameter).

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    KeyError
        If the response body has no "organic" entry.

    Side effects
    ------------
    Rebinds the global ``df`` and (re)writes its "query" column.
    """
    global df

    headers = {"apikey": "your_api"}
    params = (
        ("q", query),
        ("num", results),
        ("start", start),
    )

    resp = requests.get('https://app.zenserp.com/api/v2/search', headers=headers, params=params)
    # Fail loudly on HTTP errors instead of with a confusing KeyError below.
    resp.raise_for_status()

    data = json.loads(resp.text)
    df_partial = pd.json_normalize(data["organic"])

    # The first page starts a fresh frame; later pages are appended to it.
    # Compare through str() so both the integer 0 and the string "0" are
    # recognised (the callers in this notebook pass integers).
    if str(start) == "0":
        df = df_partial
    else:
        df = pd.concat([df, df_partial])

    # Add a column with the query used.
    df["query"] = query


In the next block the start points are established; they are the variable that changes from one API call to the next. In this case, we want 100 results starting from 0, 100 results starting from 100, and 100 results starting from 200.

In [None]:
# One request per page offset: 0, 100 and 200 cover the first 300 results.
start_points = list(range(0, 300, 100))
query = "your_query"

# Accumulator that requester() rebinds through its `global df`.
df = pd.DataFrame()

for offset in start_points:
    requester(query, offset, 100)

The result is a df with the first 300 results. Having the url of each article will let us scrape the sites to look for more information.

In [None]:
# Preview the first rows (DataFrame.head returns 5 rows by default).
df.head()

Unnamed: 0,position,title,url,destination,description,isAmp,images,isCarousel,moreUrl,query
0,1,Noticias de Narcotráfico - Santa Fe,https://www.ellitoral.com/tags/narcotrafico,https://www.ellitoral.com › tags,Narcotráfico. Parte de los elementos incautado...,False,,,,narcotrafico site:ellitoral.com -impresa
1,2,Colombia propone una alianza latinoamericana p...,https://www.ellitoral.com/internacionales/colo...,https://www.ellitoral.com › colom...,"Sep 10, 2023 — Colombia, al igual que otros pa...",False,,,,narcotrafico site:ellitoral.com -impresa
2,3,México y Estados Unidos se unen contra narcotr...,https://www.ellitoral.com/internacionales/mexi...,https://www.ellitoral.com › mexico...,"Jul 25, 2023 — México y Estados Unidos se unen...",False,,,,narcotrafico site:ellitoral.com -impresa
3,4,Cayó una banda que modificaba llantas para lle...,https://www.ellitoral.com/sucesos/gendarmeria-...,https://www.ellitoral.com › sucesos,"Nov 2, 2023 — Además de las detenciones, se lo...",False,,,,narcotrafico site:ellitoral.com -impresa
4,5,Vendían drogas en plena ciudad de Rosario y fu...,https://www.ellitoral.com/sucesos/rosario-ahor...,https://www.ellitoral.com › sucesos,"Nov 16, 2023 — La Dirección de Investigación s...",False,,,,narcotrafico site:ellitoral.com -impresa


Then we proceed to clean the df, suppressing irrelevant columns and dropping duplicates.

In [None]:
# Single cleaning pipeline: every step returns a new frame, nothing is
# modified in place.
df = (
    df.iloc[1:]  # skip the first row, which is not an article
    .drop_duplicates(subset="url", ignore_index=True)  # keep one row per url
    .drop(columns=["images", "isCarousel", "moreUrl", "isAmp", "destination", "position"])  # irrelevant columns
    .dropna()  # discard rows with any missing value
)


# Scraping date

Here we create a function that searches for the date in the html of a given url. For that we use the library "BeautifulSoup"; a similar process will be used to retrieve the title and content of the articles.

In [None]:
# Collect the article urls as a plain Python list.
h = df["url"].tolist()

def date_extract(url):
    """Download ``url`` and return the node that follows its date tag.

    The page is searched first for a ``<div itemprop="datePublished">`` and,
    if there is none, for the first ``<time>`` tag. The ``next_element`` of
    the tag that was found is returned; ``None`` is returned when the page
    contains neither tag.
    """
    page = requests.get(url)
    parsed = bs4.BeautifulSoup(page.text, 'html.parser')

    published = parsed.find('div', {'itemprop': 'datePublished'})
    if published is not None:
        return published.next_element

    # Fallback for pages without the itemprop markup.
    time_tag = parsed.find('time')
    if time_tag is not None:
        return time_tag.next_element

    return None


# Scrape one date per url; the resulting list becomes the "date" column.
b = [date_extract(link) for link in h]
df["date"] = b


In [None]:
# Keep only digits, "." and "-".
# The pattern has to be flagged as a regular expression: since pandas 2.0
# `Series.str.replace` treats the pattern as a literal string by default,
# which would leave the column untouched. A raw string avoids the invalid
# "\-" escape sequence.
df['date'] = df['date'].astype(str).str.replace(r'[^0-9\-.]', '', regex=True)

We perform some transformations on the scraped dates, deleting accessory characters.

In [None]:
# Start again from the raw scraped values held in `b`.
df["date"] = b

# Drop the rows whose date is missing: that is how urls for which no date
# could be retrieved were tagged. `.copy()` makes the filtered frame
# independent, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
df = df[df['date'].notna()].copy()

df["date"] = (
    df["date"]
    .apply(str)
    .str.slice(-10)                     # keep only the last 10 characters
    .str.replace(' ', '', regex=False)  # remove empty spaces
)
df['date']

In [None]:
# Remove any lowercase letters that may still be left in the date strings.
strip_letters = str.maketrans('', '', 'abcdefghijklmnñopqrstuvwxyz')

date_list = df["date"].to_list()
date_list2 = [raw.translate(strip_letters) for raw in date_list]

df["date"] = date_list2

# Parse the cleaned strings into datetime values.
# NOTE(review): no explicit format/dayfirst is given, so pandas infers it —
# confirm this matches the date layout used by the scraped site.
df["date"] = pd.to_datetime(df["date"])

## Scraping Title and content

Extract Titles

In [None]:
# Build a list with the urls.
h = df.url.to_list()

def title_extract(url):
    """Download ``url`` and return the text of its headline.

    Returns ``str()`` of the node that follows the first
    ``<h1 class="headline-text">`` tag, or the string "NaN" when the page
    has no such tag.

    Named ``title_extract`` so that it no longer shadows the date scraper
    ``date_extract`` defined in the date cell of this notebook.
    """
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    try:
        return str(soup.find('h1', {'class': 'headline-text'}).next_element)
    except AttributeError:
        # soup.find() returned None: the page has no matching headline.
        return "NaN"


# Scrape one title per url and overwrite the "title" column with the result.
b = [title_extract(link) for link in h]

df["title"] = b

In [None]:
# Number of articles whose title was tagged with the "NaN" string.
(df["title"] == "NaN").sum()

14

Extract subtitles

In [None]:
# Refresh the list of article urls.
h = df["url"].tolist()

def subt_extract(url):
    """Download ``url`` and return the html of its first ``<p>`` tag as a string.

    Returns the string "indexerror" when the page has no ``<p>`` tag, and
    "NaN" if an AttributeError is raised while reading the paragraphs.
    """
    page = requests.get(url)
    parsed = bs4.BeautifulSoup(page.text, 'html.parser')
    try:
        return str(parsed.findAll('p')[0])
    except AttributeError:
        return "NaN"
    except IndexError:
        # Empty result set: the page contains no paragraph at all.
        return "indexerror"

# Scrape one subtitle per url and store the result as the "subtitle" column.
b = [subt_extract(link) for link in h]

df["subtitle"] = b


# Clean the scraped text from the html tags left over from the page source.
def clean(x):
    """Return ``x`` with every ``<...>`` markup tag removed.

    A tag is the shortest run of characters between "<" and the next ">".
    ``re.DOTALL`` lets that run span line breaks, so tags whose attributes
    wrap over several lines are removed as well. The pattern is a raw string
    to avoid invalid escape sequences.
    """
    return re.sub(r"<.*?>", "", x, flags=re.DOTALL)

# Strip the html tags from every scraped subtitle.
df["subtitle"] = df["subtitle"].map(clean)


Extract content of the article

In [None]:
def content_extract(url):
    """Download ``url`` and return its body paragraphs as one string.

    Every ``<p>`` tag except the first is kept; the resulting list of tags is
    converted with ``str()``, so the returned text still carries the html
    markup and the surrounding list brackets.

    Returns "NaN" / "indexerror" if an AttributeError / IndexError is raised
    while reading the paragraphs.
    """
    page = requests.get(url)
    parsed = bs4.BeautifulSoup(page.text, 'html.parser')
    try:
        return str(parsed.findAll('p')[1:])
    except AttributeError:
        return "NaN"
    except IndexError:
        return "indexerror"

# Scrape the content of every url in `h` (the list built in an earlier cell)
# and store the result as the "content" column.
b = [content_extract(link) for link in h]

df["content"] = b

# Clean the scraped text from the html tags left over from the page source.
# NOTE: the subtitle cell defines a helper with the same name; it is
# redefined here so that this cell can run on its own.
def clean(x):
    """Return ``x`` with every ``<...>`` markup tag removed.

    A tag is the shortest run of characters between "<" and the next ">".
    ``re.DOTALL`` lets that run span line breaks, so tags whose attributes
    wrap over several lines are removed as well. The pattern is a raw string
    to avoid invalid escape sequences.
    """
    return re.sub(r"<.*?>", "", x, flags=re.DOTALL)

# Strip the html tags, then the "[" and "]" characters that str() leaves
# around the list of paragraphs.
# regex=True is required: since pandas 2.0 `Series.str.replace` is literal by
# default, so a pattern such as r"\[" would look for a backslash followed by
# a bracket and never match.
df["content"] = (
    df["content"]
    .apply(clean)
    .str.replace(r"[\[\]]", "", regex=True)
)

Save results to xlsx for sharing and csv for further work

In [None]:
# csv copy for further work
df.to_csv(path_or_buf="file_name.csv")
# xlsx copy for sharing
df.to_excel(excel_writer="file_name.xlsx")