In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

In [51]:
# Function to check if a date is within the past 2 months (approximated as 60 days)
def within_past_2_months(date_string, now=None, window_days=60):
    """Return True if ``date_string`` is no older than ``window_days`` days.

    Parameters
    ----------
    date_string : str
        Date written as full month name, day and four-digit year,
        e.g. "September 27, 2023" (format "%B %d, %Y"). Leading and
        trailing whitespace is ignored.
    now : datetime, optional
        Reference moment the window is measured back from. Defaults to
        ``datetime.now()``; pass a fixed value for reproducible checks.
    window_days : int, optional
        Length of the window in days. Defaults to 60, i.e. two months
        approximated as 30 days each.

    Returns
    -------
    bool
        True when the date is on or after the cutoff day, False otherwise.

    Raises
    ------
    ValueError
        If ``date_string`` does not match the expected format.
    """
    reference = datetime.now() if now is None else now
    news_date = datetime.strptime(date_string.strip(), "%B %d, %Y").date()
    # Compare calendar dates, not datetimes: parsed dates sit at midnight, so a
    # cutoff carrying the current time of day would reject an article published
    # exactly on the boundary day for most of the day.
    cutoff = reference.date() - timedelta(days=window_days)
    return news_date >= cutoff

In [48]:
# Scrape the Amnesty International news listing page by page and collect every
# article that within_past_2_months() accepts. Scraping stops at the first
# article outside that window, or when a listing page cannot be fetched or
# contains no article teasers.
#
# Results accumulate in the module-level lists below. urls, dates, header_list
# and body_list are appended together, so entry i of each list describes the
# same article and they can be zipped safely afterwards.
page_html = []    # parsed HTML of every listing page fetched
urls = []         # article URLs
dates = []        # article dates as shown on the listing page
body_list = []    # per article: list of paragraph strings
header_list = []  # article headlines

# Request settings do not change between iterations, so build them once.
USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
headers = {"user-agent": USER_AGENT}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell

page_count = 0
stop_loop = False  # set once an article older than the window is reached

while not stop_loop:

    page_count += 1
    print(f"Page number: {page_count}") #Displaying page number

    page_URL = 'https://www.amnesty.org/en/news/page/{}/'.format(page_count)
    print(page_URL)
    page = requests.get(page_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    if not page.ok:
        # A failed listing request would otherwise parse as a page with no
        # articles and the loop would keep requesting further pages forever.
        print(f"Stopping: HTTP {page.status_code} for {page_URL}")
        break
    soup = BeautifulSoup(page.content, "html.parser")

    page_html.append(soup) #store page htmls

    # Parse only the page just fetched. Iterating over every stored page again
    # would re-request and re-store the articles of all earlier pages.
    newsletters = soup.find_all("article", class_="post postImage--small aimc-ignore")
    if not newsletters:
        print("Stopping: no articles found on this page")
        break

    for newsletter in newsletters:
        figure = newsletter.figure
        link_tag = figure.a if figure is not None else None
        date_tag = newsletter.find("span", class_="post-meta")
        if link_tag is None or date_tag is None or not link_tag.get("href"):
            # Teaser without a link or a date: nothing usable to scrape.
            continue

        link = link_tag["href"]
        date = date_tag.get_text().strip()

        print(f"Link: {link}")
        print(f"Date: {date}")

        # Stop as soon as an article falls outside the past 2 months
        if not within_past_2_months(date):
            stop_loop = True
            print("It is not within two months")
            break

        print("**else**")

        # Fetch the article itself and pull out headline and paragraphs.
        article_page = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
        article_soup = BeautifulSoup(article_page.content, "html.parser")
        contents = article_soup.find_all("section", class_="article has-sidebar") #html code for news contents

        for content in contents:
            title_tag = content.find("h1", class_="article-title")
            body_tag = content.find("article", class_="article-content")
            if title_tag is None or body_tag is None:
                continue

            header = title_tag.get_text().strip()
            # One string per <p> element of the article body.
            body_content = [paragraph.get_text().strip() for paragraph in body_tag.find_all('p')]

            print(f"< Header > {header}")
            print("< Body >")
            print("****************************************************************") # news division

            # Append all four fields together so the lists stay index-aligned.
            urls.append(link)
            dates.append(date)
            header_list.append(header)
            body_list.append(body_content)

    print(f"---------------------------Page {page_count}-------------------------------") #page division
  
    

Page number: 1
https://www.amnesty.org/en/news/page/1/
Link: https://www.amnesty.org/en/latest/news/2023/09/iraq-four-years-after-tishreen-protests-no-justice-for-state-and-militia-violence/
Date: September 27, 2023
**else**
< Header > Iraq: Four years after Tishreen protests, no justice for state and militia violence
< Body >
****************************************************************
Link: https://www.amnesty.org/en/latest/news/2023/09/india-government-weaponizing-terrorism-financing-watchdog-recommendations-against-civil-society/
Date: September 27, 2023
**else**
< Header > India: Government weaponizing terrorism financing watchdog recommendations against civil society
< Body >
****************************************************************
Link: https://www.amnesty.org/en/latest/news/2023/09/thailand-upcoming-verdict-in-case-of-murdered-indigenous-activist-billy-must-deliver-justice/
Date: September 26, 2023
**else**
< Header > Thailand: Upcoming verdict in case of murdered I

## Convert the scraped data to a DataFrame and save it as a CSV file

In [37]:
# Inspect the list of paragraph strings stored for the second scraped article.
body_list[1]

['Download',
 'Indian authorities are exploiting the recommendations of a global terrorism financing and money laundering watchdog to target civil society groups and activists and deliberately hinder their work, said Amnesty International in a new briefing released today.',
 '“Weaponizing counter-terrorism: India’s exploitation of terrorism financing assessments to target civil society” reveals how the recommendations of the Financial Action Task Force (FATF)—a global body responsible for tackling terrorism financing and money laundering—have been abused by the Indian authorities to bring in draconian laws in a coordinated campaign to stifle the non-profit sector. These laws are in turn used to bring terrorism-related charges and, amongst other things, to prevent organizations and activists from accessing essential funds.',
 '“Under the guise of combatting terrorism, the Indian government has leveraged the Financial Action Task Force’s recommendations to tighten its arsenal of financia

In [49]:
# Build one DataFrame row per article from the four parallel result lists.
# zip() stops at the shortest list, so unmatched trailing entries are dropped.
df = pd.DataFrame(
    data=list(zip(urls, dates, header_list, body_list)),
    columns=['URL', 'Date', 'Header', 'Body'],
)

# Preview the first three rows.
df.iloc[:3]

Unnamed: 0,URL,Date,Header,Body
0,https://www.amnesty.org/en/latest/news/2023/09...,"September 27, 2023","Iraq: Four years after Tishreen protests, no j...","[The Iraqi authorities must ensure truth, just..."
1,https://www.amnesty.org/en/latest/news/2023/09...,"September 27, 2023",India: Government weaponizing terrorism financ...,"[Download, Indian authorities are exploiting t..."
2,https://www.amnesty.org/en/latest/news/2023/09...,"September 26, 2023",Thailand: Upcoming verdict in case of murdered...,[Ahead of the expected verdict on Thursday (28...


In [50]:
# Write the scraped articles to disk; the row index is kept as the first CSV column.
df.to_csv(path_or_buf="news_scraped.csv", index=True)