# GETTING THE LINKS OF REPORTS

In [18]:
import requests
from bs4 import BeautifulSoup as bs
import datetime
from dateutil.rrule import rrule, DAILY
import pandas as pd

In [14]:
# Collect the article links listed in the daily print archive for every day of
# the date range, then save them to an Excel sheet.
#
# `link` and `published_dates` are parallel lists: entry i of one belongs to
# entry i of the other. A day whose archive page has too few sections
# contributes one empty-string placeholder to both lists.
link = []
published_dates = []
seen_links = set()  # O(1) duplicate check instead of scanning `link` each time

# Setting start and end dates for the date range (both inclusive)
date = datetime.date(2023, 1, 1)
end = datetime.date(2023, 12, 31)

# The endpoint is identical for every day; only the POST payload changes.
url = 'https://www.thenews.com.pk/todaypaper-archive'
retries = 3  # Maximum number of attempts per day
request_timeout = 30  # Seconds; without a timeout a stalled connection blocks forever

# Looping through each day in the date range
for current_date in rrule(DAILY, dtstart=date, until=end):
    post_params = {
        'filter_archive_date': current_date.strftime('%Y-%m-%d'),
        'submit_archive': 'Submit',
    }

    for attempt in range(retries):
        # Only the network call is retried. The previous handler also named
        # `bs4.*` attributes, but `bs4` is never bound in this file (only
        # `BeautifulSoup as bs` is imported), so any failure ended in NameError.
        try:
            response = requests.post(url, data=post_params, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Attempt {attempt+1} failed:", e)
            continue

        soup = bs(response.text, 'html.parser')
        data_list = soup.find_all("div", {"class": "print-top-story"})
        if len(data_list) > 4:
            # The links are taken from the fifth "print-top-story" section only.
            section = data_list[4]
            for anchor in section.find_all("a", {"class": "fc-item__link open-section"}):
                # .get() avoids a KeyError on anchors that carry no href.
                href = anchor.get('href')
                if href and href not in seen_links:
                    seen_links.add(href)
                    published_dates.append(current_date)
                    link.append(href)
        else:
            # Keep a placeholder row for days without the expected section.
            published_dates.append("")
            link.append("")
        break
    else:
        # The loop finished without `break`: every attempt failed.
        print("Max retries exceeded.")

df = pd.DataFrame({'links': link, 'dates': published_dates})
with pd.ExcelWriter('Project-The News.xlsx') as writer:  # Writing the DataFrame to an Excel file
    df.to_excel(writer, sheet_name='news')

# GETTING REPORTS USING THE LINKS

In [22]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [21]:
# Creating Function to extract report text from a given URL
def extract_report_text(url):
    """Download an article page and return the text of its ``.story-detail`` element.

    Args:
        url: Address of the article page. NaN/None and blank strings are
            treated as "no URL".

    Returns:
        The text of the first element matching ``.story-detail`` (each text
        fragment stripped of surrounding whitespace), or '' when there is no
        URL, the request fails, or no such element is found.
    """
    # Blank cells come back from Excel as NaN; blank strings are skipped too so
    # they never reach the network layer.
    if pd.isna(url) or not str(url).strip():
        return ''

    # Retry transient server errors with exponential backoff.
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])

    try:
        # The context manager closes the session's pooled connections; a new
        # session is created on every call, so leaving it open would leak them.
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=retries)
            session.mount('https://', adapter)
            session.mount('http://', adapter)  # plain-http links get the same retry policy
            # A timeout keeps one unresponsive page from stalling the whole run.
            response = session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

        report_element = soup.select_one('.story-detail')
        if report_element:
            return report_element.get_text(strip=True)
        return ''
    except (requests.RequestException, ValueError, AttributeError, IndexError, TypeError) as e:
        # Best effort: report the failure and let the caller continue.
        print(f"Error while extracting report text from {url}: {e}")
        return ''

# Load only the 'links' column of the workbook and discard rows whose link
# cell is empty (NaN)
links_df = pd.read_excel('Project-The News.xlsx', usecols=['links']).dropna()

# Fetch the report text for every remaining link, in file order
reports = [extract_report_text(article_url) for article_url in links_df['links']]

# Put the texts in a single 'reports' column and write them to the 'news' sheet
df = pd.DataFrame({'reports': reports})
with pd.ExcelWriter('Reports.xlsx') as xlsx_out:
    df.to_excel(xlsx_out, sheet_name='news')