In [37]:
import threading
import queue
import requests

qe = queue.Queue()
ipv = []
lock = threading.Lock()

with open(r"ips.txt", "r") as f:
    ips = f.read().split("\n")
    for p in ips:
        qe.put(p)

def ipcheck():
    global q
    while True:
        with lock:
            if q.empty():
                break
            proxy = q.get()

        try:
            r = requests.get("https://ipinfo.io/json",
                               ips={"http": proxy, "https": proxy},
                               timeout=5)

            if r.status_code == 200:
                with lock:
                    ipv.append(proxy)
                print(f"Valid proxy: {proxy}")

        except requests.RequestException as e:
            print(f"Error with proxy {proxy}: {e}")
            continue

threads = []
for _ in range(10):
    thread = threading.Thread(target=ipcheck)
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

print(ipv)

[]


In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

def search_and_scrape_hrefs(url, keyword, starting_url, max_pages=1):
    all_hrefs = []

    for page in range(1, max_pages + 1):
        # extracing the urls of all articals 
        search_url = f"{url}?q={keyword}&page={page}"
        search_soup = get_soup(search_url)

        # Extract and return hrefs that start with the specified url from the search results
        hrefs = [urljoin(starting_url[keyword], a['href']) for a in search_soup.find_all('a', href=True) if a['href'].startswith(starting_url[keyword])]
        all_hrefs.extend(hrefs)

    return all_hrefs

def scrape_content(url):
    soup = get_soup(url)
    title = soup.find('title').text if soup.find('title') else 'Title not found'
    # Extract all dates, paragraphs, and image links
    date = soup.find('time', {'data-testid': 'timestamp'}).text.strip() if soup.find('time', {'data-testid': 'timestamp'}) else 'Date not found'
    paragraphs = [p.text.strip() for p in soup.find_all('p')]
    images = [urljoin(url, img['src']) for img in soup.find_all('img', src=True)]

    return title, url, date, paragraphs, images

def get_soup(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

# saving to csv file
def save_to_csv(data, csv_filename):
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['Title', 'URLs','Date', 'Paragraph', 'Images'])

        for row in data:
            csv_writer.writerow(row)

def main():
    target_url = "https://www.bbc.co.uk/search"  # target URL
    max_pages = 29  # total page number

    # search words and starting urls
    starting_urls = {
        "Pervez Musharraf": "https://www.bbc.co.uk/news/world",
        "Zulfikar Ali Bhutto": "https://www.bbc.co.uk/news/world",
        "Benazir Bhutto": "https://www.bbc.co.uk/news/world",
        "Nawaz Sharif": "https://www.bbc.co.uk/news/world",
        "Zafarullah Khan Jamali": "https://www.bbc.co.uk/news/world",
        "Yusuf Raza Gilani": "https://www.bbc.co.uk/news/world",
        "Imran Khan": "https://www.bbc.co.uk/news/world",
        "Shahid Khaqan Abbasi": "https://www.bbc.co.uk/news/world",
        "Shehbaz Sharif": "https://www.bbc.co.uk/news/world",
        "Shaukat Aziz": "https://www.bbc.co.uk/news/world",
        "Mamnoon Hussain": "https://www.bbc.co.uk/news/world",
        "Asif Ali Zardari": "https://www.bbc.co.uk/news/world",
        "Bilawal Bhutto Zardari": "https://www.bbc.co.uk/news/world",
    }

    all_data = []

    for keyword, starting_url in starting_urls.items():
        # searching and scraping hrefs for each keyword
        hrefs = search_and_scrape_hrefs(target_url, keyword, starting_urls, max_pages)

        # Printing the extracted hrefs 
        print(f"\n Scraped HREFs for {keyword}:")
        for i, href in enumerate(hrefs, 1):
            print(f"{i}. {href}")

            # Scraping content and storing in a list
            title, url, date, paragraphs, images = scrape_content(href)
            all_data.append([title, "".join(url),"".join(date), "\n".join(paragraphs), ", ".join(images)])

    # Save data to CSV file
    csv_filename = 'scraped_data.csv'
    save_to_csv(all_data, csv_filename)
    print(f"\nData saved to {csv_filename}")

if __name__ == '__main__':
    main()



 Scraped HREFs for Pervez Musharraf:
1. https://www.bbc.co.uk/news/world-asia-64555401
2. https://www.bbc.co.uk/news/world-asia-64528348
3. https://www.bbc.co.uk/news/world-64529074
4. https://www.bbc.co.uk/news/world-asia-india-61769226
5. https://www.bbc.co.uk/news/world-asia-22248479
6. https://www.bbc.co.uk/news/world-asia-21861989
7. https://www.bbc.co.uk/news/world-asia-50819772
8. https://www.bbc.co.uk/news/world-asia-27811673
9. https://www.bbc.co.uk/news/world-asia-26830780
10. https://www.bbc.co.uk/news/world-asia-26817341
11. https://www.bbc.co.uk/news/world-asia-26239352
12. https://www.bbc.co.uk/news/world-asia-26239958
13. https://www.bbc.co.uk/news/world-asia-25982497
14. https://www.bbc.co.uk/news/world-asia-25663987
15. https://www.bbc.co.uk/news/world-asia-25460336
16. https://www.bbc.co.uk/news/world-asia-24847160
17. https://www.bbc.co.uk/news/world-asia-24461468
18. https://www.bbc.co.uk/news/world-asia-23764633
19. https://www.bbc.co.uk/news/world-asia-23765062
2