<a href="https://colab.research.google.com/github/ol287/News_data_API/blob/main/pullinglivedata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
import pandas as pd
import os
import numpy as np
from urllib import request
from concurrent.futures import ThreadPoolExecutor, as_completed

# Define the API endpoint and parameters
api_url = "https://newsapi.org/v2/everything"
params = {
    # Search term sent as the `q` query parameter
    "q": "political",
    # SECURITY: prefer supplying the key through the NEWSAPI_KEY environment
    # variable. The literal below is kept only so existing runs behave the
    # same; a key committed to source should be treated as leaked and rotated.
    "apiKey": os.environ.get("NEWSAPI_KEY") or "3fc6bfd0e2134566ae743a5eee447ebd"
}

# Fetch the articles and load them into `df`; on any failure `df` is left as
# an empty DataFrame so the rest of the script can still run.
try:
    # Make the API request. The timeout keeps a stalled connection from
    # hanging the script indefinitely.
    response = requests.get(api_url, params=params, timeout=30)
    response.raise_for_status()  # Raise an error for bad status codes

    # Parse the JSON response
    data = response.json()

except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers a body that is not valid JSON on requests versions
    # whose decode error is not a RequestException subclass.
    print(f"An error occurred: {e}")
    df = pd.DataFrame()

else:
    # Extract relevant data into a DataFrame
    if "articles" in data:
        df = pd.DataFrame(data["articles"])
        print("Data successfully retrieved and stored in a DataFrame.")
    else:
        print("No articles found in the API response.")
        df = pd.DataFrame()

# Display the DataFrame
print(df.head())

print(df.shape)

print(df.columns)


# Blank out missing values: swap NaN for '' and fill whatever is still NA,
# then give every row a sequential integer `id` (0 .. len(df) - 1).
df = df.replace(np.nan, '', regex=True).fillna('')
df['id'] = range(len(df))

# Create the directory to store images. `exist_ok=True` makes this a no-op
# when the directory is already there, without a separate existence check
# that could race with another process creating it.
image_dir = "/newsapi/images"
os.makedirs(image_dir, exist_ok=True)

# Function to download a single image
def download_image(row, timeout=30):
    """Download the image referenced by *row* into ``image_dir``.

    Args:
        row: Mapping-like object providing ``"urlToImage"`` (the URL to
            fetch) and ``"id"`` (used as the file name, ``<id>.jpg``).
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        True if the image was fetched and written, False otherwise. No
        exception escapes for a failed download or a failed write.
    """
    image_url = row["urlToImage"]
    image_path = os.path.join(image_dir, f"{row['id']}.jpg")

    # Fetch the whole payload BEFORE opening the destination, so a failed
    # download never leaves an empty .jpg behind. The `with` closes the
    # HTTP response even when read() raises.
    try:
        with request.urlopen(image_url, timeout=timeout) as response:
            payload = response.read()
    except Exception:
        # Best-effort: any fetch problem (bad URL, HTTP error, timeout, ...)
        # is reported to the caller as a plain failure.
        return False

    try:
        with open(image_path, 'wb') as f:
            f.write(payload)
    except OSError:
        # Do not leave a truncated file on disk after a failed write.
        try:
            os.remove(image_path)
        except OSError:
            pass
        return False

    return True  # Success

# Function to process a single row in the dataframe
def process_row(index, row):
    """Try to download the row's image and report a failure by index.

    Returns *index* when the row has an image URL and ``download_image``
    reported failure (the caller uses this to drop the row); returns None
    when the download succeeded or the row has no URL ("" or "nan").
    """
    url = row["urlToImage"]

    # Nothing to download for a blank / placeholder URL.
    if url in ("", "nan"):
        return None

    return None if download_image(row) else index

# Download every row's image concurrently and collect the index of each row
# whose download failed.
failed_indices = []
with ThreadPoolExecutor(max_workers=10) as executor:  # max_workers sets the thread count
    # Map each submitted future back to the dataframe index it handles
    futures = {}
    for index, row in df.iterrows():
        futures[executor.submit(process_row, index, row)] = index

    # Handle results in completion order
    for future in as_completed(futures):
        index = futures[future]
        try:
            failed_index = future.result()
        except Exception as exc:
            print(f"An error occurred for row {index}: {exc}")
            continue
        if failed_index is not None:
            failed_indices.append(failed_index)

# Drop the rows whose image could not be downloaded, then renumber the index
df.drop(index=failed_indices, inplace=True)
df.reset_index(drop=True, inplace=True)

print("Downloaded all images.")
df.head()


Data successfully retrieved and stored in a DataFrame.
                                     source                author  \
0  {'id': 'the-verge', 'name': 'The Verge'}        Sean Hollister   
1          {'id': None, 'name': 'BBC News'}                  None   
2  {'id': 'the-verge', 'name': 'The Verge'}     Andrew J. Hawkins   
3          {'id': 'wired', 'name': 'Wired'}      Vittoria Elliott   
4               {'id': None, 'name': 'NPR'}  The Associated Press   

                                               title  \
0  DJI claims its decision to let drones fly in d...   
1  Johnson UK's most damaging PM, says Reform UK ...   
2  More Tesla showroom protests planned for this ...   
3  Elon Musk Lackeys Have Taken Over the Office o...   
4  Greenland bans foreign political donations as ...   

                                         description  \
0  DJI claims its update that lets drone owners f...   
1  Zia Yusuf tells the BBC's Political Thinking p...   
2  Protestors are plannin

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,id
0,"{'id': 'the-verge', 'name': 'The Verge'}",Sean Hollister,DJI claims its decision to let drones fly in d...,DJI claims its update that lets drone owners f...,https://www.theverge.com/2025/1/15/24344579/dj...,https://cdn.vox-cdn.com/thumbor/iUcyJLiZjgsz7B...,2025-01-15T20:24:42Z,DJI claims its decision to let drones fly in d...,0
1,"{'id': None, 'name': 'BBC News'}",,"Johnson UK's most damaging PM, says Reform UK ...",Zia Yusuf tells the BBC's Political Thinking p...,https://www.bbc.com/news/articles/cm23p33emrdo,https://ichef.bbci.co.uk/news/1024/branded_new...,2025-02-07T01:04:16Z,Joshua Nevett\r\nBoris Johnson will go down as...,1
2,"{'id': 'the-verge', 'name': 'The Verge'}",Andrew J. Hawkins,More Tesla showroom protests planned for this ...,Protestors are planning more demonstrations ou...,https://www.theverge.com/news/612912/tesla-pro...,https://platform.theverge.com/wp-content/uploa...,2025-02-14T15:04:22Z,Activists are calling for demonstrations at Te...,2
3,"{'id': 'wired', 'name': 'Wired'}",Vittoria Elliott,Elon Musk Lackeys Have Taken Over the Office o...,Sources tell WIRED that OPM’s top layers of ma...,https://www.wired.com/story/elon-musk-lackeys-...,https://media.wired.com/photos/67993c2aeed73a2...,2025-01-28T23:01:31Z,Among the new highers-up at OPM is Noah Peters...,3
4,"{'id': None, 'name': 'NPR'}",The Associated Press,Greenland bans foreign political donations as ...,"The bill is aimed at protecting ""Greenland's p...",https://www.npr.org/2025/02/05/g-s1-46534/gree...,https://npr.brightspotcdn.com/dims3/default/st...,2025-02-05T06:12:25Z,"NUUK, Greenland Greenland's parliament passed ...",4
