# Download images from URLs

**Important**: This notebook should be run after `create-temp-data.ipynb` if the full year of data needs to be analyzed.

The notebook takes a JSON file as input and uses the `requests` module to download the header image of each article. For very large datasets, the process can take a long time and use a considerable amount of disk space on the local machine (up to xxx GB — TODO: replace with the measured size). The images are saved into a sub-folder of `/data` called `images`. The folder has been added to the `.gitignore`.

In [None]:
%pip install pandas requests

In [5]:
import requests
import imghdr
import requests.exceptions
import pandas as pd
import re
import os
from os import listdir
from os.path import isfile, join

In [None]:
# Relative path to the input dataset (resolved against the kernel's working directory)
input_filename = '../../input-data/temp-data.json'

In [None]:
# NOTE(review): the file carries a .json extension but is parsed as CSV; the recorded output
# (with its "Unnamed: 0" columns) suggests the content really is CSV - confirm against create-temp-data.ipynb
input_dataset = pd.read_csv(input_filename)
# Preview the first two rows
input_dataset.head(2)


Unnamed: 0.1,Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,...,pub_date,document_type,news_desk,section_name,byline,type_of_material,_id,word_count,uri,subsection_name
0,0,Wrestling with age and a case of idea theft.,https://www.nytimes.com/2024/09/01/business/he...,Wrestling with age and a case of idea theft.,"Send questions about the office, money, career...",BU,3.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Help! I’m ‘Older’ and on the Job Hun...,...,2024-09-01T04:01:07+0000,article,SundayBusiness,Business Day,"{'original': 'By Anna Holmes', 'person': [{'fi...",News,nyt://article/da8532bd-f9bd-5ca3-9e7e-afef6e9f...,1280,nyt://article/da8532bd-f9bd-5ca3-9e7e-afef6e9f...,
1,1,"Grueling shifts, abuse from the public and sub...",https://www.nytimes.com/2024/09/01/world/asia/...,"Grueling shifts, abuse from the public and sub...",Exhausted doctors resting in crowded on-call r...,A,4.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'Worked to the Bone, India’s Doctors ...",...,2024-09-01T04:01:25+0000,article,Foreign,World,{'original': 'By Anupreeta Das and Pragati K.B...,News,nyt://article/aeabc262-aeb0-5423-a7ac-8bb664cb...,1310,nyt://article/aeabc262-aeb0-5423-a7ac-8bb664cb...,Asia Pacific


In [3]:
# Number of rows in the loaded dataset
len(input_dataset)

48696

In [4]:
def multireplace(string, replacements, ignore_case=False):
    """
    Apply several substring replacements to a string in a single pass.
    :param str string: text in which the replacements are performed
    :param dict replacements: mapping {substring to look for: substring to put in its place}
    :param bool ignore_case: when True, substrings are matched regardless of their case
    :rtype: str
    """
    # Nothing to replace: an empty alternation would match everywhere and the lookup would fail
    if not replacements:
        return string

    # For case-insensitive matching, keys and matched text are both lower-cased so that
    # e.g. {"HEY": "lol"} also resolves "hey" and "hEy"; otherwise text is used unchanged
    if ignore_case:
        fold = lambda text: text.lower()
    else:
        fold = lambda text: text
    flags = re.IGNORECASE if ignore_case else 0

    lookup = {fold(old): new for old, new in replacements.items()}

    # Longest keys go first so that 'abc' wins over 'ab' when both could match at the same position
    alternatives = [re.escape(old) for old in sorted(lookup, key=len, reverse=True)]

    # One alternation regex covering every substring to replace
    matcher = re.compile("|".join(alternatives), flags)

    def substitute(match):
        # The matched text, normalised the same way as the keys, selects its replacement
        return lookup[fold(match.group(0))]

    return matcher.sub(substitute, string)

In [6]:
def download_images(urls, current_news_outlet):
    """
    Download the header image of every article into ``../../data/images/<current_news_outlet>/``.

    Each image is saved as ``<current_news_outlet>-<article id>.<extension>``, where the article id
    is the article URI without its ``nyt://...`` prefix and the extension is detected from the
    downloaded bytes with ``imghdr``.

    Entries without an image URL, failed requests, non-200 responses and payloads that are not
    recognised as an image are reported and skipped, so one bad entry does not abort the run.

    :param urls: iterable of ``(article uri, image url)`` pairs. For the ``'nytimes'`` outlet the
        image url is appended to ``https://www.nytimes.com/``; for any other outlet it is
        requested as given.
    :param str current_news_outlet: outlet name, used for the sub-folder and the file name prefix
    :rtype: None
    """
    target_dir = f'../../data/images/{current_news_outlet}'
    # exist_ok keeps the notebook re-runnable when the folder is already there
    os.makedirs(target_dir, exist_ok=True)

    saved = 0
    skipped = 0
    for url in urls:
        # Strip the URI scheme/prefix so the id is usable as a file name (no slashes)
        filenameID = multireplace(url[0], {'nyt://article/': '', 'nyt://video/': '', 'nyt://interactive/': '', 'nyt://audio/': '', 'nyt://': ''})

        image_url = url[1]
        # Missing values show up as None or, after a pandas round-trip, as float NaN
        if not isinstance(image_url, str) or not image_url:
            print(f"No image URL for {filenameID}")
            skipped += 1
            continue
        if current_news_outlet == 'nytimes':
            image_url = "https://www.nytimes.com/" + image_url

        try:
            # A timeout prevents a single stalled connection from blocking the whole run
            response = requests.get(image_url, timeout=30)
        except requests.exceptions.MissingSchema:
            print('URL is not complete')
            skipped += 1
            continue
        except requests.exceptions.RequestException as error:
            print(f"Download of {url} has failed: {error}")
            skipped += 1
            continue

        if response.status_code != 200:
            print(f"Download of {url} has failed")
            skipped += 1
            continue

        # NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13
        extension = imghdr.what(file=None, h=response.content)
        if extension is None:
            # Without this check the file would be written with a literal ".None" extension
            print(f"Download of {url} is not a recognised image")
            skipped += 1
            continue

        print(filenameID, extension)
        filename = f'{target_dir}/{current_news_outlet}-{filenameID}.{extension}'
        with open(filename, 'wb') as file:
            file.write(response.content)
        saved += 1

    print(f'Download of {current_news_outlet} finished: {saved} saved, {skipped} skipped')

In [None]:
# Pair every article URI with its image URL, then download the images for the nytimes outlet
uri_column = input_dataset.get('uri')
image_column = input_dataset.get('image')
list_of_urls = list(zip(uri_column, image_column))
download_images(list_of_urls, "nytimes")