In [14]:
import json
from urllib.parse import urlparse

In [15]:
def extract_domains(data):
    """
    Extract unique source domains from article URLs in the data
    """
    domains = set()
    for item in data:
        url = item.get('article_url', '')
        if url:
            domain = urlparse(url).netloc
            # Remove 'www.' prefix if present
            domain = domain.replace('www.', '')
            domains.add(domain)
    
    return sorted(list(domains))

In [16]:
with open("data/public_test_acm.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'data/public_test_acm.json'

In [5]:
domains = extract_domains(data)

In [6]:
domains

['',
 'abc30.com',
 'arabnews.com',
 'arkansasonline.com',
 'bbc.com',
 'bloomberg.com',
 'cbc.ca',
 'chinese.uhrp.org',
 'cn.nytimes.com',
 'cnnphilippines.com',
 'commons.wikimedia.org',
 'cpr.org',
 'denverpost.com',
 'dvidshub.net',
 'edition.cnn.com',
 'en.wikipedia.org',
 'english.aawsat.com',
 'georgeherald.com',
 'gettyimages.de',
 'gettyimages.fi',
 'google.com',
 'irishnews.com',
 'lonelyplanet.com',
 'm.lasvegassun.com',
 'military.com',
 'mirror.co.uk',
 'mprnews.org',
 'nbcnews.com',
 'news.cgtn.com',
 'npr.org',
 'nytimes.com',
 'phys.org',
 'reuters.com',
 'snopes.com',
 'sputniknews.com',
 'theconversation.com',
 'theguardian.com',
 'theknow.denverpost.com',
 'thescottishsun.co.uk',
 'voanews.com',
 'washingtonpost.com',
 'wbur.org',
 'youwantedalist.com']

In [None]:
# Import necessary libraries
import os
import json
import requests
import shutil
import imghdr
from PIL import Image
from tqdm import tqdm
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# The function you provided for downloading images
def download_and_save_image(image_url, save_folder_path, file_name):
    try:
        response = requests.get(image_url, stream=True, timeout=(60, 60))
        if response.status_code == 200:
            response.raw.decode_content = True
            image_path = os.path.join(save_folder_path, file_name)
            with open(image_path, 'wb') as f:
                shutil.copyfileobj(response.raw, f)
            if imghdr.what(image_path) and imghdr.what(image_path).lower() == 'png':
                img_fix = Image.open(image_path)
                img_fix.convert('RGB').save(image_path)
            return 1 
        else:
            print(f"Failed to download {image_url}: HTTP status {response.status_code}")
            return 0
    except Exception as e:
        print(f"Error downloading {image_url}: {str(e)}")
        return 0

# Function to get the filename from html_path and change extension to jpg
def get_image_filename(html_path):
    if not html_path:
        return None
    
    # Extract the base filename from the html_path
    base_name = os.path.basename(html_path)
    # Change extension to jpg
    if '.' in base_name:
        image_filename = os.path.splitext(base_name)[0] + '.jpg'
    else:
        image_filename = base_name + '.jpg'
    
    return image_filename

# Process a single inverse annotation file
def update_inverse_file(inverse_file_path):
    try:
        # Read the inverse annotation file
        with open(inverse_file_path, 'r', encoding='utf-8') as f:
            inverse_data = json.load(f)
        
        # Get the folder where the inverse file is located
        folder_path = os.path.dirname(inverse_file_path)
        
        # Track if we made any changes
        changes_made = False
        download_tasks = []
        
        # Process all_matched_captions
        if "all_matched_captions" in inverse_data:
            for i, caption in enumerate(inverse_data["all_matched_captions"]):
                if "image_link" in caption and "image_path" not in item:
                    # Get filename from html_path or use default
                    image_filename = get_image_filename(caption.get("html_path"))
                    
                    if not image_filename:
                        image_filename =  f"{i}.jpg"
                    
                    # Add to download tasks
                    download_tasks.append({
                        "url": caption["image_link"],
                        "folder": folder_path,
                        "filename": image_filename,
                        "entry": caption
                    })
        
        # Process matched_no_text
        if "matched_no_text" in inverse_data:
            for i, item in enumerate(inverse_data["matched_no_text"]):
                if "image_link" in item and "image_path" not in item:
                    # Get filename from html_path or use default
                    image_filename = get_image_filename(item.get("html_path")) or f"no_text_{i}.jpg"
                    
                    # Add to download tasks
                    download_tasks.append({
                        "url": item["image_link"],
                        "folder": folder_path,
                        "filename": image_filename,
                        "entry": item
                    })
        
        # Download images and update entries
        for task in download_tasks:
            success = download_and_save_image(task["url"], task["folder"], task["filename"])
            if success:
                task["entry"]["image_path"] = os.path.join(task["folder"], task["filename"])
                changes_made = True
        
        # Save the updated inverse file if changes were made
        if changes_made:
            with open(inverse_file_path, 'w', encoding='utf-8') as f:
                print(inverse_data)
                json.dump(inverse_data, f, indent=4, ensure_ascii=False)
            return True
        
        return False
    
    except Exception as e:
        logger.error(f"Error processing file {inverse_file_path}: {str(e)}")
        return False

# Find all folders in test directory
def get_test_folders(test_dir):
    return [os.path.join(test_dir, folder) for folder in os.listdir(test_dir) 
            if os.path.isdir(os.path.join(test_dir, folder))]

# Main execution cell
# Define the base test directory
test_dir = "queries_dataset/merged_balanced/inverse_search/test"

# Get all folders in the test directory
test_folders = get_test_folders(test_dir)
print(f"Found {len(test_folders)} folders to process")

# Process each folder
success_count = 0
fail_count = 0

for folder in tqdm(test_folders, desc="Processing folders"):
    # Look for inverse_annotation.json in the folder
    inverse_file = os.path.join(folder, "inverse_annotation.json")
    print(inverse_file)
    if os.path.exists(inverse_file):
        result = update_inverse_file(inverse_file)
        if result:
            success_count += 1
        else:
            fail_count += 1
    else:
        print(f"No inverse_annotation.json found in {folder}")
        fail_count += 1
    
    # if success_count == 2: 
    #     break
print(f"\nProcessing complete!")
print(f"Successfully updated: {success_count} folders")
print(f"Failed or no changes needed: {fail_count} folders")

Found 4054 folders to process


Processing folders:   0%|          | 0/4054 [00:00<?, ?it/s]

queries_dataset/merged_balanced/inverse_search/test\0\inverse_annotation.json
queries_dataset/merged_balanced/inverse_search/test\1\inverse_annotation.json
queries_dataset/merged_balanced/inverse_search/test\10\inverse_annotation.json
queries_dataset/merged_balanced/inverse_search/test\100\inverse_annotation.json


Processing folders:   0%|          | 4/4054 [00:01<30:13,  2.23it/s]

{'entities': ['Delicious Deli', 'Car', 'Traffic', 'Luxury car', 'Motor vehicle', 'Sport utility vehicle', 'Van', 'Street', 'Compact car', 'Pedestrian', 'Parking', 'Transport', 'Road', 'Clydebank', 'asphalt'], 'all_matched_captions': [{'page_link': 'https://www.bbc.com/news/uk-scotland-glasgow-west-35892821', 'image_link': 'https://ichef.bbci.co.uk/news/640/cpsprodpb/AB74/production/_88929834_paige_deligood.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\100\\0.txt', 'title': 'Paige Doherty death: Man arrested in Clydebank area - BBC News', 'content': 'Paige Doherty was last seen near the Delicious Deli in Clydebank on Saturday morning\n\nA man has been arrested in connection with the death of 15-year-old Paige Doherty whose body was found in a wooded area in Clydebank.\n\nThe teenager was last seen near a deli on Saturday morning after setting off for her part-time hairdressing job.\n\nHer body was found just off Great Western Road at lunchtime on Monday.\n\

Processing folders:   0%|          | 6/4054 [00:18<4:09:05,  3.69s/it]

Error downloading https://www.reshareit.com/wp-content/uploads/man-who-passed-obscene-comments-e1440392533217.jpg: Exceeded 30 redirects.
{'entities': ['Jasleen Kaur harassment controversy', 'Eve teasing', 'Social media', 'Harassment', 'Abuse', 'Media', 'Sonakshi Sinha', 'car'], 'all_matched_captions': [{'page_link': 'https://www.scoopwhoop.com/inothernews/sonakshi-sinha-apology-accused-jasleen-kaur-case/', 'image_link': 'https://image.scoopwhoop.com/w360/s3.scoopwhoop.com/anj/41587503.jpg.webp', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1001\\10.txt', 'title': 'Sonakshi Sinha Tweets An Apology To The Accused In The Jasleen Kaur Case - ScoopWhoop', 'content': 'During the past one week, social media has been abuzz with controversy surrounding the Jasleen Kaur case, where she was allegedly molested by Sarvjeet Singh at a traffic light in Delhi. We saw furore on Facebook over Jasleen’s post where she claimed that she had been molested and thereafter threatened 

Processing folders:   0%|          | 8/4054 [00:20<3:09:45,  2.81s/it]

{'entities': ['Switzerland', 'Tourism', 'Berggasthaus Ascher', 'Hotel', 'Nuitee', 'Metropolitan area', 'Der Handschlag', 'A.N. Watson', 'vacation', 'City', 'metropolitan area'], 'all_matched_captions': [], 'matched_no_text': [{'page_link': 'https://amp.cincinnati.com/amp/28263615', 'image_link': 'https://www.gannett-cdn.com/media/2015/04/23/USATODAY/USATODAY/635654017152427902-EPA-USA-MIAMI-WEATHER.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1003\\0.txt', 'title': '', 'content': '', 'image_path': 'queries_dataset/merged_balanced/inverse_search/test\\1003\\0.jpg'}, {'page_link': 'https://www.gannett-cdn.com/sitemaps/USAT/web/web-sitemap-2015-05.xml', 'image_link': 'https://www.gannett-cdn.com/media/2015/04/23/USATODAY/USATODAY/635654017152427902-EPA-USA-MIAMI-WEATHER.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1003\\1.txt', 'title': '', 'content': '', 'image_path': 'queries_dataset/merged_balanced/inverse_search/test\\1003\

Processing folders:   0%|          | 9/4054 [00:21<2:39:09,  2.36s/it]

{'entities': ['Army officer', 'Military uniform', 'Prime minister', 'Human rights', 'Veteran', 'State visit', 'Minister', 'BBC News', 'Spokesperson', 'official'], 'all_matched_captions': [{'page_link': 'https://www.bbc.com/news/uk-31716270', 'image_link': 'https://ichef.bbci.co.uk/news/640/mcs/media/images/81371000/jpg/_81371001_4132af17-ca87-4732-b88d-5f8d5e21f400.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1004\\0.txt', 'title': 'Mexican president begins UK state visit - BBC News', 'content': 'Mexican president Enrique Pena Nieto and David Cameron will speak about human rights during the visit\n\nThe Queen has welcomed the president of Mexico to the UK at the beginning of a three-day state visit.\n\nEnrique Pena Nieto and his wife, Angelica Rivera, joined the monarch, the Duke of Edinburgh and Prime Minister David Cameron in London.\n\nAs Mr Pena Nieto and Ms Rivera arrived a royal salute of 41 guns was fired by the King\'s Troop Royal Horse Artillery.

Processing folders:   0%|          | 10/4054 [00:22<2:09:09,  1.92s/it]

{'entities': ['Mary Rose Museum', 'Mary Rose', 'Army officer', 'Museum', 'HMS Duncan', 'Crew', 'House of Tudor', 'Veteran', 'Bodyguard', 'Price', 'United States Naval Sea Cadet Corps', 'Portsmouth', 'Prince Philip, Duke of Edinburgh', 'official'], 'all_matched_captions': [{'page_link': 'https://www.bbc.com/news/uk-england-hampshire-22691645', 'image_link': 'https://ichef.bbci.co.uk/news/464/mcs/media/images/67900000/jpg/_67900425_hi018159601.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1005\\0.txt', 'title': 'Mary Rose museum opens in Portsmouth at cost of £35m - BBC News', 'content': 'A volley of flaming arrows was fired from Southsea Castle during the museum\'s opening-day events\n\nRoyal Navy Sea Cadets lowered the Tudor standard to mark the museum\'s official opening\n\nCrew members from HMS Duncan carry the original ship\'s bell to the The Mary Rose Museum during the opening ceremony\n\nCrew from HMS Duncan laid a wreath at the wreck site of the Mary

Processing folders:   0%|          | 11/4054 [00:26<2:52:08,  2.55s/it]

Failed to download https://cdn.prod.www.spiegel.de/images/669ea8c8-0001-0004-0000-000000860671_w180_r1.33_fpx35.68_fpy50.jpg: HTTP status 404
queries_dataset/merged_balanced/inverse_search/test\1007\inverse_annotation.json
queries_dataset/merged_balanced/inverse_search/test\1008\inverse_annotation.json


Processing folders:   0%|          | 13/4054 [00:29<2:18:00,  2.05s/it]

queries_dataset/merged_balanced/inverse_search/test\1009\inverse_annotation.json
queries_dataset/merged_balanced/inverse_search/test\101\inverse_annotation.json
queries_dataset/merged_balanced/inverse_search/test\1010\inverse_annotation.json
queries_dataset/merged_balanced/inverse_search/test\1011\inverse_annotation.json
queries_dataset/merged_balanced/inverse_search/test\1012\inverse_annotation.json
Failed to download http://upload.wikimedia.org/wikipedia/commons/6/6e/Siphonocryptus_zigzag_holotype.jpg: HTTP status 403
Failed to download http://upload.wikimedia.org/wikipedia/commons/6/6e/Siphonocryptus_zigzag_holotype.jpg: HTTP status 403
Failed to download http://upload.wikimedia.org/wikipedia/commons/6/6e/Siphonocryptus_zigzag_holotype.jpg: HTTP status 403
Failed to download https://upload.wikimedia.org/wikipedia/commons/6/6e/Siphonocryptus_zigzag_holotype.jpg: HTTP status 403


Processing folders:   0%|          | 18/4054 [00:48<3:31:21,  3.14s/it]

Failed to download https://upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Siphonocryptus_zigzag_holotype.jpg/240px-Siphonocryptus_zigzag_holotype.jpg: HTTP status 403
{'entities': ['Siphonocryptida', 'Order', 'Colobognatha', 'Taxonomy', 'Polyzoniida', 'Platydesmida', 'Siphonophorida', 'Siphoniulus', 'Class', 'Millipedes', 'Myriapoda', 'Arthropod', 'Helminthomorpha', 'Chilognatha', 'Ecdysozoa', 'orden siphonocryptida'], 'all_matched_captions': [{'page_link': 'https://www.theguardian.com/science/2011/may/08/siphonocryptus-zigzag-new-to-nature', 'image_link': 'https://i.guim.co.uk/img/static/sys-images/Observer/Columnist/Columnists/2011/5/4/1304526228350/zigzag-007.jpg?width=300&quality=45&auto=format&fit=max&dpr=2&s=1f767aa5740f25865bea8045338dc3a7', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1012\\3.txt', 'title': 'New to Nature No 41: Siphonocryptus zigzag | Zoology | The Guardian', 'content': 'Siphonocryptus zigzag is an unusual and beautifully coloured m

Processing folders:   0%|          | 19/4054 [01:07<6:03:37,  5.41s/it]

{'entities': ['Wallpaper', 'Image', 'Mammal', 'Marine mammal', 'Walrus attack', 'Walruses', 'Photograph', 'Tusk', 'Pacific walrus', 'Hauling-out', 'Photography', 'Walrus', 'female walrus'], 'all_matched_captions': [{'page_link': 'https://www.usgs.gov/media/images/walrus-female-and-calf-close-side', 'image_link': 'https://prd-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/styles/atom_page_medium/public/walrus_pup_side_2010_Norseman_Sarah_Sonsthagen.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1013\\0.txt', 'title': 'Walrus Female And Calf Up Close From The Side | U.S. Geological Survey', 'content': 'Walruses in the Chukchi Sea during a tagging survey onboard the Norseman II in June 2010.\n\nPublic Domain.', 'caption': {'alt_node': 'Walrus Female And calf Up Close From The Side'}}, {'page_link': 'https://www.livescience.com/15664-walrus-photos-arctic-sea.html', 'image_link': 'https://cdn.mos.cms.futurecdn.net/9xBtDubWvwFvUshDXygPh8-

Processing folders:   1%|          | 21/4054 [01:08<4:29:41,  4.01s/it]

{'entities': ['Hightown Barracks', 'Royal Welsh', 'Regiment', 'Soldier', 'Infantry', 'Battalion', 'Military rank', 'Military police', 'Non-commissioned officer', 'Military organization', 'Troop', 'Fusilier', 'Marines', 'army'], 'all_matched_captions': [{'page_link': 'https://www.bbc.co.uk/news/av/uk-wales-north-east-wales-23477394/royal-welsh-regiment-s-parade-through-flint', 'image_link': 'https://ichef.bbci.co.uk/news/1024/media/images/68981000/jpg/_68981332_regiment512.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1015\\1.txt', 'title': "Royal Welsh regiment's parade through Flint - BBC News", 'content': 'This video can not be played\n\nFlintshire Council has marked the fourth anniversary of granting the freedom of the county to the regiment of the Royal Welsh with a celebratory parade through Flint.\n\nCrowds turned out to applaud the regiment, led by the volunteer band of the Mercian Regiment, through the streets from the Royal British Legion.\n\nIt a

Processing folders:   1%|          | 22/4054 [01:09<3:52:07,  3.45s/it]

Failed to download https://i0.wp.com/s1.ibtimes.com/sites/www.ibtimes.com/files/2015/11/15/gettyimages-497133846.jpg?w=215: HTTP status 400
queries_dataset/merged_balanced/inverse_search/test\1017\inverse_annotation.json


Processing folders:   1%|          | 23/4054 [01:12<3:47:53,  3.39s/it]

queries_dataset/merged_balanced/inverse_search/test\1018\inverse_annotation.json


Processing folders:   1%|          | 24/4054 [01:13<3:12:59,  2.87s/it]

{'entities': ['Canal', 'River', 'Watercourse', 'Water resources', 'Wetland', 'Towpath', 'Bayou', 'Reservoir', 'Floodplain', 'Water', 'Lough', 'Detective', 'Pond', 'Fire', 'river'], 'all_matched_captions': [{'page_link': 'https://www.bbc.com/news/uk-england-lancashire-32188627', 'image_link': 'https://ichef.bbci.co.uk/news/640/mcs/media/images/82135000/jpg/_82135689_1canal.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1018\\0.txt', 'title': "Blackburn towpath man's fire death investigated - BBC News", 'content': 'The man was found on fire on a canal towpath on Bolton Road in Blackburn\n\nA man has died after being found on fire on a canal towpath in Blackburn.\n\nThe victim died close to the former Moorings pub on Bolton Road, on Saturday afternoon.\n\nPolice and firefighters are both investigating his death, which Lancashire Police said was being treated as "unexplained".\n\nThe man, who was believed to be in his late 50s, was confirmed dead at the scene, 

Processing folders:   1%|          | 26/4054 [01:14<2:09:15,  1.93s/it]

{'entities': ['Chicken', 'Poultry', 'Chicken', 'Duck', 'Fresh Chicken', 'Cooking', 'Animal source foods', 'Juicer', 'Vegetarianism', 'meat'], 'all_matched_captions': [], 'matched_no_text': [{'page_link': 'https://www.forbes.com/sites/judystone/2015/12/10/new-superbug-resistant-to-all-antibiotics-linked-to-imported-meat/', 'image_link': 'https://thumbor.forbes.com/thumbor/fit-in/1200x0/filters%3Aformat%28jpg%29/https%3A%2F%2Fspecials-images.forbesimg.com%2Fimageserve%2Fe937372bcfe1da11af9f0014c2589dfb%2F0x0.jpg%3Ffit%3Dscale', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\102\\0.txt', 'title': 'New Superbug Resistant To All Antibiotics Linked To Imported Meat', 'content': 'ByJudy Stone\n\n, Senior Contributor.\n\nWe’re one giant step closer to the end of antibiotics.\n\nWorkers prepare chickens at the AIA factory in San Martino (AP Photo/Antonio Calanni)\n\nJust last month, Yi-Yun Liu’s team discovered the mcr-1 gene, which conveys resistance to colistin, an anti

Processing folders:   1%|          | 28/4054 [01:15<1:33:22,  1.39s/it]

{'entities': ['Brickland Court', 'Public utility', 'Crime scene', 'Construction worker', 'Person accused of a crime', 'Crime', 'Service', 'Stabbing', 'Labourer', 'Edmonton', 'Edmonton Green', 'London', 'security'], 'all_matched_captions': [{'page_link': 'https://www.theguardian.com/uk-news/2014/dec/15/four-boys-arrested-man-stabbed-to-death-edmonton-london', 'image_link': 'https://i.guim.co.uk/img/static/sys-images/Guardian/Pix/pictures/2014/12/15/1418647426839/Crime-scene-where-52-year-010.jpg?width=445&quality=45&auto=format&fit=max&dpr=2&s=d6111cd0842c4d2aae5c0da2802630c0', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1021\\0.txt', 'title': 'North London stabbing: five boys arrested after row over access to party\r\n | London | The Guardian', 'content': "Five boys aged 13 and 14 were arrested on suspicion of murder after a 52-year-old man was stabbed to death in north London when a row over access to a party “escalated into a shocking act of violence”.\n\nPo

Processing folders:   1%|          | 31/4054 [01:21<1:53:22,  1.69s/it]

{'entities': ['Syria', 'Makhmur', 'Ramadi', 'Afghanistan', 'Battle of Mosul', 'Fall of Mosul', 'Libya', 'Radio Free Europe', 'Kurds', 'Iraq', 'wall'], 'all_matched_captions': [{'page_link': 'https://www.dailystar.co.uk/pics/pictures/gallery/isis-run-fall-mosul-death-18679995', 'image_link': 'https://i2-prod.dailystar.co.uk/incoming/article20475502.ece/ALTERNATES/s1227b/httpscdnimagesdailystarcoukdynamic122photos587000900x738417587', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1024\\0.txt', 'title': 'ISIS on the run! With the fall of Mosul, is the death cult in retreat? - Daily Star', 'content': '', 'caption': {'caption_node': 'Iraqi soldiers patrol a suburb close to Jweibah, east of the city of Ramadi, 4 Feb 2016', 'alt_node': 'Iraqi soldiers patrol a suburb close to Jweibah, east of the city of Ramadi, 4 Feb 2016'}, 'image_path': 'queries_dataset/merged_balanced/inverse_search/test\\1024\\0.jpg'}, {'page_link': 'https://www.azathabar.com/a/photos-week-5-2016/

Processing folders:   1%|          | 32/4054 [01:36<4:25:38,  3.96s/it]

Failed to download https://www.voanews.com/s3/files/styles/google_amp_1280x720/s3/2019-04/DF111F58-0545-42D6-BF31-54EB8DE09E52.jpg?itok=gNhFCxL0: HTTP status 404
queries_dataset/merged_balanced/inverse_search/test\1026\inverse_annotation.json


Processing folders:   1%|          | 33/4054 [01:40<4:30:52,  4.04s/it]

{'entities': ['Nairobi', 'Foreign minister', 'Kenya Defence Forces', 'Shopping Centre', 'USA Today', 'Islamic extremism', 'Interior minister', 'Hostage', 'Minister', 'Monday', 'Kenya', 'supermarket'], 'all_matched_captions': [], 'matched_no_text': [{'page_link': 'https://www.usatoday.com/videos/embed/2853175/?placement=mobileweb-amp&cst=news%2Fnational&ssts=news%2Fnation&series=&keywords=Barack+Obama%2CSt.+Louis+Cardinals%2CUnited+States%2CMike+Matheny%2CKenya%2CU.S.+Navy%2CChina', 'image_link': 'http://videos.usatoday.net/Brightcove2/29906170001/2013/09/29906170001_2685667653001_thumb-b717dbd41cb32f203e0f4a5047008483.jpg?pubId=29906170001', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1026\\4.txt', 'title': 'Raw: Video shows early moments of Kenya attack', 'content': '', 'image_path': 'queries_dataset/merged_balanced/inverse_search/test\\1026\\4.jpg'}, {'page_link': 'https://www.usatoday.com/videos/news/world/2013/09/23/2853175/', 'image_link': 'http://videos.

Processing folders:   1%|          | 36/4054 [01:49<3:57:11,  3.54s/it]

{'entities': ['Channel Master CM-4001HDBW Flatenna Duo 35 Indoor Antenna', 'Antenna', 'Channel Master', 'TV Aerial', 'Channel Master', 'Indoor TV Aerial', 'Channel Master Flatenna Duo Ultra-Thin Indoor TV Antenna 35 Mile Range -...', 'Channel Master CM-4001HD', 'Cable television', 'Terrestrial television', 'High-definition television', 'Indoor antenna', 'Digital Video Recorder', 'Computer', 'computer accessory'], 'all_matched_captions': [], 'matched_no_text': [{'page_link': 'http://soundadvicenews.com/category/antennas/', 'image_link': 'http://soundadvicenews.com/wp-content/uploads/2016/07/FLATennaF.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1029\\2.txt', 'title': 'Security Verification', 'content': '', 'image_path': 'queries_dataset/merged_balanced/inverse_search/test\\1029\\2.jpg'}, {'page_link': 'https://theofy.world/gallery/flatenna-35', 'image_link': 'https://media.marketwire.com/attachments/201408/MOD-270664_FLATenna.jpg', 'html_path': 'queries_da

Processing folders:   1%|          | 41/4054 [01:51<2:08:59,  1.93s/it]

queries_dataset/merged_balanced/inverse_search/test\1034\inverse_annotation.json
Failed to download https://popphoto.com/app/uploads/2019/01/17/VTCMB2LM2Z25FHAWLTNEHSIJFI-1024x683.jpg: HTTP status 404
Failed to download https://popphoto.com/app/uploads/2019/01/17/VTCMB2LM2Z25FHAWLTNEHSIJFI-1024x683.jpg: HTTP status 404
Failed to download https://popphoto.com/app/uploads/2019/01/17/VTCMB2LM2Z25FHAWLTNEHSIJFI-1024x683.jpg: HTTP status 404
Failed to download http://www.the-terrace.co.uk/wp-content/uploads/2015/01/4325543_orig.jpg: HTTP status 404
Error downloading https://www.ncptt.nps.gov/wp-content/uploads/Cherokee_suntree_canon_sm.png: HTTPSConnectionPool(host='www.ncptt.nps.gov', port=443): Max retries exceeded with url: /wp-content/uploads/Cherokee_suntree_canon_sm.png (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001FE1E8C8940>: Failed to resolve 'www.ncptt.nps.gov' ([Errno 11001] getaddrinfo failed)"))
Error downloading https://qigongconnection.

Processing folders:   1%|          | 42/4054 [02:10<4:30:00,  4.04s/it]

{'entities': ['Ken Lamb Tree Service, LLC', 'Tree', 'Light', 'Tree farm', 'Shade', 'Trunk', 'Candle', 'Weeping fig', 'Old-growth forest', 'Canopy', 'Heritage tree', 'Forest', 'Light pillar', 'Photography', 'Celtic sacred trees', 'tree and light'], 'all_matched_captions': [{'page_link': 'https://www.popphoto.com/news/2010/11/trunk-show-juan-pons/', 'image_link': 'https://popphoto.com/app/uploads/2019/01/17/VTCMB2LM2Z25FHAWLTNEHSIJFI-1024x683.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1034\\4.txt', 'title': 'Trunk Show: JUAN A. PONS | Popular Photography', 'content': 'Once isolated, a group of ficus trees now shade a seven-lane Puerto Rican highway.\n\nBy\n                              \n            Russell Hart / American Photo\n          \n                \n              |\n    \n\n    Published Nov 10, 2010 1:13 AM EST\n\nBased in North Carolina, Pons is co-founder of The Digital Photography Experience, an online digital learning center, and leads work

Processing folders:   1%|          | 43/4054 [02:31<7:34:19,  6.80s/it]

{'entities': ['Cherry blossom', 'National Cherry Blossom Festival', 'Sakura', 'Sakura', 'Image', 'Blossom', 'Sweet Cherry', 'Painting', 'Spring', 'Cherry', 'Cherry blossom'], 'all_matched_captions': [{'page_link': 'https://lat.rtrs.tv/foto/foto.php?id=32727&glry=2683', 'image_link': 'https://lat.rtrs.tv/_FOTO/glrz/0327/032727.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1035\\4.txt', 'title': 'RTRS', 'content': '', 'caption': {'alt_node': 'Јapanska trešnja u cvatu (Foto:Profimedia.rs)'}, 'image_path': 'queries_dataset/merged_balanced/inverse_search/test\\1035\\4.jpg'}, {'page_link': 'https://paintingvalley.com/famous-cherry-blossom-tree-painting', 'image_link': 'https://paintingvalley.com/images/famous-cherry-blossom-tree-painting-13.jpg', 'html_path': 'queries_dataset\\merged_balanced\\inverse_search\\test\\1035\\1.txt', 'title': 'Famous Cherry Blossom Tree Painting at PaintingValley.com | Explore collection of Famous Cherry Blossom Tree Painting', 'cont

Processing folders:   1%|          | 43/4054 [02:45<4:16:34,  3.84s/it]


KeyboardInterrupt: 

: 