In [1]:
import json

# Load the collected review data. The encoding is pinned to UTF-8 because the
# notebook later writes this same file back as UTF-8 with ensure_ascii=False;
# relying on the platform default encoding here could fail or corrupt
# non-ASCII text on systems whose default is not UTF-8.
with open('collected_data.json', encoding='utf-8') as f:
    json_dict = json.load(f)

In [13]:
# Collect the profile image link of every review, in review order.
image_links = [review['profile_image_link'] for review in json_dict['review']]


In [3]:
import requests
import os
import random
import time
from urllib.parse import urlparse
import shutil

def copy_default_image(default_image_path, save_path):
    """
    Place a copy of the default logo image at the destination path.

    Args:
        default_image_path (str): Location of the default logo image
        save_path (str): Path the copy should be written to

    Returns:
        bool: True when the copy succeeded, False when any exception occurred
    """
    copied = False
    try:
        # copy2 copies the file contents together with its metadata
        shutil.copy2(default_image_path, save_path)
        print("Used default logo image instead")
        copied = True
    except Exception as copy_error:
        print(f"Error copying default image: {copy_error}")
    return copied

def download_image(url, save_folder, default_image_path, timeout=30):
    """
    Downloads an image from a URL and saves it to the specified folder.
    If download fails, uses the default logo image instead.

    The image is streamed into a temporary ``<name>.part`` file and only
    renamed to its final name once the download completed, so an interrupted
    download never leaves a truncated file that a later run would skip as
    "already exists".

    Args:
        url (str): URL of the image to download
        save_folder (str): Path to the folder where images will be saved
        default_image_path (str): Path to the default logo image
        timeout (float | tuple): Timeout in seconds passed to ``requests.get``
            (default 30). Without a timeout a stalled server would block the
            call indefinitely.

    Returns:
        bool: True if successful (either download or default image), False if all failed
    """
    try:
        # Create the save folder if it doesn't exist
        os.makedirs(save_folder, exist_ok=True)

        # The local filename is the last path segment of the URL plus ".jpg"
        filename = os.path.basename(urlparse(url).path) + ".jpg"
        save_path = os.path.join(save_folder, filename)

        # Nothing to do when the image was saved by an earlier run
        if os.path.exists(save_path):
            print(f"Skipping {filename} - File already exists")
            return True

        partial_path = save_path + ".part"
        try:
            # The context manager releases the streamed connection even on errors
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Raise an exception for bad status codes

                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # Publish the file under its final name only after a full download
            os.replace(partial_path, save_path)
            print(f"Successfully downloaded: {filename}")
            return True

        except Exception as download_error:
            # Drop whatever was written before the failure
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(f"Error downloading image from {url}: {str(download_error)}")
            print("Using default logo instead...")
            return copy_default_image(default_image_path, save_path)

    except Exception as e:
        print(f"Unexpected error: {str(e)}")
        return False

def batch_download_images(urls, save_folder, default_image_path, delay_range=(1, 2)):
    """
    Downloads multiple images with random delays between downloads.
    Uses default logo image when downloads fail.

    Entries that are not non-empty strings (for example ``None`` for a review
    without a profile image) are reported and counted as failed instead of
    aborting the whole batch.

    Args:
        urls (list): List of image URLs to download
        save_folder (str): Path to the folder where images will be saved
        default_image_path (str): Path to the default logo image
        delay_range (tuple): ``(min_seconds, max_seconds)`` bounds of the random
            pause inserted after each download attempt (default ``(1, 2)``)

    Raises:
        FileNotFoundError: If the default image does not exist
    """
    # Function-scope import: filecmp is only needed for the summary bookkeeping
    import filecmp

    # Verify default image exists
    if not os.path.exists(default_image_path):
        raise FileNotFoundError(f"Default image not found at: {default_image_path}")

    # Keep track of downloads, skips and failures
    total_urls = len(urls)
    downloaded = 0
    skipped = 0
    default_used = 0
    failed = 0
    min_delay, max_delay = delay_range

    for i, url in enumerate(urls, 1):
        print(f"\nProcessing image {i} of {total_urls}")

        # A missing or blank link cannot be turned into a filename; count it
        # as failed rather than letting it crash the loop
        if not isinstance(url, str) or not url.strip():
            print(f"Invalid URL, nothing to download: {url!r}")
            failed += 1
            continue

        # Same naming rule as download_image: last URL path segment + ".jpg"
        filename = os.path.basename(urlparse(url).path) + ".jpg"
        save_path = os.path.join(save_folder, filename)

        if os.path.exists(save_path):
            print(f"Skipping {filename} - File already exists")
            skipped += 1
            continue

        if download_image(url, save_folder, default_image_path):
            # download_image reports True for both a real download and the
            # fallback copy; compare file contents (not just sizes) to tell
            # them apart
            if filecmp.cmp(save_path, default_image_path, shallow=False):
                default_used += 1
            else:
                downloaded += 1
        else:
            failed += 1

        # Random pause after each attempt (skip delay for last image)
        if i < total_urls:
            delay = random.uniform(min_delay, max_delay)
            print(f"Waiting {delay:.2f} seconds...")
            time.sleep(delay)

    # Print summary
    print("\nDownload Summary:")
    print(f"Total URLs processed: {total_urls}")
    print(f"Successfully downloaded: {downloaded}")
    print(f"Default logo used: {default_used}")
    print(f"Skipped (already existed): {skipped}")
    print(f"Failed: {failed}")

In [4]:


# Folder that receives the downloaded avatar images
save_folder = "reviews_images_avatars"

# Fetch every avatar; the placeholder image stands in for failed downloads
batch_download_images(
    urls=image_links,
    save_folder=save_folder,
    default_image_path="avatar_placeholder.jpg",
)


Processing image 1 of 20
Skipping 1700444650829.jpg - File already exists

Processing image 2 of 20
Skipping 1517286449740.jpg - File already exists

Processing image 3 of 20
Skipping 1517019548162.jpg - File already exists

Processing image 4 of 20
Skipping 1549211827567.jpg - File already exists

Processing image 5 of 20
Skipping 1529332164858.jpg - File already exists

Processing image 6 of 20
Skipping 1721652770867.jpg - File already exists

Processing image 7 of 20
Error downloading image from https://media.licdn.com/dms/image/v2/D4E03AQHhA1NMl9r1_A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1698670021775?e=1736985600&v=beta&t=pW0L9VLPufUGVLEFRr0uQTWCCy2IawxFmI-MCRnfw2: 403 Client Error: Forbidden for url: https://media.licdn.com/dms/image/v2/D4E03AQHhA1NMl9r1_A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1698670021775?e=1736985600&v=beta&t=pW0L9VLPufUGVLEFRr0uQTWCCy2IawxFmI-MCRnfw2
Using default logo instead...
Used de

In [5]:
from os import listdir
from os.path import isfile, join
# Names of the regular files (sub-directories excluded) found in save_folder
onlyfiles = list(
    filter(lambda entry: isfile(join(save_folder, entry)), listdir(save_folder))
)

In [14]:
# Map every original image link to the local file stored for it.
#
# The expected local name is derived from the link with the same rule that
# download_image uses when saving (last URL path segment + ".jpg"). Matching on
# the exact filename avoids accidental substring hits, and iterating over the
# links (rather than over the files) ensures that several links pointing at the
# same image, e.g. differing only in their query string, are all mapped.
available_files = set(onlyfiles)
image_dict = {}

for p in image_links:
    local_name = os.path.basename(urlparse(p).path) + ".jpg"
    if local_name in available_files:
        image_dict[p] = save_folder + "/" + local_name

In [17]:
# Every distinct link must have a local file. Sets are compared instead of
# lengths because image_links may contain the same link more than once, while
# image_dict holds each link only once.
set(image_dict) == set(image_links)

True

In [18]:
# Replace each review's remote image link with the local path from image_dict.
# Entries that already hold one of the local paths are left alone so that
# re-running this cell does not fail; a link with no local file still raises
# KeyError, which signals an incomplete download.
already_local = set(image_dict.values())
for review in json_dict['review']:
    old_value = review['profile_image_link']
    if old_value in already_local:
        continue
    review['profile_image_link'] = image_dict[old_value]

In [3]:
# Serialize before opening the file: opening in 'w' mode truncates it at once,
# so a serialization error raised afterwards would destroy the input data.
serialized = json.dumps(json_dict, ensure_ascii=False, indent=4)
with open('collected_data.json', 'w', encoding='utf-8') as outfile:
    outfile.write(serialized)