In [None]:
import os
import json

def read_json_files_from_directory(input_directory):
    """Load every JSON-lines record from the ``.json`` files in a directory.

    Each regular file whose name ends in ``.json`` is read line by line and
    every line that parses as JSON contributes one entry to the result.
    Lines that are not valid JSON (including blank lines) are skipped.

    Args:
        input_directory: The path to the directory containing the JSON files.

    Returns:
        A list holding one parsed JSON value per valid line, ordered by file
        name and then by line position within each file. The list is empty
        when the directory contains no parseable records.
    """
    data = []
    # os.listdir() yields entries in arbitrary order; sorting makes the record
    # order (and therefore data[0]) reproducible between runs and machines.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(input_directory, filename)
        if not os.path.isfile(filepath):
            # A sub-directory that merely happens to be named "*.json".
            continue
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default locale encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError:
                    # Best-effort load: malformed lines are dropped.
                    continue
    return data

# Example usage:
input_directory = "2022-fukushima_filtered"  # Replace with your directory path
data = read_json_files_from_directory(input_directory)
print(len(data))
# Guard the preview: indexing an empty list would raise IndexError when the
# directory holds no parseable records.
if data:
    print(data[0])  # To see the output, run the code.
else:
    print(f"No JSON records found in '{input_directory}'")

In [None]:
import os
import requests
from urllib.parse import urlparse

def download_images(data, output_directory, timeout=30):
    """Downloads images from 'unique_image_urls' and saves them to the output directory.

    Images are fetched one after another. Each image is stored under the last
    path component of its URL, so two URLs sharing that component overwrite
    each other. A failure on one image is reported and counted, then the loop
    continues with the next one. Totals are printed at the end.

    Args:
        data: The list of dictionaries containing the data. Items without a
            'unique_image_urls' key, or whose value is None/empty, are skipped.
        output_directory: The path to the directory where images will be saved.
            It is created if it does not exist yet.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            server cannot block the run indefinitely.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    success_count = 0
    error_count = 0

    for item in data:
        if 'unique_image_urls' not in item:
            continue
        # `or []` tolerates an explicit null in the JSON record.
        for image_url in item['unique_image_urls'] or []:
            try:
                # The context manager releases the connection even when the
                # body is only partially consumed or saving fails.
                with requests.get(image_url, stream=True, timeout=timeout) as response:
                    response.raise_for_status()  # Raise an exception for bad responses

                    # Extract filename from URL
                    filename = os.path.basename(urlparse(image_url).path)
                    if not filename:
                        # e.g. a URL ending in "/": there is nothing to name
                        # the file after, and open() on the directory would fail.
                        raise ValueError("URL path does not end in a file name")

                    # Save image to output directory
                    filepath = os.path.join(output_directory, filename)
                    with open(filepath, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                success_count += 1
            except (requests.exceptions.RequestException, OSError, ValueError) as e:
                # Network, file-system and naming problems are all per-image
                # failures; none of them should abort the remaining downloads.
                error_count += 1
                print(f"Error downloading {image_url}: {e}")

    print(f"Total images downloaded: {success_count}")
    print(f"Total errors: {error_count}")

# Example usage:
# Keep the image folder aligned with the dataset loaded into `data`
# ("2022-fukushima_filtered"); the previous value pointed at the unrelated
# "2019-ridgecrest_filtered_images" folder, which would mix two datasets.
output_directory = "2022-fukushima_filtered_images"
download_images(data, output_directory)

In [None]:
import os
import requests
from urllib.parse import urlparse
import concurrent.futures

def download_image(image_url, output_directory, timeout=30):
    """Downloads a single image and saves it to the output directory.

    The image is stored under the last path component of its URL. Any
    network, file-system or naming failure is printed and reported through
    the return value instead of being raised, so the function is safe to run
    inside a worker thread.

    Args:
        image_url: The URL of the image to download.
        output_directory: The path to the directory where the image will be
            saved. This function does not create it.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            server cannot block the caller indefinitely.

    Returns:
        True if the image was written to disk, False otherwise.
    """
    try:
        # The context manager releases the connection even when the body is
        # only partially consumed or saving fails.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for bad responses

            # Extract filename from URL
            filename = os.path.basename(urlparse(image_url).path)
            if not filename:
                # e.g. a URL ending in "/": there is nothing to name the file
                # after, and open() on the directory itself would fail.
                raise ValueError("URL path does not end in a file name")

            # Save image to output directory
            filepath = os.path.join(output_directory, filename)
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True  # Indicate success
    except (requests.exceptions.RequestException, OSError, ValueError) as e:
        print(f"Error downloading {image_url}: {e}")
        return False  # Indicate failure


def download_images_multithreaded(data, output_directory, max_workers=5):
    """Downloads images from 'unique_image_urls' using multi-threading.

    One ``download_image`` task is submitted per URL to a thread pool. The
    function waits for all tasks, counts successes and failures, and prints
    both totals. A task that raises is reported and counted as a failure
    rather than aborting the tally of the remaining downloads.

    Args:
        data: The list of dictionaries containing the data. Items without a
            'unique_image_urls' key, or whose value is None/empty, are skipped.
        output_directory: The path to the directory where images will be saved.
            It is created if it does not exist yet.
        max_workers: The maximum number of worker threads to use.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    success_count = 0
    error_count = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its URL so failures can be reported by name.
        url_by_future = {}
        for item in data:
            if 'unique_image_urls' not in item:
                continue
            # `or []` tolerates an explicit null in the JSON record.
            for image_url in item['unique_image_urls'] or []:
                future = executor.submit(download_image, image_url, output_directory)
                url_by_future[future] = image_url

        for future in concurrent.futures.as_completed(url_by_future):
            try:
                succeeded = future.result()
            except Exception as e:
                # Future.result() re-raises whatever the worker raised; one
                # crashed task must not discard the counts of all the others.
                print(f"Error downloading {url_by_future[future]}: {e}")
                succeeded = False

            if succeeded:  # Check if download was successful
                success_count += 1
            else:
                error_count += 1

    print(f"Total images downloaded: {success_count}")
    print(f"Total errors: {error_count}")

# Example usage: fetch the images concurrently into the Fukushima image folder.
output_directory = "2022-fukushima_filtered_images"
download_images_multithreaded(data=data, output_directory=output_directory)

In [None]:
import os

directory_path = "2022-fukushima_filtered_images"

# Names of every entry (files and sub-directories alike) directly inside the folder
all_files = os.listdir(directory_path)

# Tally only the entries that are files; sub-directories are left out
file_count = sum(
    1
    for entry_name in all_files
    if os.path.isfile(os.path.join(directory_path, entry_name))
)

print(f"The number of files in '{directory_path}' is: {file_count}")