<a href="https://colab.research.google.com/github/mrncstt/githubevents_pypsark/blob/main/github_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=8dc9027edf7b94aabb3583c8581095fc9794c324d9da38c7458950b72bc7c6e5
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [6]:
!python -m pip install --upgrade pip

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0


In [7]:
import os
import requests
import calendar
from zipfile import ZipFile
from datetime import datetime
from pathlib import Path
from requests.exceptions import RequestException, HTTPError, ConnectionError, Timeout
from tqdm import tqdm
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, to_date, when
from IPython.display import FileLink, display
import gzip
import json
import time
import py4j
import logging
from concurrent.futures import ThreadPoolExecutor


In [8]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [9]:
def create_directory(path):
    """
    Creates a directory if it doesn't exist.

    Parameters:
    path (str): The path of the directory to be created.

    Logs:
    - Info: If the directory is created successfully or already exists.
    - Error: If an error occurs during the directory creation process.
    """
    try:
        path_obj = Path(path)
        if not path_obj.exists():
            path_obj.mkdir(parents=True, exist_ok=True)
            logging.info(f"Directory '{path}' created successfully.")
        else:
            logging.info(f"Directory '{path}' already exists.")
    except Exception as e:
        logging.error(f"An error occurred while creating the directory '{path}': {e}")

In [10]:
def download_file(url, output_file):
    """
    Downloads a file from a URL to the specified output path.

    Parameters:
    url (str): The URL of the file to be downloaded.
    output_file (str): The path where the downloaded file will be saved.

    Logs:
    - Info: If the file is downloaded successfully.
    - Error: If an error occurs during the download process, including HTTP errors, connection errors, and timeouts.
    """
    try:
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            block_size = 1024
            t = tqdm(total=total_size, unit='iB', unit_scale=True)

            with open(output_file, 'wb') as f:
                for data in response.iter_content(block_size):
                    t.update(len(data))
                    f.write(data)
            t.close()

        logging.info(f"Downloaded - {url}\nPath - {output_file}")
    except (requests.HTTPError, requests.ConnectionError, requests.Timeout) as e:
        logging.error(f"Error occurred while downloading {url}: {e}")
    except requests.RequestException as e:
        logging.error(f"Request error occurred while downloading {url}: {e}")

In [11]:
def download_github_events(year, month, output_dir):
    """
    Downloads GitHub event data for the specified year and month to the given output directory.

    Parameters:
    year (int): The year of the event data to download.
    month (int): The month of the event data to download.
    output_dir (str): The directory where the downloaded data will be saved.

    Creates directories for each day and hour if they don't exist, and downloads the data files. Uses parallel processing to speed up the download process.
    """
    base_url = "https://data.gharchive.org/"
    create_directory(output_dir)

    num_days = calendar.monthrange(year, month)[1]

    def download_day_hour(day, hour):
        """Downloads the GitHub event data for a specific day and hour."""
        day_dir = os.path.join(output_dir, f"{year}-{month:02d}-{day:02d}")
        create_directory(day_dir)

        url = f"{base_url}{year}-{month:02d}-{day:02d}-{hour}.json.gz"
        output_file = os.path.join(day_dir, f"{year}-{month:02d}-{day:02d}-{hour}.json.gz")
        if not os.path.exists(output_file):
            logging.info(f"Downloading {url} to {output_file}")
            download_file(url, output_file)
        else:
            logging.info(f"File already exists: {output_file}")

    # Use ThreadPoolExecutor for parallel downloads
    with ThreadPoolExecutor(max_workers=2) as executor:
        for day in range(1, num_days + 1):
            for hour in range(24):
                executor.submit(download_day_hour, day, hour)

In [12]:
def verify_downloaded_files(year, month, output_dir):
    """
    Verifies that all expected files for a given month and year have been downloaded.

    This function checks for the presence of JSON files compressed with gzip (.json.gz)
    for every hour of every day in the specified month and year. It assumes the files are
    organized in a directory structure where each day has its own subdirectory named
    in the format 'YYYY-MM-DD', and each file is named in the format 'YYYY-MM-DD-HH.json.gz'.

    Parameters:
    year (int): The year for which to verify the files.
    month (int): The month for which to verify the files (1-12).
    output_dir (str): The base directory where the files are expected to be located.

    Prints:
    A list of missing files if any are not found, otherwise a confirmation message that all files are present.
    """
    missing_files = []
    num_days = calendar.monthrange(year, month)[1]

    for day in range(1, num_days + 1):
        for hour in range(24):
            expected_file = os.path.join(output_dir, f"{year}-{month:02d}-{day:02d}", f"{year}-{month:02d}-{day:02d}-{hour}.json.gz")
            if not os.path.exists(expected_file):
                missing_files.append(expected_file)

    if missing_files:
        print(f"Some files are missing ({len(missing_files)} files):")
        for file in missing_files:
            print(file)
    else:
        print("All files have been downloaded successfully.")


In [24]:
year = 2024
month = 5
output_dir = '/path/to/github_events/downloaded'
output_directory = '/path/to/github_events/processed'
zip_directory = '/path/to/github_events/zipped'

100%|██████████| 117M/117M [00:02<00:00, 52.1MiB/s]


In [13]:
download_github_events(year, month, output_dir)

100%|██████████| 105M/105M [00:04<00:00, 22.7MiB/s]
100%|██████████| 105M/105M [00:03<00:00, 27.3MiB/s]
100%|██████████| 95.8M/95.8M [00:02<00:00, 37.2MiB/s]
100%|██████████| 90.1M/90.1M [00:01<00:00, 48.1MiB/s]
100%|██████████| 85.4M/85.4M [00:02<00:00, 35.5MiB/s]
100%|██████████| 98.6M/98.6M [00:03<00:00, 30.5MiB/s]
100%|██████████| 89.7M/89.7M [00:02<00:00, 35.3MiB/s]
100%|██████████| 95.9M/95.9M [00:02<00:00, 36.5MiB/s]
100%|██████████| 109M/109M [00:03<00:00, 33.2MiB/s]
100%|██████████| 126M/126M [00:02<00:00, 47.6MiB/s]
100%|██████████| 111M/111M [00:01<00:00, 55.6MiB/s]
100%|██████████| 113M/113M [00:02<00:00, 38.5MiB/s]
100%|██████████| 117M/117M [00:02<00:00, 58.4MiB/s]
100%|██████████| 125M/125M [00:03<00:00, 40.2MiB/s]
100%|██████████| 139M/139M [00:02<00:00, 48.8MiB/s]
100%|██████████| 136M/136M [00:03<00:00, 44.9MiB/s]
100%|██████████| 135M/135M [00:02<00:00, 63.1MiB/s]
100%|██████████| 135M/135M [00:03<00:00, 43.9MiB/s]
100%|██████████| 127M/127M [00:02<00:00, 50.2MiB/s]


KeyboardInterrupt: 

In [14]:
def read_files(output_dir):
    """
    Reads all .json.gz files from the specified directory and groups them by day.

    This function traverses through the specified directory and its subdirectories,
    collects all .json.gz files, and groups them by their date, assuming that each subdirectory
    is named in the format 'YYYY-MM-DD'. It returns a dictionary where the keys are dates and the
    values are lists of file paths.

    Parameters:
    output_dir (str): The base directory where the files are expected to be located.

    Returns:
    dict: A dictionary where the keys are dates (str) and the values are lists of file paths (str).

    Prints:
    A message indicating the number of files processed per day. If the directory does not exist,
    it prints an error message.

    """
    if not os.path.exists(output_dir):
        print(f"Directory {output_dir} does not exist.")
        return {}

    files_by_day = {}

    for root, _, files in os.walk(output_dir):
        for file in files:
            if file.endswith(".json.gz"):
                try:
                    file_path = os.path.join(root, file)
                    file_date = root.split('/')[-1]
                    if file_date not in files_by_day:
                        files_by_day[file_date] = []
                    files_by_day[file_date].append(file_path)
                except IndexError:
                    print(f"Skipping file with unexpected format: {file_path}")

    for date, files in files_by_day.items():
        print(f"Date: {date}, Files processed: {len(files)}")

    return files_by_day


Date: 2024-05-01, Files processed: 24
Date: 2024-05-02, Files processed: 23


 41%|████      | 39.2M/96.2M [00:00<00:01, 42.8MiB/s]

In [15]:
files_by_day = read_files(output_dir)

Date: 2024-05-03, Files processed: 7
Date: 2024-05-01, Files processed: 24
Date: 2024-05-02, Files processed: 24


 66%|██████▌   | 59.6M/90.3M [00:01<00:00, 48.0MiB/s]

In [25]:
def is_valid_json(filepath):
    """
    Checks if a gzip-compressed JSON file is valid.

    This function attempts to open and read a gzip-compressed file line by line,
    parsing each line as JSON. If all lines can be successfully parsed, the file
    is considered valid. If any line fails to parse as JSON or if the file cannot
    be read due to an EOFError, the file is considered invalid.

    Parameters:
    filepath (str): The path to the gzip-compressed JSON file.

    Returns:
    bool: True if the file is valid JSON, False otherwise.

    """
    try:
        with gzip.open(filepath, 'rt', encoding='utf-8') as f:
            for line in f:
                json.loads(line)
        return True
    except (json.JSONDecodeError, EOFError):
        return False

In [26]:

def create_spark_session(max_retries=3, retry_delay=5):
    """
    Creates a Spark session with retry logic.

    This function attempts to create a Spark session using the given number of retries and delay between retries.
    If the Spark session cannot be created due to a Py4JNetworkError, it retries up to the specified maximum number of retries,
    waiting for the specified delay between each attempt. If it fails to create the Spark session after the specified number of retries,
    it raises an exception.

    Parameters:
    max_retries (int): The maximum number of retry attempts (default is 3).
    retry_delay (int): The delay in seconds between retry attempts (default is 5 seconds).

    Returns:
    SparkSession: A SparkSession object if the session is successfully created.

    Raises:
    Exception: If the Spark session cannot be created after the specified number of retries.

    """
    retries = 0
    while retries < max_retries:
        try:
            spark = SparkSession.builder.appName("GitHub").getOrCreate()
            return spark
        except py4j.protocol.Py4JNetworkError as e:
            retries += 1
            print(f"Retry {retries}/{max_retries} - Failed to create Spark session: {e}")
            time.sleep(retry_delay)
    raise Exception("Failed to create Spark session after multiple retries")

In [27]:
def process_files(files_by_day, downloaded_directory, processed_directory, max_retries=3, retry_delay=5):
    """
    Processes JSON files grouped by day, performing aggregations and saving results.

    This function processes JSON files grouped by their date, performing specific aggregations for repositories and users.
    The results are saved as CSV and Parquet files. It includes retry logic for handling failures during Spark session creation
    and file processing.

    Parameters:
    files_by_day (dict): A dictionary where the keys are dates (str) and the values are lists of file paths (str).
    downloaded_directory (str): The directory where the downloaded JSON files are located.
    processed_directory (str): The directory where the processed CSV and Parquet files will be saved.
    max_retries (int): The maximum number of retry attempts for creating the Spark session and processing each file (default is 3).
    retry_delay (int): The delay in seconds between retry attempts (default is 5 seconds).

    Prints:
    Progress and error messages, including retries and skipping of corrupted files.
    """
    # Create necessary directories if they don't exist
    os.makedirs(processed_directory, exist_ok=True)

    # Process files for each day
    for file_date, files in files_by_day.items():
        print(f"Processing files for {file_date}...")

        # Define the output paths
        repo_output_csv = os.path.join(processed_directory, f"repo_agg_{file_date}.csv")
        repo_output_parquet = os.path.join(processed_directory, f"repo_agg_{file_date}.parquet")
        user_output_csv = os.path.join(processed_directory, f"user_agg_{file_date}.csv")
        user_output_parquet = os.path.join(processed_directory, f"user_agg_{file_date}.parquet")

        # Check if the output files already exist
        if (os.path.exists(repo_output_csv) and os.path.exists(repo_output_parquet) and
            os.path.exists(user_output_csv) and os.path.exists(user_output_parquet)):
            print(f"Output files for {file_date} already exist. Skipping processing.")
            continue

        spark = create_spark_session(max_retries, retry_delay)

        # Process each file individually to isolate any corrupted files
        for file in files:
            file_path = os.path.join(downloaded_directory, file)

            # Validate JSON file before processing
            if not is_valid_json(file_path):
                print(f"Skipping corrupted file {file_path}")
                continue

            retries = 0
            success = False

            while retries < max_retries and not success:
                try:
                    # Read JSON file into DataFrame
                    df = spark.read.json(file_path)

                    # Extract necessary fields
                    repo_df = df.select(to_date(col("created_at")).alias("date"),
                                        col("repo.id").alias("project_id"),
                                        col("repo.name").alias("project_name"),
                                        col("type"))

                    user_df = df.select(to_date(col("created_at")).alias("date"),
                                        col("actor.id").alias("user_id"),
                                        col("actor.login").alias("user_login"),
                                        col("type"))

                    # Aggregations for repository
                    repo_agg = repo_df.groupBy("date", "project_id", "project_name").agg(
                        count(when(col("type") == "WatchEvent", True)).alias("stars"),
                        count(when(col("type") == "ForkEvent", True)).alias("forks"),
                        count(when(col("type") == "IssuesEvent", True)).alias("issues"),
                        count(when(col("type") == "PullRequestEvent", True)).alias("prs")
                    )

                    # Aggregations for user
                    user_agg = user_df.groupBy("date", "user_id", "user_login").agg(
                        count(when(col("type") == "WatchEvent", True)).alias("starred_projects"),
                        count(when(col("type") == "IssuesEvent", True)).alias("issues_created"),
                        count(when(col("type") == "PullRequestEvent", True)).alias("prs_created")
                    )

                    # Save the results for the current file
                    repo_agg.write.csv(repo_output_csv, header=True, mode='overwrite')
                    repo_agg.write.parquet(repo_output_parquet, mode='overwrite')
                    user_agg.write.csv(user_output_csv, header=True, mode='overwrite')
                    user_agg.write.parquet(user_output_parquet, mode='overwrite')

                    print(f"Processed and saved data for file {file}")
                    success = True
                except py4j.protocol.Py4JJavaError as e:
                    retries += 1
                    print(f"Retry {retries}/{max_retries} - Failed to process file {file} for {file_date} due to Java error: {e}")
                    time.sleep(retry_delay)
                except py4j.protocol.Py4JNetworkError as e:
                    retries += 1
                    print(f"Retry {retries}/{max_retries} - Failed to process file {file} for {file_date} due to network error: {e}")
                    time.sleep(retry_delay)
                except Exception as e:
                    print(f"Failed to process file {file} for {file_date}: {e}")
                    break

        # Stop the Spark session
        spark.stop()


  1%|          | 408k/53.9M [00:00<00:13, 4.05MiB/s]

In [28]:
process_files(files_by_day, output_dir, output_directory)

 63%|██████▎   | 31.5M/49.7M [00:01<00:01, 14.8MiB/s]

Processing files for 2024-05-03...


100%|██████████| 49.7M/49.7M [00:03<00:00, 15.4MiB/s]
100%|██████████| 55.6M/55.6M [00:02<00:00, 24.5MiB/s]
100%|██████████| 56.9M/56.9M [00:01<00:00, 29.2MiB/s]
100%|██████████| 57.1M/57.1M [00:01<00:00, 31.1MiB/s]
100%|██████████| 64.3M/64.3M [00:02<00:00, 28.8MiB/s]
100%|██████████| 64.3M/64.3M [00:03<00:00, 20.3MiB/s]
100%|██████████| 60.9M/60.9M [00:03<00:00, 18.1MiB/s]
100%|██████████| 64.7M/64.7M [00:03<00:00, 18.6MiB/s]
100%|██████████| 69.6M/69.6M [00:03<00:00, 19.4MiB/s]
100%|██████████| 68.1M/68.1M [00:03<00:00, 21.2MiB/s]
100%|██████████| 71.2M/71.2M [00:03<00:00, 19.1MiB/s]
100%|██████████| 73.3M/73.3M [00:03<00:00, 19.6MiB/s]
100%|██████████| 72.2M/72.2M [00:03<00:00, 18.3MiB/s]
100%|██████████| 67.0M/67.0M [00:02<00:00, 27.0MiB/s]
100%|██████████| 68.6M/68.6M [00:02<00:00, 33.1MiB/s]
100%|██████████| 64.0M/64.0M [00:02<00:00, 24.2MiB/s]
100%|██████████| 61.4M/61.4M [00:01<00:00, 32.6MiB/s]
100%|██████████| 55.4M/55.4M [00:03<00:00, 14.7MiB/s]
100%|██████████| 49.0M/49.0M

Processed and saved data for file /path/to/github_events/downloaded/2024-05-03/2024-05-03-1.json.gz


100%|██████████| 110M/110M [00:04<00:00, 22.4MiB/s]
100%|██████████| 103M/103M [00:05<00:00, 19.2MiB/s]
100%|██████████| 101M/101M [00:05<00:00, 20.0MiB/s] 
100%|██████████| 99.1M/99.1M [00:04<00:00, 19.9MiB/s]
100%|██████████| 102M/102M [00:05<00:00, 18.9MiB/s]
100%|██████████| 123M/123M [00:06<00:00, 18.4MiB/s]
100%|██████████| 131M/131M [00:03<00:00, 36.1MiB/s]
100%|██████████| 142M/142M [00:03<00:00, 38.3MiB/s]
100%|██████████| 142M/142M [00:06<00:00, 23.3MiB/s]
100%|██████████| 136M/136M [00:05<00:00, 23.1MiB/s]
100%|██████████| 137M/137M [00:04<00:00, 28.9MiB/s]
100%|██████████| 144M/144M [00:06<00:00, 21.5MiB/s]
100%|██████████| 156M/156M [00:04<00:00, 35.6MiB/s]
100%|██████████| 175M/175M [00:07<00:00, 24.5MiB/s]
100%|██████████| 168M/168M [00:04<00:00, 34.0MiB/s]
100%|██████████| 164M/164M [00:07<00:00, 22.8MiB/s]
100%|██████████| 156M/156M [00:04<00:00, 31.1MiB/s]
 50%|█████     | 79.1M/158M [00:04<00:06, 12.3MiB/s]

Processed and saved data for file /path/to/github_events/downloaded/2024-05-03/2024-05-03-5.json.gz


100%|██████████| 158M/158M [00:08<00:00, 17.6MiB/s]
100%|██████████| 149M/149M [00:09<00:00, 15.1MiB/s]
100%|██████████| 143M/143M [00:07<00:00, 19.2MiB/s]
100%|██████████| 131M/131M [00:06<00:00, 21.5MiB/s]
100%|██████████| 112M/112M [00:03<00:00, 32.3MiB/s]
100%|██████████| 92.1M/92.1M [00:02<00:00, 31.3MiB/s]
100%|██████████| 90.6M/90.6M [00:03<00:00, 25.4MiB/s]
100%|██████████| 86.3M/86.3M [00:03<00:00, 26.5MiB/s]
100%|██████████| 79.6M/79.6M [00:03<00:00, 21.1MiB/s]
100%|██████████| 81.1M/81.1M [00:02<00:00, 28.3MiB/s]
100%|██████████| 77.2M/77.2M [00:03<00:00, 21.8MiB/s]
 57%|█████▋    | 55.4M/96.7M [00:02<00:01, 38.1MiB/s]ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=45>

During handling of the above exception, another exception occurred

Failed to process file /path/to/github_events/downloaded/2024-05-03/2024-05-03-6.json.gz for 2024-05-03: An error occurred while calling o284.csv


100%|██████████| 96.7M/96.7M [00:04<00:00, 22.1MiB/s]
100%|██████████| 102M/102M [00:05<00:00, 17.0MiB/s]
100%|██████████| 114M/114M [00:05<00:00, 19.7MiB/s]
100%|██████████| 125M/125M [00:06<00:00, 19.2MiB/s]
100%|██████████| 125M/125M [00:05<00:00, 21.5MiB/s]
100%|██████████| 116M/116M [00:03<00:00, 33.0MiB/s]
100%|██████████| 117M/117M [00:03<00:00, 29.4MiB/s]
100%|██████████| 116M/116M [00:05<00:00, 21.1MiB/s]
100%|██████████| 134M/134M [00:04<00:00, 27.9MiB/s]
100%|██████████| 144M/144M [00:05<00:00, 25.8MiB/s]
100%|██████████| 138M/138M [00:05<00:00, 26.5MiB/s]
100%|██████████| 125M/125M [00:04<00:00, 28.6MiB/s]
100%|██████████| 118M/118M [00:05<00:00, 22.3MiB/s]
100%|██████████| 117M/117M [00:04<00:00, 27.2MiB/s]
100%|██████████| 111M/111M [00:04<00:00, 26.3MiB/s]
100%|██████████| 110M/110M [00:05<00:00, 19.9MiB/s]
100%|██████████| 109M/109M [00:03<00:00, 32.1MiB/s]
100%|██████████| 89.5M/89.5M [00:02<00:00, 31.9MiB/s]
100%|██████████| 74.7M/74.7M [00:02<00:00, 29.2MiB/s]
  4%|▍

Processed and saved data for file /path/to/github_events/downloaded/2024-05-03/2024-05-03-4.json.gz


100%|██████████| 84.2M/84.2M [00:04<00:00, 17.0MiB/s]
100%|██████████| 76.4M/76.4M [00:03<00:00, 19.6MiB/s]
100%|██████████| 74.3M/74.3M [00:03<00:00, 20.1MiB/s]
100%|██████████| 70.9M/70.9M [00:03<00:00, 21.0MiB/s]
100%|██████████| 68.8M/68.8M [00:03<00:00, 19.8MiB/s]
100%|██████████| 85.7M/85.7M [00:04<00:00, 20.0MiB/s]
100%|██████████| 87.2M/87.2M [00:03<00:00, 23.3MiB/s]
100%|██████████| 102M/102M [00:03<00:00, 26.3MiB/s]
100%|██████████| 114M/114M [00:02<00:00, 38.9MiB/s]
100%|██████████| 116M/116M [00:02<00:00, 40.1MiB/s]
100%|██████████| 105M/105M [00:05<00:00, 20.6MiB/s]
100%|██████████| 109M/109M [00:03<00:00, 27.3MiB/s]
100%|██████████| 108M/108M [00:04<00:00, 24.5MiB/s]
100%|██████████| 121M/121M [00:05<00:00, 20.9MiB/s]
100%|██████████| 127M/127M [00:04<00:00, 30.8MiB/s]
100%|██████████| 121M/121M [00:04<00:00, 28.3MiB/s]
100%|██████████| 115M/115M [00:05<00:00, 20.8MiB/s]
100%|██████████| 136M/136M [00:06<00:00, 20.2MiB/s]
100%|██████████| 106M/106M [00:04<00:00, 24.2MiB/s

Processed and saved data for file /path/to/github_events/downloaded/2024-05-03/2024-05-03-2.json.gz


100%|██████████| 102M/102M [00:12<00:00, 7.82MiB/s]
100%|██████████| 100M/100M [00:07<00:00, 14.1MiB/s] 
100%|██████████| 95.2M/95.2M [00:05<00:00, 16.6MiB/s]
100%|██████████| 75.6M/75.6M [00:04<00:00, 15.7MiB/s]
100%|██████████| 80.8M/80.8M [00:03<00:00, 24.5MiB/s]
100%|██████████| 77.4M/77.4M [00:04<00:00, 19.0MiB/s]
100%|██████████| 76.3M/76.3M [00:03<00:00, 24.7MiB/s]
100%|██████████| 71.2M/71.2M [00:02<00:00, 29.3MiB/s]
100%|██████████| 66.7M/66.7M [00:02<00:00, 25.0MiB/s]
100%|██████████| 79.5M/79.5M [00:04<00:00, 19.2MiB/s]
100%|██████████| 81.4M/81.4M [00:03<00:00, 23.1MiB/s]
100%|██████████| 88.4M/88.4M [00:03<00:00, 26.4MiB/s]
100%|██████████| 122M/122M [00:05<00:00, 20.9MiB/s]
100%|██████████| 123M/123M [00:04<00:00, 27.5MiB/s]
100%|██████████| 122M/122M [00:04<00:00, 26.5MiB/s]
100%|██████████| 121M/121M [00:05<00:00, 23.7MiB/s]
100%|██████████| 113M/113M [00:04<00:00, 25.5MiB/s]
100%|██████████| 112M/112M [00:05<00:00, 21.9MiB/s]
100%|██████████| 117M/117M [00:06<00:00, 18

Processed and saved data for file /path/to/github_events/downloaded/2024-05-03/2024-05-03-3.json.gz


100%|██████████| 119M/119M [00:05<00:00, 20.9MiB/s]
100%|██████████| 113M/113M [00:05<00:00, 20.0MiB/s]
100%|██████████| 108M/108M [00:05<00:00, 18.8MiB/s]
100%|██████████| 105M/105M [00:05<00:00, 19.9MiB/s]
100%|██████████| 97.8M/97.8M [00:05<00:00, 18.8MiB/s]
100%|██████████| 95.4M/95.4M [00:03<00:00, 24.2MiB/s]
100%|██████████| 89.2M/89.2M [00:04<00:00, 22.3MiB/s]
100%|██████████| 81.3M/81.3M [00:02<00:00, 27.2MiB/s]
100%|██████████| 76.6M/76.6M [00:02<00:00, 29.9MiB/s]
100%|██████████| 80.1M/80.1M [00:02<00:00, 29.2MiB/s]
100%|██████████| 84.4M/84.4M [00:04<00:00, 20.5MiB/s]
100%|██████████| 81.3M/81.3M [00:03<00:00, 26.9MiB/s]
100%|██████████| 75.5M/75.5M [00:02<00:00, 31.3MiB/s]
100%|██████████| 73.6M/73.6M [00:02<00:00, 30.1MiB/s]
100%|██████████| 86.1M/86.1M [00:02<00:00, 30.2MiB/s]
100%|██████████| 90.3M/90.3M [00:03<00:00, 22.6MiB/s]
100%|██████████| 95.3M/95.3M [00:02<00:00, 32.9MiB/s]
100%|██████████| 107M/107M [00:03<00:00, 31.3MiB/s]
100%|██████████| 103M/103M [00:06<00:0

Processed and saved data for file /path/to/github_events/downloaded/2024-05-03/2024-05-03-0.json.gz


 20%|█▉        | 20.9M/106M [00:00<00:02, 32.8MiB/s]

Processing files for 2024-05-01...


100%|██████████| 106M/106M [00:04<00:00, 22.6MiB/s]
100%|██████████| 97.6M/97.6M [00:05<00:00, 19.4MiB/s]
100%|██████████| 94.7M/94.7M [00:04<00:00, 20.2MiB/s]
100%|██████████| 92.1M/92.1M [00:04<00:00, 19.2MiB/s]
100%|██████████| 88.9M/88.9M [00:05<00:00, 17.3MiB/s]
100%|██████████| 87.9M/87.9M [00:04<00:00, 19.0MiB/s]
100%|██████████| 69.7M/69.7M [00:03<00:00, 17.5MiB/s]
100%|██████████| 64.3M/64.3M [00:02<00:00, 27.1MiB/s]
100%|██████████| 64.3M/64.3M [00:01<00:00, 36.0MiB/s]
100%|██████████| 62.1M/62.1M [00:01<00:00, 35.8MiB/s]
100%|██████████| 56.0M/56.0M [00:01<00:00, 33.7MiB/s]
100%|██████████| 53.2M/53.2M [00:02<00:00, 19.9MiB/s]
100%|██████████| 47.2M/47.2M [00:01<00:00, 23.6MiB/s]
100%|██████████| 53.2M/53.2M [00:01<00:00, 38.6MiB/s]
100%|██████████| 55.0M/55.0M [00:01<00:00, 34.6MiB/s]
100%|██████████| 59.4M/59.4M [00:01<00:00, 35.2MiB/s]
100%|██████████| 60.8M/60.8M [00:02<00:00, 23.6MiB/s]
100%|██████████| 59.1M/59.1M [00:03<00:00, 19.1MiB/s]
100%|██████████| 57.2M/57.2M [

Processed and saved data for file /path/to/github_events/downloaded/2024-05-01/2024-05-01-22.json.gz


100%|██████████| 49.8M/49.8M [00:02<00:00, 20.2MiB/s]
100%|██████████| 52.8M/52.8M [00:02<00:00, 20.3MiB/s]
100%|██████████| 50.8M/50.8M [00:02<00:00, 20.7MiB/s]
100%|██████████| 52.5M/52.5M [00:03<00:00, 15.8MiB/s]
100%|██████████| 63.8M/63.8M [00:03<00:00, 19.6MiB/s]
100%|██████████| 63.0M/63.0M [00:03<00:00, 19.3MiB/s]
100%|██████████| 65.6M/65.6M [00:03<00:00, 21.1MiB/s]
100%|██████████| 64.4M/64.4M [00:03<00:00, 18.9MiB/s]
100%|██████████| 64.1M/64.1M [00:03<00:00, 18.9MiB/s]
100%|██████████| 57.8M/57.8M [00:03<00:00, 18.7MiB/s]
100%|██████████| 60.9M/60.9M [00:02<00:00, 20.6MiB/s]
100%|██████████| 58.7M/58.7M [00:02<00:00, 20.1MiB/s]
100%|██████████| 51.3M/51.3M [00:02<00:00, 19.3MiB/s]
100%|██████████| 49.2M/49.2M [00:01<00:00, 35.7MiB/s]
100%|██████████| 44.8M/44.8M [00:01<00:00, 34.2MiB/s]
100%|██████████| 44.4M/44.4M [00:02<00:00, 22.1MiB/s]
100%|██████████| 74.9M/74.9M [00:02<00:00, 25.6MiB/s]
100%|██████████| 89.6M/89.6M [00:02<00:00, 32.3MiB/s]
100%|██████████| 80.2M/80.2M

Processed and saved data for file /path/to/github_events/downloaded/2024-05-01/2024-05-01-16.json.gz


100%|██████████| 111M/111M [00:05<00:00, 19.5MiB/s]
100%|██████████| 116M/116M [00:05<00:00, 19.5MiB/s]
100%|██████████| 96.1M/96.1M [00:04<00:00, 20.3MiB/s]
100%|██████████| 88.0M/88.0M [00:04<00:00, 19.3MiB/s]
100%|██████████| 81.7M/81.7M [00:04<00:00, 19.1MiB/s]
100%|██████████| 82.0M/82.0M [00:04<00:00, 19.6MiB/s]
100%|██████████| 75.7M/75.7M [00:03<00:00, 25.1MiB/s]
100%|██████████| 97.7M/97.7M [00:03<00:00, 32.2MiB/s]
100%|██████████| 96.0M/96.0M [00:05<00:00, 18.0MiB/s]
100%|██████████| 109M/109M [00:03<00:00, 31.6MiB/s]
100%|██████████| 115M/115M [00:04<00:00, 28.6MiB/s]
100%|██████████| 114M/114M [00:04<00:00, 25.8MiB/s]
100%|██████████| 104M/104M [00:03<00:00, 29.8MiB/s]
100%|██████████| 104M/104M [00:02<00:00, 34.8MiB/s]
100%|██████████| 117M/117M [00:03<00:00, 31.9MiB/s]
100%|██████████| 124M/124M [00:05<00:00, 22.5MiB/s]
100%|██████████| 127M/127M [00:03<00:00, 34.7MiB/s]
100%|██████████| 127M/127M [00:04<00:00, 28.9MiB/s]
100%|██████████| 114M/114M [00:04<00:00, 23.0MiB/s

Processed and saved data for file /path/to/github_events/downloaded/2024-05-01/2024-05-01-5.json.gz


100%|██████████| 91.3M/91.3M [00:05<00:00, 16.6MiB/s]
100%|██████████| 74.5M/74.5M [00:04<00:00, 18.0MiB/s]
100%|██████████| 78.1M/78.1M [00:04<00:00, 19.1MiB/s]
100%|██████████| 73.3M/73.3M [00:03<00:00, 20.5MiB/s]
100%|██████████| 71.4M/71.4M [00:03<00:00, 19.8MiB/s]
100%|██████████| 73.2M/73.2M [00:03<00:00, 19.9MiB/s]
100%|██████████| 69.0M/69.0M [00:03<00:00, 18.2MiB/s]
100%|██████████| 87.9M/87.9M [00:12<00:00, 7.16MiB/s]
100%|██████████| 93.9M/93.9M [00:04<00:00, 20.7MiB/s]
100%|██████████| 106M/106M [00:03<00:00, 29.9MiB/s]
100%|██████████| 121M/121M [00:05<00:00, 22.6MiB/s]
100%|██████████| 114M/114M [00:04<00:00, 26.6MiB/s]
100%|██████████| 103M/103M [00:02<00:00, 34.5MiB/s]
100%|██████████| 111M/111M [00:03<00:00, 32.9MiB/s]
100%|██████████| 117M/117M [00:05<00:00, 22.9MiB/s]
100%|██████████| 130M/130M [00:04<00:00, 27.9MiB/s]
100%|██████████| 134M/134M [00:04<00:00, 32.7MiB/s]
100%|██████████| 130M/130M [00:06<00:00, 21.3MiB/s]
100%|██████████| 118M/118M [00:03<00:00, 32.7M

Processed and saved data for file /path/to/github_events/downloaded/2024-05-01/2024-05-01-21.json.gz


100%|██████████| 74.7M/74.7M [00:03<00:00, 20.8MiB/s]
100%|██████████| 74.1M/74.1M [00:04<00:00, 18.3MiB/s]
100%|██████████| 65.7M/65.7M [00:03<00:00, 19.5MiB/s]
100%|██████████| 82.3M/82.3M [00:05<00:00, 14.4MiB/s]
100%|██████████| 92.5M/92.5M [00:06<00:00, 14.4MiB/s]
100%|██████████| 103M/103M [00:10<00:00, 10.2MiB/s]
100%|██████████| 115M/115M [00:06<00:00, 17.7MiB/s]
100%|██████████| 115M/115M [00:06<00:00, 17.0MiB/s]
100%|██████████| 104M/104M [00:06<00:00, 16.8MiB/s]
100%|██████████| 107M/107M [00:04<00:00, 22.5MiB/s]
 39%|███▉      | 46.9M/119M [00:01<00:02, 35.6MiB/s]ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=45>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/loca

Failed to process file /path/to/github_events/downloaded/2024-05-01/2024-05-01-17.json.gz for 2024-05-01: An error occurred while calling o1023.json


100%|██████████| 119M/119M [00:05<00:00, 22.3MiB/s]
100%|██████████| 121M/121M [00:06<00:00, 18.7MiB/s]
100%|██████████| 131M/131M [00:07<00:00, 18.0MiB/s]
100%|██████████| 125M/125M [00:06<00:00, 18.6MiB/s]
100%|██████████| 117M/117M [00:06<00:00, 18.6MiB/s]
 72%|███████▏  | 88.8M/123M [00:05<00:04, 7.23MiB/s]ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=45>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py

Failed to process file /path/to/github_events/downloaded/2024-05-01/2024-05-01-7.json.gz for 2024-05-01: An error occurred while calling o1028.json


 91%|█████████▏| 113M/123M [00:07<00:00, 11.1MiB/s]

KeyboardInterrupt: 

 93%|█████████▎| 114M/123M [00:07<00:00, 11.8MiB/s]

In [None]:
os.makedirs(zip_directory, exist_ok=True)

In [None]:

def zip_all_subdirectories(output_directory, zip_directory):
    """
    Zips all subdirectories within the specified output directory.

    This function traverses the specified output directory, zipping all subdirectories
    that contain files. Each zip file is saved in a corresponding subdirectory within
    the specified zip directory. The zip files are named based on the relative path
    of the subdirectory, with '/' replaced by '_'.

    Parameters:
    output_directory (str): The base directory containing the subdirectories to be zipped.
    zip_directory (str): The base directory where the zip files will be saved.

    Prints:
    Progress messages, including skipping existing zip files and any errors encountered
    during the zipping process.

    """
    for root, dirs, files in os.walk(output_directory):
        # Process each subdirectory containing files
        if files:
            subdirectory_name = os.path.relpath(root, output_directory)
            zip_file_name = f"{subdirectory_name.replace('/', '_')}.zip"

            # Create the daily subdirectory in the base zip directory
            date_subdirectory = os.path.join(zip_directory, subdirectory_name)
            os.makedirs(date_subdirectory, exist_ok=True)

            zip_file_path = os.path.join(date_subdirectory, zip_file_name)

            # Check if the zip file already exists
            if os.path.exists(zip_file_path):
                print(f"Zip file {zip_file_path} already exists. Skipping...")
                continue

            # Create the zip file
            try:
                with ZipFile(zip_file_path, 'w') as zipf:
                    for file in files:
                        file_path = os.path.join(root, file)
                        zipf.write(file_path, os.path.relpath(file_path, output_directory))
                print(f"Zipped {root} to {zip_file_path}")
            except Exception as e:
                print(f"An error occurred while creating the zip file: {e}")

In [None]:
zip_all_subdirectories(output_directory, zip_directory)

In [None]:
def display_download_links(zip_directory):
    for root, dirs, files in os.walk(zip_directory):
        for file in files:
            if file.endswith('.zip'):
                file_path = os.path.join(root, file)
                display(FileLink(file_path))



In [None]:
display_download_links(zip_directory)