In [1]:
import requests
import os
from datetime import datetime, timedelta

In [4]:
def download_gh_archive_data(start_date, end_date, download_dir="./github_archive", timeout=60):
    """
    Download hourly GH Archive dumps for an inclusive range of days.

    For every day from ``start_date`` to ``end_date`` (both included) the
    function requests 24 files named ``YYYY-MM-DD-H.json.gz`` (``H`` is the
    hour 0..23 without zero padding) from ``https://data.gharchive.org/`` and
    stores them under ``download_dir``.

    Each file is first streamed into ``<name>.part`` and only renamed to its
    final name after the whole body has been received, so an interrupted
    download never leaves a truncated ``.json.gz`` behind.

    A failed request is reported on stdout and skipped; the remaining hours
    are still attempted.

    Args:
        start_date (str): First day to download, formatted 'YYYY-MM-DD'.
        end_date (str): Last day to download (inclusive), formatted 'YYYY-MM-DD'.
        download_dir (str): Directory the files are written to; created if missing.
        timeout (float | tuple): Timeout in seconds passed to ``requests.get``
            so a stalled connection cannot block the loop forever.

    Raises:
        ValueError: If a date string does not match 'YYYY-MM-DD'.
        OSError: If the target directory or a file cannot be written.
    """

    # Make sure the target directory exists.
    os.makedirs(download_dir, exist_ok=True)

    # Parse the boundary dates.
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")

    current = start
    while current <= end:
        for hour in range(24):  # one archive per hour of the day
            # The hour is deliberately not zero-padded in the file name.
            file_name = current.strftime("%Y-%m-%d") + f"-{hour}.json.gz"
            url = f"https://data.gharchive.org/{file_name}"
            file_path = os.path.join(download_dir, file_name)
            # Temporary target; promoted to file_path only on full success.
            part_path = file_path + ".part"

            print(f"Скачиваю: {file_name}...")

            try:
                # The context manager releases the connection even on errors.
                with requests.get(url, stream=True, timeout=timeout) as response:
                    response.raise_for_status()  # turn HTTP 4xx/5xx into exceptions

                    with open(part_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:  # skip empty keep-alive chunks
                                f.write(chunk)

                # Atomic rename: the final name only ever holds a complete file.
                os.replace(part_path, file_path)
                print(f"Успешно: {file_name}")

            except requests.exceptions.RequestException as e:
                print(f"Ошибка при скачивании {file_name}: {e}")

            finally:
                # After a successful rename the .part file no longer exists;
                # after any failure this removes the incomplete leftovers.
                if os.path.exists(part_path):
                    os.remove(part_path)

        # Advance to the next day.
        current += timedelta(days=1)

    print("Загрузка завершена.")

In [None]:
# USAGE:
# Download all 24 hourly archives for a single day, 23 October 2025
# (start and end dates are both inclusive).
download_gh_archive_data(start_date="2025-10-23", end_date="2025-10-23")

In [6]:
from pyspark.sql import SparkSession

# Extra JARs shipped to the Spark session via `spark.jars`.
# NOTE(review): judging by the names these provide S3 (hadoop-aws / AWS SDK)
# and PostgreSQL JDBC support — confirm they are still all required.
drivers = [
    f"/opt/spark/external-jars/{jar_name}"
    for jar_name in (
        "hadoop-aws-3.3.4.jar",
        "aws-java-sdk-bundle-1.12.262.jar",
        "wildfly-openssl-1.0.7.Final.jar",
        "postgresql-42.6.0.jar",
    )
]

# Create (or reuse) the session attached to the standalone cluster master.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("jupyter-spark")
    .config("spark.jars", ",".join(drivers))
    .getOrCreate()
)

25/11/21 21:43:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [15]:
from pyspark.sql import functions as F

In [49]:
# Load every JSON file for 2025-10-23 as line-delimited JSON (one event per
# line), spread the rows over 100 partitions and mark the result for caching.
# NOTE(review): the glob matches `*.json`, while the downloader above saves
# `*.json.gz` — presumably the files are decompressed elsewhere; confirm.
df = (
    spark.read.option("multiline", "false")
    .json("/shared_data/github_archive/2025-10-23*.json")
    .repartition(100)
    .cache()
)

                                                                                

In [52]:
# Keep only the events produced by one GitHub account and cache the subset,
# since several later cells reuse it.
mydf = df.where(F.col("actor.login") == "mustdayker").cache()

In [53]:
# Report how many events matched the filter. `count()` is a Spark action, so
# this is the point where the filtered frame is actually computed.
print(f"Найдено событий: {mydf.count()}")

Найдено событий: 22


In [76]:
# Chronological view of the selected user's events: minute-resolution
# timestamp, event type, actor login and the repository touched.
# Sorting uses the formatted `created_at` alias (ascending is the default).
mydf.select(
    F.date_format("created_at", "yyyy-MM-dd HH:mm").alias("created_at"),
    "type",
    F.col("actor.login").alias("actor_login"),
    F.col("repo.name").alias("repo_name"),
    F.col("repo.url").alias("repo_url"),
).orderBy("created_at").show(30, truncate=False)

+----------------+-----------+-----------+---------------------+--------------------------------------------------+
|created_at      |type       |actor_login|repo_name            |repo_url                                          |
+----------------+-----------+-----------+---------------------+--------------------------------------------------+
|2025-10-23 07:53|CreateEvent|mustdayker |mustdayker/kakoy_kaif|https://api.github.com/repos/mustdayker/kakoy_kaif|
|2025-10-23 08:01|PushEvent  |mustdayker |mustdayker/kakoy_kaif|https://api.github.com/repos/mustdayker/kakoy_kaif|
|2025-10-23 08:10|PushEvent  |mustdayker |mustdayker/kakoy_kaif|https://api.github.com/repos/mustdayker/kakoy_kaif|
|2025-10-23 08:12|PushEvent  |mustdayker |mustdayker/kakoy_kaif|https://api.github.com/repos/mustdayker/kakoy_kaif|
|2025-10-23 17:43|PushEvent  |mustdayker |mustdayker/kakoy_kaif|https://api.github.com/repos/mustdayker/kakoy_kaif|
|2025-10-23 17:59|PushEvent  |mustdayker |mustdayker/kakoy_kaif|https://

In [40]:
# Preview one row with the nested `actor` and `repo` structs flattened into
# `actor_<field>` / `repo_<field>` columns (actor fields first, then repo).
df.select(
    *[
        F.col(f"actor.{field}").alias(f"actor_{field}")
        for field in ("id", "display_login", "login", "url")
    ],
    *[
        F.col(f"repo.{field}").alias(f"repo_{field}")
        for field in ("id", "name", "url")
    ],
).show(1)

+--------------------+----------+--------+-------------------+-----------+--------------------+----------+----------------+--------------------+
|          created_at|      type|actor_id|actor_display_login|actor_login|           actor_url|   repo_id|       repo_name|            repo_url|
+--------------------+----------+--------+-------------------+-----------+--------------------+----------+----------------+--------------------+
|2025-10-23T00:00:00Z|WatchEvent| 3697022|         blakeroyer| blakeroyer|https://api.githu...|1073224795|obra/superpowers|https://api.githu...|
+--------------------+----------+--------+-------------------+-----------+--------------------+----------+----------------+--------------------+
only showing top 1 row



In [35]:
# Inspect which fields the nested `actor` struct exposes.
df.select(F.col("actor")).printSchema()

root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- display_login: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)



In [77]:
# Shut the Spark session down once the analysis is finished; `spark`, `df`
# and `mydf` cannot be used after this cell without recreating the session.
spark.stop()