In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Extra JARs handed to Spark through `spark.jars`: hadoop-aws, the AWS SDK
# bundle, wildfly-openssl and the PostgreSQL JDBC driver.
drivers = [
    "/opt/spark/external-jars/hadoop-aws-3.3.4.jar",
    "/opt/spark/external-jars/aws-java-sdk-bundle-1.12.262.jar",
    "/opt/spark/external-jars/wildfly-openssl-1.0.7.Final.jar",
    "/opt/spark/external-jars/postgresql-42.6.0.jar",
]

# `spark.jars` takes a single comma-separated string, not a Python list.
spark_jars = ",".join(drivers)

# getOrCreate() returns the already running session if there is one,
# otherwise it starts a new one against the configured master.
session_builder = SparkSession.builder.appName("jupyter-spark")
session_builder = session_builder.master("spark://spark-master:7077")
session_builder = session_builder.config("spark.jars", spark_jars)
spark = session_builder.getOrCreate()

25/11/23 19:17:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Парсим нестандартный JSON в датафрейм

In [None]:
import json
from pyspark.sql.types import *

In [None]:
# Simplified flat schema without complex nested structures.
# Every column is declared nullable. The long free-text fields
# (about_the_game, short_description, detailed_description) are currently
# not part of the schema.
simple_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("app_id", StringType()),
        ("name", StringType()),
        ("release_date", StringType()),
        ("required_age", IntegerType()),
        ("price", DoubleType()),
        ("developer", StringType()),
        ("publisher", StringType()),
        ("genres", ArrayType(StringType())),
        ("positive", IntegerType()),
        ("negative", IntegerType()),
        ("user_score", IntegerType()),
        ("score_rank", StringType()),
        ("windows", BooleanType()),
        ("mac", BooleanType()),
        ("linux", BooleanType()),
        ("reviews", StringType()),
        ("metacritic_score", IntegerType()),
        ("metacritic_url", StringType()),
        ("average_playtime_forever", IntegerType()),
        ("average_playtime_2weeks", IntegerType()),
        ("median_playtime_forever", IntegerType()),
        ("median_playtime_2weeks", IntegerType()),
        ("peak_ccu", IntegerType()),
    )
])

def parse_simple_games(file_content):
    """Flatten the games JSON document into a list of per-game dicts.

    The content is expected to be one JSON object that maps each app id to a
    dict with that game's attributes. Only the basic fields are extracted;
    the first entry of ``developers`` / ``publishers`` is kept as
    ``developer`` / ``publisher``.

    Args:
        file_content: Full text of the JSON file.

    Returns:
        A list with one dict per game. Missing string and integer fields are
        ``None`` (so they become NULL instead of a stray ``False``), missing
        platform flags are ``False`` and missing ``genres`` is ``[]``.
        An empty list is returned when the content cannot be processed.
    """
    try:
        data = json.loads(file_content)
        results = []
        for app_id, game_data in data.items():
            # A DoubleType column only accepts Python floats, so widen a
            # whole-number price such as 0 or 5 before schema verification.
            price = game_data.get('price')
            if isinstance(price, int) and not isinstance(price, bool):
                price = float(price)

            # `or [None]` covers both a missing key and an empty list.
            developers = game_data.get('developers') or [None]
            publishers = game_data.get('publishers') or [None]

            results.append({
                'app_id': app_id,
                'name': game_data.get('name'),
                'release_date': game_data.get('release_date'),
                'required_age': game_data.get('required_age'),
                'price': price,
                'developer': developers[0],
                'publisher': publishers[0],
                'genres': game_data.get('genres', []),
                'positive': game_data.get('positive'),
                'negative': game_data.get('negative'),
                'user_score': game_data.get('user_score'),
                'score_rank': game_data.get('score_rank'),
                # Platform flags are booleans, so False is a valid default.
                'windows': game_data.get('windows', False),
                'mac': game_data.get('mac', False),
                'linux': game_data.get('linux', False),
                # Everything below is a string or integer column: a missing
                # value must be None, not False.
                'reviews': game_data.get('reviews'),
                'metacritic_score': game_data.get('metacritic_score'),
                'metacritic_url': game_data.get('metacritic_url'),
                'average_playtime_forever': game_data.get('average_playtime_forever'),
                'average_playtime_2weeks': game_data.get('average_playtime_2weeks'),
                'median_playtime_forever': game_data.get('median_playtime_forever'),
                'median_playtime_2weeks': game_data.get('median_playtime_2weeks'),
                'peak_ccu': game_data.get('peak_ccu'),
            })
        return results
    except Exception as e:
        # Deliberately best-effort: an unreadable file yields no rows rather
        # than failing the whole Spark job.
        print(f"Error parsing: {e}")
        return []

# wholeTextFiles yields (path, content) pairs; only the content is parsed.
games_json_path = "/shared_data/steam_games_dataset/games.json"
json_rdd = spark.sparkContext.wholeTextFiles(games_json_path)

# One input file expands into many rows, hence flatMap.
parsed_rdd = json_rdd.flatMap(
    lambda path_and_text: parse_simple_games(path_and_text[1])
)
df_simple = spark.createDataFrame(parsed_rdd, schema=simple_schema)

# Quick sanity check: row count and one untruncated sample row.
total_games = df_simple.count()
print(f"Total games: {total_games}")
df_simple.show(1, truncate=False)

# Записываем датафрейм в привычный parquet

In [None]:
# Persist the flattened games as snappy-compressed Parquet, replacing any
# previous contents of the target path.
(
    df_simple.write
    .format("parquet")
    .mode("overwrite")
    .option("compression", "snappy")
    # The dictionary-encoding switch is spelled "parquet.enable.dictionary";
    # "parquet.dictionary.enabled" is not a recognised key and is ignored.
    .option("parquet.enable.dictionary", "true")
    # Without a "#column" suffix this applies to every column.
    .option("parquet.bloom.filter.enabled", "true")
    .save("s3a://bronze/steam/raw")
)

# Читаем parquet, оставляем только нужную информацию и пишем в PostgreSQL

In [2]:
# Load the Parquet dataset stored at s3a://bronze/steam/raw.
df = spark.read.parquet("s3a://bronze/steam/raw")

25/11/23 19:18:16 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [3]:
# Row count of the loaded dataset (this triggers a Spark job).
df.count()

                                                                                

111452

In [54]:
# Parse the free-text release date. The candidates are tried in order and
# the first non-null result wins:
#   1. full dates such as "Dec 29, 2023";
#   2. month + year such as "Jun 2009", pinned to the 1st of that month;
#   3. to_date without a pattern (Spark's default string-to-date cast).
release_date_raw = F.col("release_date")
full_date = F.to_date("release_date", "MMM d, yyyy")
month_year_date = F.when(
    release_date_raw.rlike("[A-Za-z]{3} \\d{4}"),
    F.to_date(F.concat(F.lit("1 "), "release_date"), "d MMM yyyy"),
)
fallback_date = F.to_date("release_date")

# Columns exported to Postgres. Not exported: the raw release_date (replaced
# by release_date_p), required_age, genres, reviews, metacritic_url,
# average_playtime_2weeks, median_playtime_forever, median_playtime_2weeks.
postgres_columns = [
    'app_id',
    'name',
    'release_date_p',
    'price',
    'developer',
    'publisher',
    'positive',
    'negative',
    'user_score',
    'score_rank',
    'windows',
    'mac',
    'linux',
    'metacritic_score',
    'average_playtime_forever',
    'peak_ccu',
]

df_to_postgre = (
    df
    .withColumn(
        "release_date_p",
        F.coalesce(full_date, month_year_date, fallback_date),
    )
    .select(postgres_columns)
)

In [56]:
# Credentials come from the environment so real secrets never have to be
# committed with the notebook. The fallbacks keep the previous values, so
# the notebook still runs unchanged when the variables are not set.
postgres_user = os.environ.get("POSTGRES_USER", "airflow")
postgres_password = os.environ.get("POSTGRES_PASSWORD", "airflow")

# Write the trimmed dataset to steam.games_database over JDBC.
# mode("overwrite") replaces whatever the table held before.
(
    df_to_postgre.write
    .format("jdbc")
    .option("url", "jdbc:postgresql://postgres-db:5432/learn_base")
    .option("driver", "org.postgresql.Driver")
    .option("user", postgres_user)
    .option("password", postgres_password)
    .option("dbtable", "steam.games_database")
    .mode("overwrite")
    .save()
)

                                                                                

In [57]:
# Shut down the Spark session now that the notebook's work is done.
spark.stop()