In [1]:
from pyspark.sql import SparkSession
import os
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('TestSession')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [2]:
# Base directory (relative path) from which the per-dataset sub-folder paths
# are built by plain string concatenation, hence the trailing '/'.
WEEKLY_DATA_PATH = r'../data/weekly_data/' 

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType

In [None]:
# Top sellers: load one weekly top-sellers CSV using an explicit schema.
schema_top_sellers = StructType([
    StructField("Rank", IntegerType(), True),
    StructField("Game Name", StringType(), True),
    StructField("Free to Play", IntegerType(), True),
    StructField("App ID", IntegerType(), True),
    StructField("Collection Date", StringType(), True),
])

WEEKLY_TOP_SELLERS_PATH = WEEKLY_DATA_PATH + r'top_sellers/'
# os.listdir returns entries in arbitrary order; sort so that the file picked
# below is the same on every run and on every filesystem.
files = sorted(os.listdir(WEEKLY_TOP_SELLERS_PATH))

FILE_DATE = None
top_sellers = None  # stays None when no CSV could be loaded
csv_file = [f for f in files if f.endswith('.csv')]

if not csv_file:
    # Report the empty folder explicitly instead of surfacing an IndexError
    # disguised as a read error.
    print("No CSV files found in", WEEKLY_TOP_SELLERS_PATH)
else:
    file = csv_file[0]
    # Filename text before the first '.' and then before the first '_'.
    # NOTE(review): presumably the collection date - confirm against the
    # file naming convention.
    FILE_DATE = file.split('.')[0].split('_')[0]
    try:
        top_sellers = spark.read.csv(
            WEEKLY_TOP_SELLERS_PATH + file,
            header=True,
            schema=schema_top_sellers,
        )
        top_sellers.cache()
        top_sellers.show()
    except Exception as e:
        print("An error occurred while reading the CSV file:", e)

In [44]:
# News: read every per-app news JSON file with an explicit schema and union
# the results into a single DataFrame (merged_df).
weekly_top_news_schema = StructType([
    StructField("appnews", StructType([
        StructField("appid", IntegerType(), nullable=False),
        StructField("newsitems", ArrayType(StructType([
            StructField("gid", StringType(), nullable=False),
            StructField("title", StringType(), nullable=False),
            StructField("url", StringType(), nullable=False),
            StructField("is_external_url", BooleanType(), nullable=False),
            StructField("author", StringType(), nullable=False),
            StructField("contents", StringType(), nullable=False),
            StructField("feedlabel", StringType(), nullable=False),
            StructField("date", IntegerType(), nullable=False),
            StructField("feedname", StringType(), nullable=False),
            StructField("feed_type", IntegerType(), nullable=False),
            StructField("appid", IntegerType(), nullable=False),
            StructField("tags", ArrayType(StringType(), containsNull=False), nullable=True)
        ]), containsNull=True), nullable=False),
        StructField("count", IntegerType(), nullable=False)
    ]), nullable=True)
])

merged_df = None  # stays None when no JSON file could be loaded
WEEKLY_TOP_NEWS_PATH = WEEKLY_DATA_PATH + r'news/'
# Sort the listing so the union order (and the last file read) is deterministic;
# os.listdir itself returns entries in arbitrary order.
files = sorted(os.listdir(WEEKLY_TOP_NEWS_PATH))
json_files = [pos_json for pos_json in files if pos_json.endswith('.json')]

if not json_files:
    print('No json files found')

try:
    for file in json_files:
        # Each file holds one multi-line JSON document.
        steam_game_news = spark.read.json(
            WEEKLY_TOP_NEWS_PATH + file,
            multiLine=True,
            schema=weekly_top_news_schema,
        )
        if merged_df is None:
            merged_df = steam_game_news
        else:
            merged_df = merged_df.union(steam_game_news)
except Exception as e:
    # Report the real failure instead of claiming that no files exist; files
    # merged before the failure are kept in merged_df.
    print('An error occurred while reading the JSON files:', e)


In [45]:
# Preview the merged news DataFrame. merged_df is still None when the loading
# cell read no JSON file, so guard the call instead of raising AttributeError.
if merged_df is not None:
    merged_df.show(5)
else:
    print('merged_df is None: no news JSON files were loaded')

+--------------------+
|             appnews|
+--------------------+
|{1086940, [{51245...|
|{1172470, [{51245...|
|{1282100, [{51245...|
|{1675200, [{51245...|
|{1895880, [{51245...|
+--------------------+
only showing top 5 rows



In [43]:
# Show the DataFrame of the last news file read. steam_game_news is the loop
# variable left over from the news-loading cell, so that cell must have run
# (and read at least one JSON file) before this one.
steam_game_news.show()

+--------------------+
|             appnews|
+--------------------+
|{730, [{512458531...|
+--------------------+



In [None]:
# Reviews: load the first weekly review text file as a DataFrame of lines.
WEEKLY_TOP_10_REVIEWS_PATH = WEEKLY_DATA_PATH + r'reviews/'
# List the reviews folder itself (not the parent data folder) so the filenames
# found here actually exist under WEEKLY_TOP_10_REVIEWS_PATH. Sorted because
# os.listdir returns entries in arbitrary order.
files = sorted(os.listdir(WEEKLY_TOP_10_REVIEWS_PATH))

reviews = None  # stays None when no review file could be loaded
txt_files = [f for f in files if f.endswith('.txt')]

if not txt_files:
    print("No .txt review files found in", WEEKLY_TOP_10_REVIEWS_PATH)
else:
    # Only the first file is loaded.
    file = txt_files[0]
    try:
        reviews = spark.read.text(WEEKLY_TOP_10_REVIEWS_PATH + file)
        reviews.cache()
        reviews.show()
    except Exception as e:
        print("An error occurred while reading the review text file:", e)