In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os
# Create (or reuse, if one already exists) the Spark session named
# 'TestSession' that every later cell uses via `spark`.
spark = (
    SparkSession.builder
    .appName('TestSession')
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary as cell output.
spark

In [48]:
# Handle to the SparkContext that backs the `spark` session created above.
sc = spark.sparkContext

In [57]:
# Root folder (relative to the notebook) holding the weekly data drops; the
# per-dataset paths below are built by appending 'top_sellers/' and 'news/'.
weekly_data_path = r'../data/weekly_data/' 


In [90]:
from pyspark.sql.types import StructType,DateType,LongType, StructField, StringType, IntegerType, BooleanType, ArrayType


In [62]:

# Explicit schema for the weekly top-sellers CSV files. "Price" is read as a
# string because the "$" character is stripped from it before it is cast to
# an integer in the loop below.
schema_top_sellers = StructType([
    StructField("Rank", IntegerType(), True),
    StructField("Game Name", StringType(), True),
    StructField("Free to Play", IntegerType(), True),
    StructField("App ID", IntegerType(), True),
    StructField("Collection Date", DateType(), True),
    StructField("Price", StringType(), True)
    ])
WEEKLY_TOP_SELLERS_PATH = weekly_data_path + r'top_sellers/'
files = os.listdir(WEEKLY_TOP_SELLERS_PATH)

FILE_DATE = None
# Defined up front so the name exists even when no CSV file is found.
top_sellers = None
try:
    # Sorted so the processing order (and therefore which file is left in
    # `top_sellers` after the loop) does not depend on os.listdir ordering.
    csv_files = sorted(f for f in files if f.endswith('.csv'))
    if not csv_files:
        print("No CSV files found in", WEEKLY_TOP_SELLERS_PATH)

    for csv_file in csv_files:
        # Read the file currently being iterated (the previous code referenced
        # an unrelated `file` variable here).
        top_sellers = (
            spark.read.csv(
                os.path.join(WEEKLY_TOP_SELLERS_PATH, csv_file),
                header=True,
                schema=schema_top_sellers
            )
            # Drop the "$" and cast to integer; "Price" is already a string
            # per the schema, so no preliminary string cast is needed.
            .withColumn(
                'Price',
                regexp_replace(col('Price'), '\\$', "").cast(IntegerType())
            )
            # Flip the 0/1 flag: a raw 0 becomes 1, anything else becomes 0.
            # NOTE(review): presumably the raw export uses 0 to mean "free" -
            # confirm against the source data.
            .withColumn(
                'Free to Play',
                when(col('Free to Play') == 0, 1).otherwise(0)
            )
        )
        # Cache before show() so the displayed rows are not recomputed by
        # later actions on the same frame.
        top_sellers.cache()
        top_sellers.show()
        print(top_sellers.dtypes)

except Exception as e:
    print("An error occurred while reading the CSV file:", e)

+----+--------------------+------------+-------+---------------+-----+
|Rank|           Game Name|Free to Play| App ID|Collection Date|Price|
+----+--------------------+------------+-------+---------------+-----+
|   1|      Baldurs Gate 3|           0|1086940|     2023-08-15|   59|
|   2|CounterStrike Glo...|           1|    730|     2023-08-15|    0|
|   3|           Starfield|           0|1716740|     2023-08-15|   69|
|   4|       Madden NFL 24|           0|2140330|     2023-08-15|   69|
|   5|          Steam Deck|           0|1675200|     2023-08-15|  399|
|   6|The Texas Chain S...|           0|1433140|     2023-08-15|   39|
|   7| Bomb Rush Cyberfunk|           0|1353230|     2023-08-15|   39|
|   8|           Wayfinder|           0|1171690|     2023-08-15|   19|
|   9|        Call of Duty|           0|1938090|     2023-08-15|    0|
|  10|        Apex Legends|           1|1172470|     2023-08-15|    0|
|  11|            Lost Ark|           1|1599340|     2023-08-15|    0|
|  12|

In [111]:
#news
'''weekly_top_news_schema = StructType([
    StructField("appnews", StructType([
        StructField("appid", IntegerType(), nullable=False),
        StructField("newsitems", ArrayType(StructType([
            StructField("gid", StringType(), nullable=False),
            StructField("title", StringType(), nullable=False),
            StructField("url", StringType(), nullable=False),
            StructField("is_external_url", BooleanType(), nullable=False),
            StructField("author", StringType(), nullable=False),
            StructField("contents", StringType(), nullable=False),
            StructField("feedlabel", StringType(), nullable=False),
            StructField("date", IntegerType(), nullable=False),
            StructField("feedname", StringType(), nullable=False),
            StructField("feed_type", IntegerType(), nullable=False),
            StructField("appid", IntegerType(), nullable=False),
            StructField("tags", ArrayType(StringType(), containsNull=False), nullable=True)
        ]), containsNull=True), nullable=False),
        StructField("count", IntegerType(), nullable=False)
    ]), nullable=True)
])
'''
# Accumulates the flattened news rows of every file; stays None when nothing
# could be read.
merged_df = None
WEEKLY_TOP_NEWS_PATH = weekly_data_path + r'news/'
files = os.listdir(WEEKLY_TOP_NEWS_PATH)

try:
    # Sorted so the row order of the merged frame is reproducible across runs.
    for file in sorted(files):
        steam_game_news = spark.read.json(
            os.path.join(WEEKLY_TOP_NEWS_PATH, file),
            multiLine=True
        )
        # One output row per element of appnews.newsitems.
        df_expanded = steam_game_news.select(
            "appnews.appid",
            explode("appnews.newsitems").alias("newsitem")
        )
        # Keep the per-file frame in its own variable: assigning it straight
        # to `merged_df` (as before) discarded earlier files and then unioned
        # the last file with itself, duplicating its rows.
        file_news_df = df_expanded.select(
            col("appid"),
            col("newsitem.gid").alias("gid"),
            col("newsitem.title").alias("title"),
            col("newsitem.contents").alias("News")
        )

        if merged_df is None:
            merged_df = file_news_df
        else:
            merged_df = merged_df.union(file_news_df)

    if merged_df is None:
        print('No json files found')

except Exception as e:
    # Report the real failure instead of claiming that no files were found.
    print("An error occurred while reading the JSON files:", e)


In [114]:
merged_df.show()

+-------+-------------------+--------------------+--------------------+--------------------+
|  appid|                gid|               title|                News|                 url|
+-------+-------------------+--------------------+--------------------+--------------------+
|1172470|5127964289174700621|Apex Legends Asia...|https://www.youtu...|https://steamstor...|
|1172470|5127964289172274276|Apex Legends™ EA ...|Join EA Play to m...|https://steamstor...|
|1172470|5839532486225989565|Apex Legends almo...|<strong><a href="...|https://steamstor...|
|1172470|5839532486225989568|Apex Legends isn'...|As many of you pl...|https://steamstor...|
|1172470|5839532486225761017|Apex Legends is t...|For <a href="http...|https://steamstor...|
|1172470|5839532486211363026|Apex Legends shoc...|Fans of Respawn E...|https://steamstor...|
|1172470|5839532486210677808|Apex Legends Seas...|<strong>When is t...|https://steamstor...|
|1172470|5124585319868120117|APEX LEGENDS™: RE...|LEGEND UPDATE: RE...

In [None]:
# Schema for the weekly top-20 reviews data: three nullable columns, an
# integer "App Id" plus "Review" and "Voted Up" kept as strings.
schema_weekly_top_20_reviews = (
    StructType()
    .add("App Id", IntegerType(), True)
    .add("Review", StringType(), True)
    .add("Voted Up", StringType(), True)
)