In [1]:
# Spark Code to process Daily Data

'''
Input File:
    - daily_data/ *.csv

Produces the following csv files:
    - top_20
    - top_free
    - top_not_free
'''

'\nInput File:\n    - daily_data/ *.csv\n\nProduces the following csv files:\n    - top_20\n    - top_free\n    - top_not_free\n'

In [2]:
from pyspark.sql import SparkSession
import os
# Obtain the SparkSession for this notebook: getOrCreate() returns the session
# that is already active if there is one, otherwise it starts a new session
# with the application name 'daily_spark'.
spark = SparkSession.builder.appName('daily_spark').getOrCreate()
#spark

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType

# Explicit schema for the daily "most played" CSV, built programmatically
# from StructType/StructField objects. Every column is declared nullable.
# NOTE(review): "Peek Today" is presumably a misspelling of "Peak Today";
# it is kept as-is because the sorting cell refers to the column by this
# exact name.
schema = (
    StructType()
    .add("Rank", IntegerType(), True)
    .add("Game Name", StringType(), True)
    .add("Free to Play", IntegerType(), True)
    .add("Current Players", IntegerType(), True)
    .add("Peek Today", IntegerType(), True)
    .add("Collection Date", DateType(), True)
)

In [4]:
# Load the daily "most played" CSV into `most_daily_played`.
#
# Exactly one file is read: the first .csv in DAILY_DATA_PATH after sorting
# the file names, so the choice no longer depends on the arbitrary order
# returned by os.listdir.
# NOTE(review): the directory presumably holds a single daily file; confirm
# that "first by name" is the right pick if several can be present.
DAILY_DATA_PATH = r'../data/daily_data/most_played/'
files = os.listdir(DAILY_DATA_PATH)

FILE_DATE = None

csv_file = sorted(f for f in files if f.endswith('.csv'))
if not csv_file:
    # Fail with a clear message instead of an IndexError on csv_file[0].
    raise FileNotFoundError(f"No .csv file found in {DAILY_DATA_PATH}")

file = csv_file[0]
try:
    most_daily_played = spark.read.csv(
        os.path.join(DAILY_DATA_PATH, file), header=True, schema=schema
    )
except Exception as e:
    print("An error occurred while reading the CSV file:", e)
    # Re-raise: continuing would leave `most_daily_played` undefined (or, in
    # a live kernel, bound to a stale DataFrame from an earlier run), and the
    # following cells would then fail or silently process old data.
    raise


In [5]:
# Clean the "Game Name" column by removing trademark / registered symbols.
from pyspark.sql.functions import regexp_replace, col

special_characters = ["™", "®"]

# Build one nested replacement expression (one regexp_replace per symbol,
# applied in list order) and write it back to the column in a single step.
cleaned_name = col("Game Name")
for symbol in special_characters:
    cleaned_name = regexp_replace(cleaned_name, symbol, "")
most_daily_played = most_daily_played.withColumn("Game Name", cleaned_name)

# Mark the cleaned DataFrame for caching; it is reused by the cells below.
# Uncomment the .show() line to inspect the rows while developing.
#most_daily_played.show()
most_daily_played.cache()

DataFrame[Rank: int, Game Name: string, Free to Play: int, Current Players: int, Peek Today: int, Collection Date: date]

In [6]:
# Split the cleaned data on the "Free to Play" flag: 1 -> free, 0 -> not free.
# Rows whose flag is neither 1 nor 0 end up in neither DataFrame.
free_flag = most_daily_played["Free to Play"]
free_to_play_df = most_daily_played.where(free_flag == 1)
not_free_to_play_df = most_daily_played.where(free_flag == 0)

# Order each subset by the "Peek Today" column (ascending, the default).
# NOTE(review): the outputs are named "top_*" yet the smallest values come
# first; confirm whether a descending sort was intended.
free_to_play_sorted = free_to_play_df.sort("Peek Today")
not_free_to_play_sorted = not_free_to_play_df.sort("Peek Today")

In [7]:
# Output directories, one per result set.
path_top_20 = r"../saved_data/daily_data/top_20"
path_top_free = r"../saved_data/daily_data/top_free"
path_top_not_free = r"../saved_data/daily_data/top_not_free"

# Write each DataFrame as CSV with a header row, replacing any previous output.
# NOTE(review): "top_20" receives the whole cleaned DataFrame; no limit(20) is
# applied here, so the input presumably already holds 20 rows. Confirm.
outputs = [
    (most_daily_played, path_top_20),
    (free_to_play_sorted, path_top_free),
    (not_free_to_play_sorted, path_top_not_free),
]
for frame, target in outputs:
    frame.write.format("csv").mode("overwrite").option("header", "true").save(target)

In [8]:
# Stop the SparkSession as the final step of the notebook.
# NOTE(review): any cell run after this one that uses `spark` will need a new
# session to be created first.
spark.stop()