In [8]:
from pyspark.sql import SparkSession, functions as F

spark = (SparkSession.builder
                    .appName('thailand')
                    .getOrCreate()
        )

In [9]:
df = spark.read.parquet("thailand_domestic_tourism_2019_2023_ver2.parquet")


In [3]:
df

DataFrame[date: timestamp_ntz, province_thai: string, province_eng: string, region_thai: string, region_eng: string, variable: string, value: double]

In [4]:
df.columns

['date',
 'province_thai',
 'province_eng',
 'region_thai',
 'region_eng',
 'variable',
 'value']

In [21]:
df.head(10)

[Row(date=datetime.datetime(2019, 1, 1, 0, 0), province_thai='กรุงเทพมหานคร', province_eng='Bangkok', region_thai='ภาคกลาง', region_eng='central', variable='ratio_tourist_stay', value=93.37),
 Row(date=datetime.datetime(2019, 1, 1, 0, 0), province_thai='ลพบุรี', province_eng='Lopburi ', region_thai='ภาคกลาง', region_eng='central', variable='ratio_tourist_stay', value=61.32),
 Row(date=datetime.datetime(2019, 1, 1, 0, 0), province_thai='พระนครศรีอยุธยา', province_eng='Phra Nakhon Si Ayutthaya ', region_thai='ภาคกลาง', region_eng='central', variable='ratio_tourist_stay', value=73.37),
 Row(date=datetime.datetime(2019, 1, 1, 0, 0), province_thai='สระบุรี', province_eng='Saraburi ', region_thai='ภาคกลาง', region_eng='central', variable='ratio_tourist_stay', value=67.33),
 Row(date=datetime.datetime(2019, 1, 1, 0, 0), province_thai='ชัยนาท', province_eng='Chainat ', region_thai='ภาคกลาง', region_eng='central', variable='ratio_tourist_stay', value=79.31),
 Row(date=datetime.datetime(2019, 1,

In [10]:
pivoted_df = df.groupBy("date", "province_eng") \
    .pivot("variable") \
    .agg(F.first("value"))

In [12]:
pivoted_df.head()

Row(date=datetime.datetime(2019, 5, 1, 0, 0), province_eng='Ubon Ratchathani ', no_tourist_all=282088.0, no_tourist_foreign=12953.0, no_tourist_stay=111533.0, no_tourist_thai=269135.0, ratio_tourist_stay=65.74, revenue_all=645720000.0, revenue_foreign=51370000.0, revenue_thai=594350000.0)

In [13]:
result_df = pivoted_df.select(
    "date",
    "province_eng",
    (F.col("no_tourist_foreign") / F.col("no_tourist_all")).alias("no_percentage_of_foreign_tourists"),
    (F.col("revenue_foreign") / F.col("revenue_all")).alias("revenue_percentage_of_foreign_tourists")
)

In [17]:
import json

rows = result_df.toJSON().collect()

with open('output.json', 'w') as f:
    json.dump(rows, f)

In [None]:
df_final = (df_no_all
    .join(df_no_foreign, on=["date", "province_eng"])
    .join(df_rev_all, on=["date", "province_eng"])
    .join(df_rev_foreign, on=["date", "province_eng"])
    .selectExpr(
        "date"
        ,"province_eng"
        ,"no_tourist_foreign / no_tourist_all as no_percentage_of_foreign_tourists"
        ,"revenue_foreign / revenue_all as revenue_percentage_of_foreign_tourists"
    )
)

# Write to JSON file
df_final.write.json("challenge.json")