In [12]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession

load_dotenv()

BRONZE_PATH = os.environ.get("BRONZE_PATH")
SILVER_PATH = os.environ.get("SILVER_PATH")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY")

bronze_path_exchange = f"s3a:/{BRONZE_PATH}/vnstock3/symbol_exchange"
bronze_path_industry = f"s3a:/{BRONZE_PATH}/vnstock3/symbol_industry"
silver_path = f"s3a:/{SILVER_PATH}/vnstock3/symbol"

# Setup Spark sesssion
spark = (SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("CompanyDataProcess")
    .getOrCreate()
)

print(bronze_path_exchange)
print(bronze_path_industry)
print(silver_path)

spark

s3a://dev/data/bronze/vnstock3/symbol_exchange
s3a://dev/data/bronze/vnstock3/symbol_industry
s3a://dev/data/silver/vnstock3/company_info/


In [18]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType, IntegerType, DateType
from pyspark.sql.functions import col, from_unixtime

In [21]:
schema_exchange = StructType([
    StructField("symbol", StringType(), False),
    StructField("id", IntegerType(), True),
    StructField("type", StringType(), True),
    StructField("exchange", StringType(), True),
    StructField("en_organ_name", StringType(), True),
    StructField("en_organ_short_name", StringType(), True),
    StructField("organ_short_name", StringType(), True),
    StructField("organ_name", StringType(), True),
    StructField("loaded_timestamp", StringType(), True),
    StructField("updated_at", DateType(), True)
])

raw_df_exchange = spark.read.schema(schema_exchange).json(bronze_path_exchange)

raw_df_exchange = (raw_df_exchange
    .withColumn(
        "loaded_timestamp",
        from_unixtime(col("loaded_timestamp").cast("long") / 1000).cast(TimestampType())  # Convert milliseconds to seconds
    )
)


In [22]:
raw_df_exchange.show()

+------+--------+-----+--------+--------------------+--------------------+--------------------+--------------------+-------------------+----------+
|symbol|      id| type|exchange|       en_organ_name| en_organ_short_name|    organ_short_name|          organ_name|   loaded_timestamp|updated_at|
+------+--------+-----+--------+--------------------+--------------------+--------------------+--------------------+-------------------+----------+
|   YTC| 8425620|STOCK|   UPCOM|Ho Chi Minh City ...|               YTECO|     XNK Y tế TP.HCM|Công ty Cổ phần X...|2024-12-21 07:52:52|2024-12-21|
|   YEG| 8424579|STOCK|     HSX|Yeah1 Group Corpo...|         Yeah1 Group|      Tập đoàn Yeah1|Công ty Cổ phần T...|2024-12-21 07:52:52|2024-12-21|
|   YBM| 8424470|STOCK|     HSX|Yen Bai Industry ...|Yen Bai Industry ...|Khoáng sản CN Yên...|Công ty Cổ phần K...|2024-12-21 07:52:52|2024-12-21|
|   YBC| 8425710|STOCK|   UPCOM|YenBai Cement and...|YenBai Cement and...|Xi măng và Khoáng...|Công ty Cổ phần X

In [25]:
# drop duplicate
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [27]:
window_spec = Window.partitionBy("symbol").orderBy(col("updated_at"), col("loaded_timestamp").desc())

df_exchange = raw_df_exchange.withColumn("row_number",
                  row_number().over(window_spec))

df_exchange = df_exchange.filter(col("row_number") == 1) \
                .drop("row_number")

In [36]:
df_exchange.filter((col("symbol") == "TV2")).show(20, truncate=False)

+------+-------+-----+--------+--------------------------------------------------+------------------------------+----------------+--------------------------------------+----------------+----------+
|symbol|id     |type |exchange|en_organ_name                                     |en_organ_short_name           |organ_short_name|organ_name                            |loaded_timestamp|updated_at|
+------+-------+-----+--------+--------------------------------------------------+------------------------------+----------------+--------------------------------------+----------------+----------+
|TV2   |8425012|STOCK|HSX     |Power Engineering Consulting Joint Stock Company 2|Power Engineering Consulting 2|Tư vấn XD Điện 2|Công ty Cổ phần Tư vấn Xây dựng Điện 2|null            |2024-11-01|
+------+-------+-----+--------+--------------------------------------------------+------------------------------+----------------+--------------------------------------+----------------+----------+



In [33]:
df_exchange.count()

2625

In [37]:
df_exchange.select('type').distinct().collect()

[Row(type='ETF'),
 Row(type='FU'),
 Row(type='UNIT_TRUST'),
 Row(type='DEBENTURE'),
 Row(type='CW'),
 Row(type='BOND'),
 Row(type='STOCK')]

In [40]:
df_exchange.where(col('type')=='STOCK').select('symbol').distinct().count()

1703

In [23]:
schema_industry = StructType([
    StructField("symbol", StringType(), False),
    StructField("organ_name", StringType(), True),
    StructField("en_organ_name", StringType(), True),
    StructField("icb_name3", StringType(), True),
    StructField("en_icb_name3", StringType(), True),
    StructField("icb_name2", StringType(), True),
    StructField("en_icb_name2", StringType(), True),
    StructField("icb_name4", StringType(), True),
    StructField("en_icb_name4", StringType(), True),
    StructField("com_type_code", StringType(), True),
    StructField("icb_code1", StringType(), True),
    StructField("icb_code2", StringType(), True),
    StructField("icb_code3", StringType(), True),
    StructField("icb_code4", StringType(), True),
    StructField("loaded_timestamp", StringType(), True),
    StructField("updated_at", DateType(), True)
])

raw_df_industry = spark.read.schema(schema_industry).json(bronze_path_industry)

In [29]:
df_industry = raw_df_industry.withColumn("row_number",
                  row_number().over(window_spec))

df_industry = df_industry.filter(col("row_number") == 1) \
                .drop("row_number")

In [30]:
df_industry.filter((col("symbol") == "TV2")).show(20, truncate=False)

+------+--------------------------------------+--------------------------------------------------+--------------------------+----------------+--------------------------+---------------------------+------------------+-------------------------+-------------+---------+---------+---------+---------+----------------+----------+
|symbol|organ_name                            |en_organ_name                                     |icb_name3                 |en_icb_name3    |icb_name2                 |en_icb_name2               |icb_name4         |en_icb_name4             |com_type_code|icb_code1|icb_code2|icb_code3|icb_code4|loaded_timestamp|updated_at|
+------+--------------------------------------+--------------------------------------------------+--------------------------+----------------+--------------------------+---------------------------+------------------+-------------------------+-------------+---------+---------+---------+---------+----------------+----------+
|TV2   |Công ty Cổ phần T

In [32]:
df_industry.count()

1594

In [41]:
df_exchange.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- exchange: string (nullable = true)
 |-- en_organ_name: string (nullable = true)
 |-- en_organ_short_name: string (nullable = true)
 |-- organ_short_name: string (nullable = true)
 |-- organ_name: string (nullable = true)
 |-- loaded_timestamp: timestamp (nullable = true)
 |-- updated_at: date (nullable = true)



In [42]:
df_industry.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- organ_name: string (nullable = true)
 |-- en_organ_name: string (nullable = true)
 |-- icb_name3: string (nullable = true)
 |-- en_icb_name3: string (nullable = true)
 |-- icb_name2: string (nullable = true)
 |-- en_icb_name2: string (nullable = true)
 |-- icb_name4: string (nullable = true)
 |-- en_icb_name4: string (nullable = true)
 |-- com_type_code: string (nullable = true)
 |-- icb_code1: string (nullable = true)
 |-- icb_code2: string (nullable = true)
 |-- icb_code3: string (nullable = true)
 |-- icb_code4: string (nullable = true)
 |-- loaded_timestamp: string (nullable = true)
 |-- updated_at: date (nullable = true)



In [45]:
# Perform the left outer join
df_joined = df_exchange.join(
    df_industry,
    on="symbol",  # Column to join on
    how="left"    # Left outer join
)

# Select or rename columns to remove ambiguity
df = df_joined.select(
    df_exchange["symbol"],
    df_exchange["id"],
    df_exchange["type"],
    df_exchange["exchange"],
    df_exchange["en_organ_name"].alias("exchange_en_organ_name"),
    df_exchange["en_organ_short_name"],
    df_exchange["organ_short_name"],
    df_exchange["organ_name"].alias("exchange_organ_name"),
    df_industry["icb_name3"],
    df_industry["en_icb_name3"],
    df_industry["icb_name2"],
    df_industry["en_icb_name2"],
    df_industry["icb_name4"],
    df_industry["en_icb_name4"],
    df_industry["com_type_code"],
    df_industry["icb_code1"],
    df_industry["icb_code2"],
    df_industry["icb_code3"],
    df_industry["icb_code4"],
    df_exchange["loaded_timestamp"].alias("exchange_loaded_timestamp"),
    df_industry["loaded_timestamp"].alias("industry_loaded_timestamp"),
    df_exchange["updated_at"]
)


+------+--------+-----+--------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---------+---------+---------+---------+-------------------------+-------------------------+----------+
|symbol|      id| type|exchange|exchange_en_organ_name| en_organ_short_name|    organ_short_name| exchange_organ_name|           icb_name3|        en_icb_name3|           icb_name2|        en_icb_name2|           icb_name4|        en_icb_name4|com_type_code|icb_code1|icb_code2|icb_code3|icb_code4|exchange_loaded_timestamp|industry_loaded_timestamp|updated_at|
+------+--------+-----+--------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---------+--------

In [47]:
df.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- exchange: string (nullable = true)
 |-- exchange_en_organ_name: string (nullable = true)
 |-- en_organ_short_name: string (nullable = true)
 |-- organ_short_name: string (nullable = true)
 |-- exchange_organ_name: string (nullable = true)
 |-- icb_name3: string (nullable = true)
 |-- en_icb_name3: string (nullable = true)
 |-- icb_name2: string (nullable = true)
 |-- en_icb_name2: string (nullable = true)
 |-- icb_name4: string (nullable = true)
 |-- en_icb_name4: string (nullable = true)
 |-- com_type_code: string (nullable = true)
 |-- icb_code1: string (nullable = true)
 |-- icb_code2: string (nullable = true)
 |-- icb_code3: string (nullable = true)
 |-- icb_code4: string (nullable = true)
 |-- exchange_loaded_timestamp: timestamp (nullable = true)
 |-- industry_loaded_timestamp: string (nullable = true)
 |-- updated_at: date (nullable = true)



In [57]:
from pyspark.sql.functions import current_timestamp
df = df.withColumn(
    "processing_time",
    current_timestamp()
)

In [58]:
df.write \
    .partitionBy("type") \
    .mode("overwrite") \
    .parquet(silver_path)

In [59]:
spark.stop()