In [32]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col, regexp_replace, trim

# Start the Spark session used by the rest of this notebook; getOrCreate()
# hands back an already-running session instead of creating a second one.
spark = (
    SparkSession.builder
    .appName("SimpleSparkApp")
    .getOrCreate()
)

In [41]:
# Path to the crawled JSON file, relative to the notebook's working directory
json_file_path = "../crawl/data/bitcoin.json"

# Load the JSON file into a DataFrame (schema is inferred by Spark).
# NOTE(review): the recorded output of this cell contains a "_corrupt_record"
# column, so at least one input line did not parse as a standalone JSON object.
# If the crawler writes a single JSON array rather than JSON Lines, reading
# with multiLine=True may be the right fix — TODO confirm against the file.
df = spark.read.json(json_file_path)

# Clean the "content" column in three steps:
#   1. strip HTML tags,
#   2. collapse every run of whitespace into a single space,
#   3. trim leading/trailing spaces.
# The tag pattern uses a negated character class ([^>]) rather than ".*?"
# because "." in Java regex does not match line terminators; with ".*?" a tag
# whose attributes wrap onto another line was left in the text.
df_processed = (
    df.withColumn("content", regexp_replace("content", "<[^>]*>", ""))
    .withColumn("content", regexp_replace("content", "\\s+", " "))
    .withColumn("content", trim(col("content")))
)

# Preview the cleaned DataFrame
df_processed.show()

+---------------+--------------------+--------------------+----------+------+-------------------+--------------------+-----+-----+
|_corrupt_record|            category|             content|num_answer|solved|               time|               title|views|votes|
+---------------+--------------------+--------------------+----------+------+-------------------+--------------------+-----+-----+
|           NULL|[blockchain-explo...|Closed. This ques...|         0| false|2023-11-22T12:56:11|⁰Can someone help...|   20|    0|
|           NULL|[wallet, private-...|This question alr...|         0| false|2023-11-21T20:31:54|cannot export my ...|   31|    0|
|           NULL|[transactions, ce...|As of today Nov 2...|         1|  true|2023-11-21T22:17:28|Supposedly six OF...|   62|    0|
|           NULL|[lightning-networ...|Every day I find ...|         0| false|2023-11-22T08:24:03|Are there interna...|   13|    2|
|           NULL|[bitcoin-core, pe...|I'd like to fine-...|         0| false|2023-1

In [42]:
# Display only the "category" column with truncation turned off, so each
# row's complete list of tags is visible.
df_processed.select(col("category")).show(truncate=False)

+-----------------------------------------------------------------------------+
|category                                                                     |
+-----------------------------------------------------------------------------+
|[blockchain-explorer]                                                        |
|[wallet, private-key-export]                                                 |
|[transactions, censorship-resistance]                                        |
|[lightning-network, lightning-network-daemon, invoices]                      |
|[bitcoin-core, performance]                                                  |
|[bitcoin-cash, bitcoinjs, bip174-psbt]                                       |
|[private-key, bip32-hd-wallets, signature, taproot, fraud]                   |
|[private-key, wallet-recovery, recover-private-key, funds, zerocoin]         |
|[bitcoind, linux]                                                            |
|[bitcoind, linux, bitcoin.conf, memory]

In [34]:
# Output location for the cleaned data (Spark writes a directory of part files)
output_json_path = "./output"

# When some input lines cannot be parsed, Spark's JSON reader adds a
# "_corrupt_record" column holding the raw text of those lines. Such rows are
# unparsed input rather than real records, so keep only rows where the column
# is null and drop the helper column before exporting. The guard keeps this
# cell working when the input parsed cleanly and the column does not exist.
df_export = df_processed
if "_corrupt_record" in df_export.columns:
    df_export = (
        df_export
        .filter(col("_corrupt_record").isNull())
        .drop("_corrupt_record")
    )

# Write the DataFrame as JSON; "overwrite" replaces any existing output
df_export.write.mode("overwrite").json(output_json_path)

In [None]:
# Shut down the SparkSession now that the notebook's work is done.
# Run this last: cells that use `spark` will fail after this point.
spark.stop()