In [3]:
import random

from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, datediff, floor, months_between
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType

In [4]:
# Create the SparkSession used to generate the data (getOrCreate returns an
# already-active session instead of building a second one).
spark = (
    SparkSession.builder
    .appName("Generate and Save Parquet Data")
    .getOrCreate()
)

23/09/15 23:02:43 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.1.123 instead (on interface wlp0s20f3)
23/09/15 23:02:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/15 23:02:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Schema of the generated DataFrame: four columns, each declared
# non-nullable (third StructField argument is False).
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("Name", StringType()),
        ("Birthdate", DateType()),
        ("Address", StringType()),
        ("Gender", StringType()),
    )
])

In [15]:
# Build the list of random person records that become the DataFrame rows.
# Each record is a (name, birthdate, address, gender) tuple, in the same
# order as the columns of the schema it is loaded with.
NUM_RECORDS = 1000  # number of synthetic rows to generate
SEED = 42           # fixed seed so re-running the cell yields the same random draws

# One Faker instance is reused for every field; constructing a new Faker()
# per row repeats the generator set-up on every iteration for no benefit.
fake = Faker()
fake.seed_instance(SEED)

# Private RNG for the gender pick, so the global `random` state is left untouched.
gender_rng = random.Random(SEED)

# NOTE: date_of_birth is computed relative to today's date, so birthdates can
# still shift between runs on different days even with a fixed seed.
data = [
    (
        fake.name(),
        fake.date_of_birth(minimum_age=1, maximum_age=100),
        fake.address(),
        gender_rng.choice(["Male", "Female"]),
    )
    for _ in range(NUM_RECORDS)
]

In [16]:
# Turn the in-memory records into a Spark DataFrame, applying the explicit
# schema rather than letting Spark infer column types.
df = spark.createDataFrame(
    data=data,
    schema=schema,
)

In [23]:
# Add an integer "age" column: completed years between Birthdate and today.
# Dividing a day count by 365 ignores leap days, so for older people the age
# would tick over days or weeks before the actual birthday. Counting calendar
# months and dividing by 12 follows the calendar instead.
df = df.withColumn(
    "age",
    floor(months_between(current_date(), "Birthdate") / 12).cast(IntegerType()),
)

In [24]:
# Persist the DataFrame as Parquet; "overwrite" replaces any output that
# already exists at the target path.
parquet_output_path = "/home/hoanghainam/DE_internship/output.parquet"
df.write.parquet(parquet_output_path, mode="overwrite")

23/09/15 14:31:29 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/09/15 14:31:29 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
23/09/15 14:31:29 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
23/09/15 14:31:29 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
23/09/15 14:31:29 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
23/09/15 14:31:29 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
23/09/15 14:31:29 WARN MemoryManager: Total allocation exceeds 95.

In [5]:
# Load the Parquet output back into a DataFrame.
parquet_file_path = "/home/hoanghainam/DE_internship/output.parquet"
df = spark.read.format("parquet").load(parquet_file_path)

In [6]:
# Preview the DataFrame: print the first 20 rows, truncating long cell values.
df.show(n=20, truncate=True)

+-----------------+----------+--------------------+------+---+
|             Name| Birthdate|             Address|Gender|age|
+-----------------+----------+--------------------+------+---+
|   Jeremy Perkins|2000-02-27|USCGC Morgan\nFPO...|  Male| 23|
|     Karen Parker|1948-06-27|165 Hatfield Fork...|Female| 75|
|  Matthew Simmons|1983-08-01|8692 Billy Cliff\...|Female| 40|
|     Anne Bullock|1974-07-08|6351 Lopez Mills\...|  Male| 49|
|    Jeffrey Allen|1981-12-10|74474 Robinson Ca...|Female| 41|
|    Micheal Green|1951-10-28|1463 Nicole Villa...|Female| 71|
|       Ashley Gay|2017-10-18|9140 Hubbard Port...|  Male|  5|
|     Angela Mckay|1958-05-25|76885 Jason Stree...|  Male| 65|
|  Edward Benjamin|1941-01-28|6235 Jennifer Run...|  Male| 82|
| Elizabeth Obrien|2003-03-28|51190 Becky Trail...|Female| 20|
|   Eduardo Graham|1977-06-18|PSC 7096, Box 512...|  Male| 46|
|      Brent Grant|1939-11-12|38731 Hernandez C...|Female| 83|
|     Donna Acosta|1969-10-26|452 Lisa Mill\nFl...|  Ma

In [13]:
# Stop the Spark session once all work in the notebook is done.
spark.stop()