In [1]:
# Bài thực hành số 2 - Exercise 1: Tạo RDD và phân vùng dữ liệu
# Author: Boss Thuần
# Clean setup for pip-installed PySpark

import os
import sys

# Drop any Spark settings inherited from the shell so they cannot interfere
# with the pip-installed PySpark used by this notebook.
spark_env_vars = ['SPARK_HOME', 'SPARK_LOCAL_DIRS', 'SPARK_CONF_DIR']
for var in spark_env_vars:
    # pop() with a default is a no-op when the variable is not set.
    os.environ.pop(var, None)

# Point both PySpark interpreter settings at the Python running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

print("🧹 Environment cleaned")
print("🔧 Using Python:", sys.executable)


🧹 Environment cleaned
🔧 Using Python: /home/thuannp4/Development/Python_Projects/bigdata_course/spark_env/bin/python


In [2]:
from pyspark import SparkContext, SparkConf

# Configure a local-mode application; "local[*]" is the master URL reported
# back by sc.master below.
conf = SparkConf().setAppName("Lab2_Exercise1").setMaster("local[*]")

# getOrCreate() returns the SparkContext that is already active in this
# kernel, or creates one from `conf` when none exists. Calling the
# constructor directly would raise "Cannot run multiple SparkContexts at
# once" whenever this cell is re-run without restarting the kernel.
sc = SparkContext.getOrCreate(conf=conf)

print("✅ SparkContext initialized successfully!")
print(f"✅ Spark version: {sc.version}")
print(f"✅ Application name: {sc.appName}")
print(f"✅ Master: {sc.master}")

25/09/25 10:17:07 WARN Utils: Your hostname, thuan-precision-5560 resolves to a loopback address: 127.0.1.1; using 192.168.1.5 instead (on interface wlp0s20f3)
25/09/25 10:17:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/25 10:17:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/25 10:17:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


✅ SparkContext initialized successfully!
✅ Spark version: 3.5.6
✅ Application name: Lab2_Exercise1
✅ Master: local[*]


In [3]:
# Exercise 1a: distribute the integers 1..10 over 3 partitions.
rdd1 = sc.parallelize(list(range(1, 11)), numSlices=3)

# Partition count, then the contents of each partition (glom() groups the
# elements of every partition into one list).
n = rdd1.getNumPartitions()
result = rdd1.glom().collect()

print(n)
print(result)

3
[[1, 2, 3], [4, 5, 6], [7, 8, 9, 10]]


In [4]:
# ===== EXERCISE 1b: A STRING =====
print("=== b) Một xâu ký tự ===")

# A string is parallelized character by character, so every character
# becomes one RDD element.
string_data = "Hello Spark"
rdd2 = sc.parallelize(string_data, numSlices=5)

rdd2_partition_count = rdd2.getNumPartitions()
rdd2_partitions = rdd2.glom().collect()
print(f"number of partitions: {rdd2_partition_count}")
print(f"RDD from string: {rdd2_partitions}")


=== b) Một xâu ký tự ===
number of partitions: 5
RDD from string: [['H', 'e'], ['l', 'l'], ['o', ' '], ['S', 'p'], ['a', 'r', 'k']]


In [5]:
# ===== EXERCISE 1c: A LIST OF STRINGS =====
print("=== c) Một danh sách các xâu ký tự ===")

# Each string in the list is one RDD element (it is not split into characters).
string_list = ["Apache", "Spark", "Big", "Data", "Analytics"]
rdd3 = sc.parallelize(string_list, numSlices=3)

rdd3_partition_count = rdd3.getNumPartitions()
rdd3_partitions = rdd3.glom().collect()
print(f"Số phân vùng: {rdd3_partition_count}")
print(f"Phân vùng chi tiết: {rdd3_partitions}")

=== c) Một danh sách các xâu ký tự ===
Số phân vùng: 3
Phân vùng chi tiết: [['Apache'], ['Spark', 'Big'], ['Data', 'Analytics']]


In [6]:
# Exercise 1d: a dictionary (English phrase -> Vietnamese translation).
data_dict = {
    'hello': 'xin chào',
    'goodbye': 'tạm biệt',
    'thank you': 'cảm ơn',
    'please': 'xin',
    'sorry': 'xin lỗi',
    'yes': 'vâng'
}

# Passing the dict itself distributes only its keys; the values are dropped.
rdd4 = sc.parallelize(data_dict, numSlices=4)
keys_by_partition = rdd4.glom().collect()
print(f"Detaied partitions: {keys_by_partition}")

# Passing the (key, value) pairs instead keeps both halves of every entry.
# Note that rdd4 is rebound here and refers to the pair RDD from now on.
rdd4 = sc.parallelize(data_dict.items(), numSlices=4)
pairs_by_partition = rdd4.glom().collect()
print(f"Detaied partitions after converted: {pairs_by_partition}")

Detaied partitions: [['hello'], ['goodbye', 'thank you'], ['please'], ['sorry', 'yes']]
Detaied partitions after converted: [[('hello', 'xin chào')], [('goodbye', 'tạm biệt'), ('thank you', 'cảm ơn')], [('please', 'xin')], [('sorry', 'xin lỗi'), ('yes', 'vâng')]]


In [7]:
# Exercise 1e: a list whose elements have different Python types.
mixed_data = [
    42,                  # integer
    "Hello World",       # string
    [1, 2, 3],           # list
    (4, 5, 6),           # tuple
    {"key": "value"},    # dict
    3.14,                # float
    True,                # boolean
]

# Every top-level item is one RDD element; nested containers stay intact.
rdd5 = sc.parallelize(mixed_data, numSlices=4)
rdd5_partitions = rdd5.glom().collect()
print(f"Detaied partitions: {rdd5_partitions}")

Detaied partitions: [[42], ['Hello World', [1, 2, 3]], [(4, 5, 6), {'key': 'value'}], [3.14, True]]


In [8]:
# ===== EXERCISE 1f: A TEXT FILE =====
print("=== f) Một text file ===")

import os
data_path = "../../data/README.md"

# Read the file with plain Python first, for comparison with Spark's view.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale and agrees with textFile() below, which decodes
# text files as UTF-8.
with open(data_path, "r", encoding="utf-8") as f:
    content = f.read()

print("Content of README file:")
print(content)

# textFile() produces one RDD element per line of the file; the second
# argument is the minimum number of partitions requested.
rdd6 = sc.textFile(data_path, 3)
print("read file by Spark:")
lines = rdd6.collect()
for i, line in enumerate(lines, 1):
    print(f" Line {i}: {line}")

# Show which lines ended up in which partition.
print(f"Detaied partitions: {rdd6.glom().collect()}")


=== f) Một text file ===
Content of README file:
# Welcome to Apache Spark
Apache Spark is a powerful big data processing engine.
This lab teaches Spark RDD operations.
Spark provides distributed computing capabilities.
Big Data Analytics with Spark framework.
Python is great for data science.

read file by Spark:
 Line 1: # Welcome to Apache Spark
 Line 2: Apache Spark is a powerful big data processing engine.
 Line 3: This lab teaches Spark RDD operations.
 Line 4: Spark provides distributed computing capabilities.
 Line 5: Big Data Analytics with Spark framework.
 Line 6: Python is great for data science.
Detaied partitions: [['# Welcome to Apache Spark', 'Apache Spark is a powerful big data processing engine.', 'This lab teaches Spark RDD operations.'], ['Spark provides distributed computing capabilities.'], ['Big Data Analytics with Spark framework.', 'Python is great for data science.']]
