In [1]:
from pyspark import SparkConf, SparkContext

# Obtain the context through getOrCreate() so that re-running this cell in a
# live kernel reuses the existing SparkContext instead of failing with
# "Cannot run multiple SparkContexts at once". The master ("local") and the
# application name are the same as before; they only take effect when no
# context exists yet.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("Natural Numbers")
)

# RDD of the natural numbers 1..15 (the upper bound of range() is exclusive).
nums_rdd = sc.parallelize(range(1, 16))



In [2]:
# Line 1: every element of the RDD, gathered on the driver.
# Line 2: the number of partitions backing the RDD.
print(nums_rdd.collect(), nums_rdd.getNumPartitions(), sep="\n")

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
1


In [3]:
# first() is an action: it returns the leading element of the RDD.
first_num = nums_rdd.first()
print(f"First element: {first_num}")


First element: 1


In [4]:
# Keep only the numbers that are divisible by two.
even_rdd = nums_rdd.filter(lambda n: n % 2 == 0)
print(f"Even Numbers: {even_rdd.collect()}")

Even Numbers: [2, 4, 6, 8, 10, 12, 14]


In [5]:
# Square every element; map() yields a new RDD and leaves nums_rdd unchanged.
squared_rdd = nums_rdd.map(lambda n: n * n)
print(f"Squared_num: {squared_rdd.collect()}")

Squared_num: [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225]


In [6]:
# Fold the RDD into a single value by pairwise addition.
total_sum = nums_rdd.reduce(lambda acc, n: acc + n)
print(f"Sum of nums: {total_sum}")

Sum of nums: 120


In [7]:
import shutil

# saveAsTextFile() writes a *directory* of part files at this path (despite
# the ".txt" suffix) and refuses to write when the path already exists, so a
# second run of this cell would fail. Remove the output of a previous run
# first to keep the cell re-runnable.
# NOTE(review): assumes the path resolves to the local filesystem (the context
# uses the "local" master) — confirm if a different default FS is configured.
shutil.rmtree("natural_numbers.txt", ignore_errors=True)
nums_rdd.saveAsTextFile("natural_numbers.txt")

In [8]:
# A second RDD with the numbers 16..20, appended to the original one.
more_nums_rdd = sc.parallelize(list(range(16, 21)))
combined_rdd = nums_rdd.union(more_nums_rdd)
print(f"Combined_nums: {combined_rdd.collect()}")

Combined_nums: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [9]:
# Every (a, b) pair with a taken from nums_rdd and b from more_nums_rdd.
cartesian_rdd = nums_rdd.cartesian(more_nums_rdd)
print(f"Cartesian Product: {cartesian_rdd.collect()}")

Cartesian Product: [(1, 16), (1, 17), (1, 18), (1, 19), (1, 20), (2, 16), (2, 17), (2, 18), (2, 19), (2, 20), (3, 16), (3, 17), (3, 18), (3, 19), (3, 20), (4, 16), (4, 17), (4, 18), (4, 19), (4, 20), (5, 16), (5, 17), (5, 18), (5, 19), (5, 20), (6, 16), (6, 17), (6, 18), (6, 19), (6, 20), (7, 16), (7, 17), (7, 18), (7, 19), (7, 20), (8, 16), (8, 17), (8, 18), (8, 19), (8, 20), (9, 16), (9, 17), (9, 18), (9, 19), (9, 20), (10, 16), (10, 17), (10, 18), (10, 19), (10, 20), (11, 16), (11, 17), (11, 18), (11, 19), (11, 20), (12, 16), (12, 17), (12, 18), (12, 19), (12, 20), (13, 16), (13, 17), (13, 18), (13, 19), (13, 20), (14, 16), (14, 17), (14, 18), (14, 19), (14, 20), (15, 16), (15, 17), (15, 18), (15, 19), (15, 20)]


In [19]:
# RDD whose elements are dictionaries with a "name" and an "age" entry.
dict_rdd = sc.parallelize(
    [
        {"name": person, "age": years}
        for person, years in [("pavan", 20), ("Bobby", 21), ("praveen", 25)]
    ]
)
print(f"Dictionary RDD: {dict_rdd.collect()}")

Dictionary RDD: [{'name': 'pavan', 'age': 20}, {'name': 'Bobby', 'age': 21}, {'name': 'praveen', 'age': 25}]


In [11]:
# Count how many records carry each dictionary key:
#   1. emit one (key, 1) pair per key of every record,
#   2. add up the ones per key.
count_rdd = (
    dict_rdd
    .flatMap(lambda record: [(key, 1) for key in record])
    .reduceByKey(lambda left, right: left + right)
)
print(count_rdd.collect())

[('name', 3), ('age', 3)]


In [12]:
# Read both text files line by line and concatenate the resulting RDDs.
# Both paths are relative, so the files must exist where Spark resolves them.
file_rdd = (
    sc.textFile("file1.txt")
    .union(sc.textFile("file2.txt"))
)
print(file_rdd.collect())

['hi how are you', 'hello where are yoou']


In [13]:
# take() returns at most the requested number of leading elements.
print(file_rdd.take(num=5))

['hi how are you', 'hello where are yoou']


In [17]:
from pyspark.sql import SparkSession

# getOrCreate() returns the active session if there is one, otherwise it
# builds a new one with the given application name.
spark = (
    SparkSession.builder
    .appName("DataFrame and Dataset")
    .getOrCreate()
)

# Column names and the rows of (id, name) tuples they describe.
columns = ["id", "name"]
data = [(1, "pavan"), (2, "Bobby"), (3, "praveen")]
df = spark.createDataFrame(data, schema=columns)

# Render the DataFrame as a text table.
df.show()

+---+-------+
| id|   name|
+---+-------+
|  1|  pavan|
|  2|  Bobby|
|  3|praveen|
+---+-------+



In [18]:
# RDD: a distributed collection of plain Python objects without a schema.
rdd_example = sc.parallelize([1, 2, 3, 4])
print(f"RDD: {rdd_example.collect()}")

# DataFrame: rows organised into the named columns "id" and "name".
df_example = spark.createDataFrame(
    [(1, "pavan"), (2, "Bobby"), (3, "praveen")],
    ["id", "name"],
)
df_example.show()

# Dataset: the typed Dataset API is offered for Scala and Java only. PySpark
# exposes no separate Dataset class, so the DataFrame is the structured
# abstraction to use from Python.

RDD: [1, 2, 3, 4]
+---+-------+
| id|   name|
+---+-------+
|  1|  pavan|
|  2|  Bobby|
|  3|praveen|
+---+-------+

