# 1. How to import PySpark and check the version?

In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start one
# registered under the application name "Analysis".
spark = (
    SparkSession.builder
    .appName("Analysis")
    .getOrCreate()
)



Picked up _JAVA_OPTIONS: -Dawt.useSystemAAFontSettings=on -Dswing.aatext=true
Picked up _JAVA_OPTIONS: -Dawt.useSystemAAFontSettings=on -Dswing.aatext=true
24/08/01 13:11:11 WARN Utils: Your hostname, kali resolves to a loopback address: 127.0.1.1; using 192.168.216.160 instead (on interface eth0)
24/08/01 13:11:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/01 13:11:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/01 13:11:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Show which Spark release backs the session created above.
spark_version = spark.version
print(spark_version)


3.5.1


# 2. How to convert the index of a PySpark DataFrame into a column?

In [4]:
# Build a small three-row DataFrame to demonstrate on.
sample_rows = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
sample_columns = ["Name", "Value"]
df = spark.createDataFrame(sample_rows, sample_columns)

df.show()

                                                                                

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [7]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number,monotonically_increasing_id

# Order the whole frame by a generated increasing id so rows can be numbered.
# The window has no partitionBy, so Spark evaluates it on a single partition.
w = Window.orderBy(monotonically_increasing_id())

# row_number() starts at 1; subtract 1 to get a zero-based "Index" column.
zero_based_position = row_number().over(w) - 1
df = df.withColumn("Index", zero_based_position)

df.show()

24/08/01 13:18:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 13:18:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 13:18:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

+-------+-----+-----+
|   Name|Value|Index|
+-------+-----+-----+
|  Alice|    1|    0|
|    Bob|    2|    1|
|Charlie|    3|    2|
+-------+-----+-----+



# 3. How to combine many lists to form a PySpark DataFrame?

In [8]:
# Define your lists
# Two equal-length lists that are combined into a DataFrame below.
list1 = list("abcd")
list2 = list(range(1, 5))


In [9]:
import pandas as pd
# Pair the lists element-wise so that each list becomes one column
# (4 rows x 2 columns). Passing [list1, list2] directly would instead treat
# each list as a row, giving a 2 x 4 frame with integer column labels.
pd.DataFrame(list(zip(list1, list2)), columns=["column1", "column2"])

ModuleNotFoundError: No module named 'pandas'

In [14]:
# Zip the lists into (list1[i], list2[i]) tuples: one tuple per output row.
paired_rows = list(zip(list1, list2))
rdd = spark.sparkContext.parallelize(paired_rows)

# Name the two tuple positions while converting the RDD to a DataFrame.
df = rdd.toDF(["column1", "column2"])

df.show()

                                                                                

+-------+-------+
|column1|column2|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
|      d|      4|
+-------+-------+



                                                                                

# 4. How to get the items of list A not present in list B?

In [15]:
import pandas as pd

list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]

# pandas.DataFrame has no `schema` keyword (that is Spark's createDataFrame);
# passing it raises TypeError. Map each column label to its list instead, so
# that list_A and list_B become the columns "A" and "B".
df = pd.DataFrame({"A": list_A, "B": list_B})
df

In [26]:
import pandas as pd
# One column per list: the mapping keys become the column labels.
column_data = {"A": list_A, "B": list_B}
df = pd.DataFrame(column_data)
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6
3,4,7
4,5,8


In [32]:
from pyspark.sql.types import StructField,StructType,IntegerType
# Row tuples (a, b) pairing the i-th element of list_A with the i-th of list_B.
data = [(a, b) for a, b in zip(list_A, list_B)]

data

[(1, 4), (2, 5), (3, 6), (4, 7), (5, 8)]

In [33]:
# Explicit schema: columns "A" and "B", both nullable integers.
schema = StructType(
    [StructField(column, IntegerType(), nullable=True) for column in ("A", "B")]
)

df = spark.createDataFrame(data, schema=schema)
df.show()

+---+---+
|  A|  B|
+---+---+
|  1|  4|
|  2|  5|
|  3|  6|
|  4|  7|
|  5|  8|
+---+---+



In [37]:
sc = spark.sparkContext

# Distribute both lists so the difference can be computed by Spark.
rdd_A = sc.parallelize(list_A)
rdd_B = sc.parallelize(list_B)

# Keep the elements of list_A that do not occur in list_B.
result_rdd = rdd_A.subtract(rdd_B)

# subtract() makes no promise about element order, so sort on the driver to
# keep the displayed list independent of how the data was partitioned.
sorted(result_rdd.collect())


                                                                                

[1, 2, 3]

# 6. How to get the minimum, 25th percentile, median, 75th, and max of a numeric column?