# 1. How to import PySpark and check the version?

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Analysis").getOrCreate()



Picked up _JAVA_OPTIONS: -Dawt.useSystemAAFontSettings=on -Dswing.aatext=true
Picked up _JAVA_OPTIONS: -Dawt.useSystemAAFontSettings=on -Dswing.aatext=true
24/08/02 01:58:45 WARN Utils: Your hostname, kali resolves to a loopback address: 127.0.1.1; using 192.168.87.160 instead (on interface eth0)
24/08/02 01:58:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/02 01:58:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
print(spark.version)


3.5.1


# 2. How to convert the index of a PySpark DataFrame into a column?

In [4]:
# Assuming df is your DataFrame
df = spark.createDataFrame([
("Alice", 1),
("Bob", 2),
("Charlie", 3),
], ["Name", "Value"])

df.show()

                                                                                

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [7]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number,monotonically_increasing_id

w = Window.orderBy(monotonically_increasing_id())
df = df.withColumn("Index",row_number().over(w)-1)

df.show()

24/08/01 13:18:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 13:18:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 13:18:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

+-------+-----+-----+
|   Name|Value|Index|
+-------+-----+-----+
|  Alice|    1|    0|
|    Bob|    2|    1|
|Charlie|    3|    2|
+-------+-----+-----+



# 3. How to combine many lists to form a PySpark DataFrame?

In [8]:
# Define your lists
list1 = ["a", "b", "c", "d"]
list2 = [1, 2, 3, 4]


In [9]:
import pandas as pd
pd.DataFrame([list1,list2])

ModuleNotFoundError: No module named 'pandas'

In [14]:
rdd = spark.sparkContext.parallelize(list(zip(list1,list2)))


df = rdd.toDF(["column1","column2"])

df.show()

                                                                                

+-------+-------+
|column1|column2|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
|      d|      4|
+-------+-------+



                                                                                

# 4. How to get the items of list A not present in list B?

In [15]:
list_A = [1, 2, 3, 4, 5]
list_B = [4, 5, 6, 7, 8]
import pandas as pd
df = pd.DataFrame([list_A,list_B],schema=['A',"B"])
df

In [26]:
import pandas as pd
df = pd.DataFrame({
    "A":list_A,
    "B":list_B
})
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6
3,4,7
4,5,8


In [32]:
from pyspark.sql.types import StructField,StructType,IntegerType
data = list(zip(list_A,list_B))

data

[(1, 4), (2, 5), (3, 6), (4, 7), (5, 8)]

In [33]:
schema = StructType([
    StructField("A", IntegerType(), True),
    StructField("B", IntegerType(), True)

])

df = spark.createDataFrame(data,schema=schema)
df.show()

+---+---+
|  A|  B|
+---+---+
|  1|  4|
|  2|  5|
|  3|  6|
|  4|  7|
|  5|  8|
+---+---+



In [37]:
sc = spark.sparkContext

rdd_A = sc.parallelize(list_A)
rdd_B = sc.parallelize(list_B)

result_rdd = rdd_A.subtract(rdd_B)
result_rdd.collect()


                                                                                

[1, 2, 3]

# 6. How to get the minimum, 25th percentile, median, 75th, and max of a numeric column?

In [3]:
# Create a sample DataFrame
data = [("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50), ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86)]
df = spark.createDataFrame(data, ["Name", "Age"])

df.show()

                                                                                

+----+---+
|Name|Age|
+----+---+
|   A| 10|
|   B| 20|
|   C| 30|
|   D| 40|
|   E| 50|
|   F| 15|
|   G| 28|
|   H| 54|
|   I| 41|
|   J| 86|
+----+---+



In [4]:
quantiles = df.approxQuantile("Age",[0.0, 0.25, 0.5, 0.75, 1.0], 0.01)

print("Min: ", quantiles[0])
print("25th percentile: ", quantiles[1])
print("Median: ", quantiles[2])
print("75th percentile: ", quantiles[3])
print("Max: ", quantiles[4])


                                                                                

Min:  10.0
25th percentile:  20.0
Median:  30.0
75th percentile:  50.0
Max:  86.0


In [6]:
df.summary().show()

[Stage 5:>                                                          (0 + 1) / 1]

+-------+----+------------------+
|summary|Name|               Age|
+-------+----+------------------+
|  count|  10|                10|
|   mean|NULL|              37.4|
| stddev|NULL|22.396428286671068|
|    min|   A|                10|
|    25%|NULL|                20|
|    50%|NULL|                30|
|    75%|NULL|                50|
|    max|   J|                86|
+-------+----+------------------+



                                                                                

In [10]:
from pyspark.sql.functions import sum
df.agg(sum("Age")).collect()[0][0]

                                                                                

374

# 7. How to get frequency counts of unique items of a column?


In [11]:
from pyspark.sql import Row

# Sample data
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]

# create DataFrame
df = spark.createDataFrame(data)

# show DataFrame
df.show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|   Doctor|
+----+---------+



                                                                                

In [13]:
df.groupBy("job").count().show()



+---------+-----+
|      job|count|
+---------+-----+
| Engineer|    4|
|Scientist|    2|
|   Doctor|    1|
+---------+-----+



                                                                                

# 8. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?


In [14]:
from pyspark.sql import Row

# Sample data
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]

# create DataFrame
df = spark.createDataFrame(data)

# show DataFrame
df.show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|   Doctor|
+----+---------+



                                                                                

In [16]:
df.createOrReplaceTempView("table1")

query = """ 
SELECT job, COUNT(*) AS count 
FROM table1 
GROUP BY job 
LIMIT 2
"""

spark.sql(query).show()




+---------+-----+
|      job|count|
+---------+-----+
| Engineer|    4|
|Scientist|    2|
+---------+-----+



                                                                                

In [18]:
from pyspark.sql.functions import col,when

top_2_jobs = df.groupBy("job").count().orderBy("count",ascending=False).limit(2)
top_2_jobs.show()



+---------+-----+
|      job|count|
+---------+-----+
| Engineer|    4|
|Scientist|    2|
+---------+-----+



                                                                                

In [21]:
top_2_jobs = df.groupBy("job").count().orderBy("count",ascending=False).limit(2).rdd.flatMap(lambda x: x).collect()
print(top_2_jobs)




['Engineer', 4, 'Scientist', 2]


                                                                                

In [22]:
df = df.withColumn('job', when(col('job').isin(top_2_jobs), col('job')).otherwise("others"))
df.show()



+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|   others|
+----+---------+



                                                                                

# 9. How to Drop rows with NA values specific to a particular column?

In [23]:
# Assuming df is your DataFrame
df = spark.createDataFrame([
("A", 1, None),
("B", None, "123" ),
("B", 3, "456"),
("D", None, None),
], ["Name", "Value", "id"])

df.show()



+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|NULL|
|   B| NULL| 123|
|   B|    3| 456|
|   D| NULL|NULL|
+----+-----+----+



                                                                                

In [24]:
df2 = df.dropna(subset=['Value'])
df2.show()



+----+-----+----+
|Name|Value|  id|
+----+-----+----+
|   A|    1|NULL|
|   B|    3| 456|
+----+-----+----+



                                                                                

# 10. How to rename columns of a PySpark DataFrame using two lists – one containing the old column names and the other containing the new column names?

In [25]:
# suppose you have the following DataFrame
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])

# old column names
old_names = ["col1", "col2", "col3"]

# new column names
new_names = ["new_col1", "new_col2", "new_col3"]

df.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   1|   2|   3|
|   4|   5|   6|
+----+----+----+



                                                                                

In [27]:
for old_column,new_column in zip(old_names,new_names):
    df =df.withColumnRenamed(old_column,new_column)

df.show()




+--------+--------+--------+
|new_col1|new_col2|new_col3|
+--------+--------+--------+
|       1|       2|       3|
|       4|       5|       6|
+--------+--------+--------+



                                                                                