In [71]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    lit,
    monotonically_increasing_id,
    row_number,
    when,
)
from pyspark.sql.window import Window

# Start the Spark session used by every cell below (getOrCreate returns the
# existing session if one is already running).
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)

1) How to convert the index of a PySpark DataFrame into a column?

```python
# Input: Assuming df is your DataFrame
df = spark.createDataFrame([
("Alice", 1),
("Bob", 2),
("Charlie", 3),
], ["Name", "Value"])

df.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+

# Output:
+-------+-----+-----+
|   Name|Value|index|
+-------+-----+-----+
|  Alice|    1|    0|
|    Bob|    2|    1|
|Charlie|    3|    2|
+-------+-----+-----+
```

In [32]:
# Three (Name, Value) sample rows used to demonstrate adding an index column.
sample_rows = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
df = spark.createDataFrame(sample_rows, schema=["Name", "Value"])

df.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [33]:
# Number the rows in their current order.
# Ordering the window by a constant makes every row tie, so Spark is free to hand
# out row numbers in any order; ordering by monotonically_increasing_id() keeps
# the DataFrame's existing row order instead.
# NOTE: a window without partitionBy pulls all rows into a single partition,
# which is fine for small data but does not scale.
w = Window.orderBy(monotonically_increasing_id())

# row_number() starts at 1; subtract 1 to get a zero-based index.
df = df.withColumn("index", row_number().over(w) - 1)

In [34]:
# Show the DataFrame, which now carries the zero-based "index" column.
df.show()

+-------+-----+-----+
|   Name|Value|index|
+-------+-----+-----+
|  Alice|    1|    0|
|    Bob|    2|    1|
|Charlie|    3|    2|
+-------+-----+-----+



2) How to get the minimum, 25th percentile, median, 75th percentile, and maximum of a numeric column?

Compute the minimum, 25th percentile, median, 75th percentile, and maximum of the column `Age`.

```python
# Create a sample DataFrame
data = [("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50), ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86)]
df = spark.createDataFrame(data, ["Name", "Age"])
```

In [35]:
# Ten (Name, Age) sample rows for the summary-statistics exercise.
data = [
    ("A", 10), ("B", 20), ("C", 30), ("D", 40), ("E", 50),
    ("F", 15), ("G", 28), ("H", 54), ("I", 41), ("J", 86),
]
df = spark.createDataFrame(data, schema=["Name", "Age"])

In [36]:
# Preview the sample Name/Age data.
df.show()

+----+---+
|Name|Age|
+----+---+
|   A| 10|
|   B| 20|
|   C| 30|
|   D| 40|
|   E| 50|
|   F| 15|
|   G| 28|
|   H| 54|
|   I| 41|
|   J| 86|
+----+---+



In [37]:
# Minimum, quartiles and maximum of Age. summary() labels each statistic in a
# "summary" column, so only that column and Age are kept for display.
quartile_stats = ("min", "25%", "50%", "75%", "max")
age_summary = df.summary(*quartile_stats).select("summary", "Age")
age_summary.show()

+-------+---+
|summary|Age|
+-------+---+
|    min| 10|
|    25%| 20|
|    50%| 30|
|    75%| 50|
|    max| 86|
+-------+---+



3) Calculate the frequency counts of each unique value

```python
from pyspark.sql import Row

# Sample data
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]

# create DataFrame
df = spark.createDataFrame(data)
```

In [38]:
from pyspark.sql import Row

# Sample (name, job) pairs; several pairs repeat.
name_job_pairs = [
    ("John", "Engineer"),
    ("John", "Engineer"),
    ("Mary", "Scientist"),
    ("Bob", "Engineer"),
    ("Bob", "Engineer"),
    ("Bob", "Scientist"),
    ("Sam", "Doctor"),
]
data = [Row(name=person, job=role) for person, role in name_job_pairs]

# Build the DataFrame; the column names come from the Row field names.
df = spark.createDataFrame(data)

In [39]:
# Preview the sample name/job rows before counting them.
df.show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|   Doctor|
+----+---------+



In [47]:
# Count how many times each distinct (name, job) combination occurs.
pair_counts = df.groupBy("name", "job").count()
pair_counts.show()

+----+---------+-----+
|name|      job|count|
+----+---------+-----+
|Mary|Scientist|    1|
|John| Engineer|    2|
| Sam|   Doctor|    1|
| Bob| Engineer|    2|
| Bob|Scientist|    1|
+----+---------+-----+



4) How to keep only the top 2 most frequent values as they are and replace everything else with `Other`?

```python
from pyspark.sql import Row

# Sample data
data = [
Row(name='John', job='Engineer'),
Row(name='John', job='Engineer'),
Row(name='Mary', job='Scientist'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Engineer'),
Row(name='Bob', job='Scientist'),
Row(name='Sam', job='Doctor'),
]

# create DataFrame
df = spark.createDataFrame(data)
```

In [48]:
from pyspark.sql import Row

# Sample data: names and jobs are listed in parallel, one entry per row.
names = ["John", "John", "Mary", "Bob", "Bob", "Bob", "Sam"]
jobs = [
    "Engineer", "Engineer", "Scientist",
    "Engineer", "Engineer", "Scientist", "Doctor",
]
data = [Row(name=person, job=role) for person, role in zip(names, jobs)]

# Build the DataFrame; the column names come from the Row field names.
df = spark.createDataFrame(data)

In [49]:
# Preview the sample name/job rows.
df.show()

+----+---------+
|name|      job|
+----+---------+
|John| Engineer|
|John| Engineer|
|Mary|Scientist|
| Bob| Engineer|
| Bob| Engineer|
| Bob|Scientist|
| Sam|   Doctor|
+----+---------+



In [61]:
df.groupBy("name","job").count().orderBy("count", ascending=False).limit(2).show()

+----+--------+-----+
|name|     job|count|
+----+--------+-----+
|John|Engineer|    2|
| Bob|Engineer|    2|
+----+--------+-----+



5) How to rename columns of a PySpark DataFrame using two lists – one containing the old column names and the other containing the new column names?

```python
# suppose you have the following DataFrame
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], ["col1", "col2", "col3"])

# old column names
old_names = ["col1", "col2", "col3"]

# new column names
new_names = ["new_col1", "new_col2", "new_col3"]
```

In [62]:
# Original column names, in DataFrame column order.
old_names = ["col1", "col2", "col3"]

# Replacement names, matched to old_names by position ("colN" -> "new_colN").
new_names = [f"new_{name}" for name in old_names]

# Two-row sample DataFrame whose columns carry the original names.
df = spark.createDataFrame([(1, 2, 3), (4, 5, 6)], old_names)

In [64]:
# Preview the DataFrame with its original column names.
df.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   1|   2|   3|
|   4|   5|   6|
+----+----+----+



In [66]:
# Rename each column by pairing the two lists position by position.
# The loop variables use singular names so that the `old_names` / `new_names`
# lists are not overwritten: reusing the list names as loop variables would leave
# them bound to the last pair of strings and break a re-run of this cell.
for old_name, new_name in zip(old_names, new_names):
    df = df.withColumnRenamed(old_name, new_name)

df.show()

+--------+--------+--------+
|new_col1|new_col2|new_col3|
+--------+--------+--------+
|       1|       2|       3|
|       4|       5|       6|
+--------+--------+--------+



6) How to bin a numeric list into 10 groups of equal size?

```python
from pyspark.sql.functions import rand
from pyspark.ml.feature import Bucketizer

# Create a DataFrame with a single column "values" filled with random numbers
num_items = 100
df = spark.range(num_items).select(rand(seed=42).alias("values"))
```

In [67]:
from pyspark.sql.functions import rand
from pyspark.ml.feature import Bucketizer

# Build a DataFrame with one column, "values", holding one seeded random number
# per row.
num_items = 100
random_values = rand(seed=42).alias("values")
df = spark.range(num_items).select(random_values)

In [68]:
# Preview the random values (the output below reports only the top 20 rows).
df.show()

+--------------------+
|              values|
+--------------------+
|   0.619189370225301|
|  0.5096018842446481|
|  0.8325259388871524|
| 0.26322809041172357|
|  0.6702867696264135|
|  0.5173283545794627|
|  0.9991441647585968|
| 0.06993233728279169|
|  0.9696695610826327|
|  0.7959575617927873|
|  0.4484250584033179|
|  0.6793959570375868|
|  0.3724113862805264|
|   0.832609472539921|
|  0.7479557402720448|
|  0.7216183163402288|
|0.016051221049720343|
|  0.6307120027798567|
|    0.07537082371587|
|   0.838930558220017|
+--------------------+
only showing top 20 rows



In [75]:
from pyspark.sql.functions import rand, ntile

In [74]:
# Sort the rows by "values" and let ntile(10) assign each one to one of 10 bins,
# numbered from 1 (in the output the 10 smallest of the 100 values fall in bin 1).
WindowSpec = Window.orderBy("values")
binned = df.withColumn("bin", ntile(10).over(WindowSpec))
binned.show()

+--------------------+---+
|              values|bin|
+--------------------+---+
|0.005039492476539786|  1|
|0.010815371607771573|  1|
|0.016051221049720343|  1|
|0.054599328832547256|  1|
| 0.06476522637900317|  1|
| 0.06993233728279169|  1|
|  0.0733222570891463|  1|
| 0.07531261247891552|  1|
|    0.07537082371587|  1|
| 0.07705198481765851|  1|
| 0.09793090948613814|  2|
| 0.11388471205143302|  2|
| 0.12847898035637806|  2|
| 0.13800057515473785|  2|
| 0.16191945645714856|  2|
|  0.1663672731939081|  2|
|  0.1732080573424314|  2|
|  0.1882393785257075|  2|
| 0.20593872923627632|  2|
|  0.2073428376111074|  2|
+--------------------+---+
only showing top 20 rows

