In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create the Spark session used by the regex examples (or reuse a running one).
spark = SparkSession.builder.appName("RegexExample").getOrCreate()
# One-column DataFrame of fruit names to filter.
data = [("apple",), ("banana",), ("apricot",), ("grape",)]
df = spark.createDataFrame(data, ["fruit"])

# Keep rows whose 'fruit' value matches the regex "ap" anywhere in the string.
# The pattern is not anchored, so "grape" is kept along with "apple" and "apricot".
filtered_df = df.filter(F.col("fruit").rlike("ap"))
filtered_df.show()

+-------+
|  fruit|
+-------+
|  apple|
|apricot|
|  grape|
+-------+



In [3]:
from pyspark.sql import functions as F

# Each sample string packs a numeric id and a name into a single value.
data = [("id_123_name_john",), ("id_456_name_jane",)]
df = spark.createDataFrame(data, schema=["info"])

# Pull capture group 1 of the pattern (the digits that follow "id_")
# out of 'info' and expose it as a new 'id_number' column.
extracted_df = df.select(
    "*",
    F.regexp_extract("info", r"id_(\d+)", 1).alias("id_number"),
)
extracted_df.show()

+----------------+---------+
|            info|id_number|
+----------------+---------+
|id_123_name_john|      123|
|id_456_name_jane|      456|
+----------------+---------+



regexp_replace(str, pattern, replacement):  
* Replaces all occurrences of a substring that matches a regular expression pattern within a string column with a specified replacement string.
* str: The column to apply the replacement to.
* pattern: The regular expression pattern to search for.
* replacement: The string to replace the matched patterns with.

In [4]:
from pyspark.sql import functions as F

# Two short phrases, each containing a single space.
data = [("hello world",), ("pyspark dataframe",)]
df = spark.createDataFrame(data, schema=["text"])
df.show()  # show the input before any replacement

# Swap every whitespace character (\s) for an underscore and keep the
# result next to the original in a new 'cleaned_text' column.
replaced_df = df.select(
    "*",
    F.regexp_replace(df["text"], r"\s", "_").alias("cleaned_text"),
)
replaced_df.show()

+-----------------+
|             text|
+-----------------+
|      hello world|
|pyspark dataframe|
+-----------------+

+-----------------+-----------------+
|             text|     cleaned_text|
+-----------------+-----------------+
|      hello world|      hello_world|
|pyspark dataframe|pyspark_dataframe|
+-----------------+-----------------+



pyspark.sql.functions.regexp_replace(string, pattern, replacement)  
**Parameters:**
1. **string:** This is the column containing the text where replacements will occur. It can be a Column object or a Python string giving the column's name (a plain Python string here is treated as a column name, not as literal text).
2. **pattern:** This is the regular expression pattern to search for within the string. It can be a Column object or a Python string representing the regex.
3. **replacement:** This is the string or column containing the value that will replace the matched pattern. It can be a Column object or a Python string.

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# getOrCreate() returns the session that is already running when there is one.
spark = (
    SparkSession.builder
    .appName("RegexpReplaceExample")
    .getOrCreate()
)

# Strings that mix letters with runs of digits.
data = [("abc123def456",), ("hello789world",)]
df = spark.createDataFrame(data, schema=["text"])

# Collapse each run of consecutive digits (\d+) into a single hyphen,
# e.g. "abc123def456" becomes "abc-def-".
df_replaced = df.withColumn(
    "cleaned_text",
    F.regexp_replace(df["text"], r"\d+", "-"),
)

df_replaced.show()

25/11/15 21:44:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+-------------+------------+
|         text|cleaned_text|
+-------------+------------+
| abc123def456|    abc-def-|
|hello789world| hello-world|
+-------------+------------+



In [16]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create the Spark session for this example (or reuse a running one).
spark = SparkSession.builder.appName("RegexpReplaceExample").getOrCreate()

# Sample strings that contain runs of underscores.
data = [("__abc123def456",), ("hello789__world",)]
df = spark.createDataFrame(data, ["text"])

# Remove underscores: every run of one or more "_" is replaced with an empty string.
# Digits are left untouched; the result goes into a new 'cleaned_text' column.
df_replaced = df.withColumn("cleaned_text", F.regexp_replace(F.col("text"), r"_+", ""))

df_replaced.show()

+---------------+-------------+
|           text| cleaned_text|
+---------------+-------------+
| __abc123def456| abc123def456|
|hello789__world|hello789world|
+---------------+-------------+

