# Regex
- Docs <a href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.regexp_extract.html"> [link]</a>
- Regex patterns follow Java regular-expression syntax<br>
- W3Schools, Java Regular Expressions<a href="https://www.w3schools.com/java/java_regex.asp"> [link]</a>

### Material for studying

In [1]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("Regex")
    .getOrCreate()
)

# Sample rows mixing digits, upper/lower-case words, punctuation and "@"/"#"
# prefixes, so the regex cells below have both matching and non-matching rows.
columns = ['id', 'text']
data = [
    (1, 'I love you 3000. now It turned into 2000'),
    (2, 'Hello, World'),
    (3, 'Every day NEW life!'),
    (4, '@HAPPY #SPARK'),
    (5, '#I don\'t know you'),
    (6, '@2000'),
    (7, '@2000:'),
    (8, '2@2000'),
]

# Build the DataFrame directly from the local list; going through
# sparkContext.parallelize(...).toDF(...) adds an RDD round-trip for no benefit.
df = spark.createDataFrame(data, columns)
df.show()

+---+--------------------+
| id|                text|
+---+--------------------+
|  1|I love you 3000. ...|
|  2|        Hello, World|
|  3| Every day NEW life!|
|  4|       @HAPPY #SPARK|
|  5|   #I don't know you|
|  6|               @2000|
|  7|              @2000:|
|  8|              2@2000|
+---+--------------------+



### Test as much as you want!

In [2]:
from pyspark.sql.functions import regexp_extract

In [3]:
# regexp_extract returns the capture group selected by its third argument, so
# the part to extract must be wrapped in "(" and ")" to be available as group 1.
# "\d+" matches a run of one or more digits; the first run in each row is returned.
# The pattern is a raw string so "\d" is not parsed as a Python escape sequence.
df.select('id', regexp_extract('text', r'(\d+)', 1).alias('d')).collect()

[Row(id=1, d='3000'),
 Row(id=2, d=''),
 Row(id=3, d=''),
 Row(id=4, d=''),
 Row(id=5, d=''),
 Row(id=6, d='2000'),
 Row(id=7, d='2000'),
 Row(id=8, d='2')]

In [4]:
# Without the "+" quantifier, "\d" matches exactly one digit, so only the first
# digit of each row is returned (compare with the "(\d+)" cell above, where "+"
# extends the match over the whole run of consecutive digits).
# The pattern is a raw string so "\d" is not parsed as a Python escape sequence.
df.select('id', regexp_extract('text', r'(\d)', 1).alias('d')).collect()

[Row(id=1, d='3'),
 Row(id=2, d=''),
 Row(id=3, d=''),
 Row(id=4, d=''),
 Row(id=5, d=''),
 Row(id=6, d='2'),
 Row(id=7, d='2'),
 Row(id=8, d='2')]

In [5]:
# "[a-z]+" matches a run of one or more lowercase letters a-z; the first such
# run in each row is returned, so uppercase letters and digits are skipped over.
lowercase_run = regexp_extract('text', '([a-z]+)', 1).alias('d')
df.select('id', lowercase_run).collect()

[Row(id=1, d='love'),
 Row(id=2, d='ello'),
 Row(id=3, d='very'),
 Row(id=4, d=''),
 Row(id=5, d='don'),
 Row(id=6, d=''),
 Row(id=7, d=''),
 Row(id=8, d='')]

In [6]:
# Inside brackets a leading "^" negates the class: "[^#abc]" matches a single
# character that is none of "#", "a", "b" or "c", so each row yields its first
# such character.
first_not_hash_abc = regexp_extract('text', '([^#abc])', 1).alias('d')
df.select('id', first_not_hash_abc).collect()

[Row(id=1, d='I'),
 Row(id=2, d='H'),
 Row(id=3, d='E'),
 Row(id=4, d='@'),
 Row(id=5, d='I'),
 Row(id=6, d='@'),
 Row(id=7, d='@'),
 Row(id=8, d='2')]

In [7]:
# Outside a character class, "^" anchors the match to the start of the text
# (e.g. "^Hello" only matches text that starts with "Hello").
# "^@[a-z]" needs "@" immediately followed by a lowercase letter at the very
# start; no sample row has that, so every result is the empty string.
leading_at_lowercase = regexp_extract('text', '(^@[a-z])', 1).alias('d')
df.select('id', leading_at_lowercase).collect()

[Row(id=1, d=''),
 Row(id=2, d=''),
 Row(id=3, d=''),
 Row(id=4, d=''),
 Row(id=5, d=''),
 Row(id=6, d=''),
 Row(id=7, d=''),
 Row(id=8, d='')]

In [8]:
# "\w" is a word character, short for [a-zA-Z_0-9], so "^@\w+" matches a
# leading "@" followed by one or more word characters.
# The pattern is a raw string so "\w" is not parsed as a Python escape sequence.
df.select('id', regexp_extract('text', r'(^@\w+)', 1).alias('d')).collect()

[Row(id=1, d=''),
 Row(id=2, d=''),
 Row(id=3, d=''),
 Row(id=4, d='@HAPPY'),
 Row(id=5, d=''),
 Row(id=6, d='@2000'),
 Row(id=7, d='@2000'),
 Row(id=8, d='')]

In [9]:
# The text must start with "@" plus word characters AND be followed by ":".
# The "[:]" sits outside the parentheses, so it is required for a match but is
# not part of the captured group 1 that gets returned.
# The pattern is a raw string so "\w" is not parsed as a Python escape sequence.
df.select('id', regexp_extract('text', r'(^@\w+)[:]', 1).alias('d')).collect()

[Row(id=1, d=''),
 Row(id=2, d=''),
 Row(id=3, d=''),
 Row(id=4, d=''),
 Row(id=5, d=''),
 Row(id=6, d=''),
 Row(id=7, d='@2000'),
 Row(id=8, d='')]