# Intro to Regular Expressions
If you want to type along with me, use [this notebook](https://humboldt.cloudbank.2i2c.cloud/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fbethanyj0%2Fdata271_sp25&branch=main&urlpath=tree%2Fdata271_sp25%2Flectures%2Fdata271_lec05_live.ipynb) instead. 
If you don't want to type and want to follow along just by executing the cells, stay in this notebook. 

In [None]:
import re 

## Regular Expression Functions in `re`

In [None]:
text =  """Knock, Knock. 
Who's there? 
Utah. 
Utah who? 
Utah-king to me?"""

In [None]:
# search returns a Match object for the first place the pattern occurs (or None if it never occurs).
# We don't usually use this much; it is mostly handy with large texts when we just want to know if something is there.
re.search('Utah',text)

In [None]:
# search returns None when the pattern isn't found; print is used so that the None shows up in the output
print(re.search('271',text))

In [None]:
# findall catches all occurrences and puts them into a list
re.findall('Utah',text)

In [None]:
# Using len with findall allows us to count occurrences
len(re.findall('Utah',text))

In [None]:
# split cuts the text at every match of the pattern and returns the pieces in a list.
# NOTE: 'Utah' isn't necessarily a sensible thing to split by; it just shows how split behaves
re.split('Utah',text)

In [None]:
# to split by line
re.split('\n',text)

In [None]:
# sub replaces all occurrences of the pattern with the replacement string
re.sub('Knock',"\U0001F44A",text)

In [None]:
# you can replace just one if needed
re.sub('Knock',"\U0001F44A",text,count=1)

## Character classes

In [None]:
joke = "Knock Knock Who's there? 2:30 2:30 who? I made a dentist appointment cause my 2:30 (tooth hurty)"

In [None]:
# To find the digits (\d matches any single digit).
# The pattern is a raw string (r'...') so that Python passes the backslash through to re untouched.
re.findall(r'\d',joke)

In [None]:
# To get every character that isn't a digit.
# The pattern is a raw string (r'...') so that Python passes the backslash through to re untouched.
re.findall(r'\D',joke)

In [None]:
# Something similar
# \w matches word characters: letters, digits and the underscore.
# It doesn't include the other special characters or white space.
# The pattern is a raw string (r'...') so that Python passes the backslash through to re untouched.
re.findall(r'\w',joke)

In [None]:
# To get everything that isn't a word character: the special characters, plus the white spaces.
# The pattern is a raw string (r'...') so that Python passes the backslash through to re untouched.
re.findall(r'\W',joke)

In [None]:
# To just get the white spaces.
# The pattern is a raw string (r'...') so that Python passes the backslash through to re untouched.
re.findall(r'\s',joke)

In [None]:
# To get everything but white spaces.
# The pattern is a raw string (r'...') so that Python passes the backslash through to re untouched.
re.findall(r'\S',joke)

## Sets

In [None]:
string = "I like going to Cal Poly Humboldt, and DATA 271 is a fun class."

In [None]:
# gets characters, C, a, or l, but not 'Cal'
re.findall('[Cal]',string)

In [None]:
# get the lowercase characters between a and m
re.findall('[a-m]',string) 

In [None]:
# get the lowercase and uppercase letters (a to z and A to Z)
# NOTE: the upper range must end in a capital Z; the range A-z would also match [ \ ] ^ _ and `
re.findall('[a-zA-Z]',string)

In [None]:
# get the lowercase and uppercase letters (a to z and A to Z) and the digits 0 to 9
# NOTE: the upper range must end in a capital Z; the range A-z would also match [ \ ] ^ _ and `
re.findall('[a-zA-Z0-9]',string)

In [None]:
# just the numbers 0 through 5
re.findall('[0-5]',string)

In [None]:
# Get every single character (. matches any character except a newline)
re.findall('.',string)

## Special characters: Specifying number of occurrences

- `*` zero or more
- `+` 1 or more
- `?` zero or one

In [None]:
tongue_twister = 'How much wood would a woodchuck chuck if a woodchuck could chuck wood?'

In [None]:
# Match wo and then any character zero or more times (greedy)
re.findall('wo.*',tongue_twister)

In [None]:
# Match wo and then any character zero or more times followed by i (greedy)
re.findall('wo.*i',tongue_twister)

In [None]:
# Match wo and then any character 1 or more times followed by d (greedy)
re.findall('wo.+d',tongue_twister)

In [None]:
# Match wo and then any character 1 or more times followed by d (not greedy)
re.findall('wo.+?d',tongue_twister)

In [None]:
# Match w, then o 0 or more times, followed by d
re.findall('wo*d',tongue_twister)

In [None]:
# Match woo and then any character 0 or 1 times followed by d
re.findall('woo.?d',tongue_twister)

In [None]:
# Match w then o 0 or more times then u 0 or 1 times then l 0 or 1 times then d
re.findall('wo*u?l?d',tongue_twister)

In [None]:
# | means "or": match either wood or would
re.findall('wood|would',tongue_twister)

In [None]:
# Special characters can't be searched for directly. We can't search for question marks like this:
# ? is a quantifier and there is nothing in front of it to repeat, so this cell raises an error
re.findall('?',tongue_twister)

In [None]:
# We need to escape the question mark with a backslash.
# The pattern is a raw string (r'...') so that Python passes the backslash through to re untouched.
re.findall(r'\?',tongue_twister)

In [None]:
# ^ anchors the match to the start of the string.
# The string doesn't start with 'wood', so this returns an empty list
re.findall('^wood',tongue_twister)

In [None]:
# ^ anchors the match to the start of the string.
# The string does start with 'How', so this finds one match
re.findall('^How',tongue_twister)