# More Regular Expressions
If you want to type along with me, use [this notebook](https://humboldt.cloudbank.2i2c.cloud/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fbethanyj0%2Fdata271_sp24&branch=main&urlpath=tree%2Fdata271_sp24%2Fdemos%2Fdata271_demo7_live.ipynb) instead. 
If you don't want to type and want to follow along just by executing the cells, stay in this notebook. 

In [None]:
import re 

## Questions from last time

In [None]:
# How to match the last occurrence?
# re.search scans left to right and returns only the first match,
# so on its own it cannot reach the last 'mode'.
text = "I'm not lazy, I'm just on energy-saving mode. It's the best mode. - Unknown"
re.search('mode',text)

In [None]:
# re.finditer yields one match object per occurrence, in order;
# list() collects them, so the final element is the last occurrence.
list(re.finditer('mode', text))

In [None]:
# Can you split by a character without losing that character?
# Use a positive lookbehind (we'll talk more about this): (?<=\.) matches the
# empty position right after each period, so the period stays with its sentence.
# The pattern is a raw string so Python does not warn about the escape \. and
# hands the backslash to the regex engine unchanged.
re.split(r'(?<=\.)', text)

In [None]:
# Can we sub just the second occurrence?
# count=1 caps the number of replacements, but the one it replaces
# is the first match, not the second.
re.sub('mode','zone',text,count=1)

In [None]:
# Replace only the second occurrence with a single substitution:
# capture everything from the first 'mode' up to (lazily) the next 'mode',
# put the captured part back with \1, and rewrite just that final 'mode'.
# count=1 stops after this one replacement; re.DOTALL lets the gap between
# the two occurrences span line breaks.
# With fewer than two occurrences the text comes back unchanged.
re.sub('(mode.*?)mode', r'\1zone', text, count=1, flags=re.DOTALL)

In [None]:
# How to handle characters with accents?
# [a-z] spans only the ASCII lowercase letters a through z;
# accented letters fall outside that range.
str_with_accent = 'cómo'
re.findall('[a-z]',str_with_accent)

In [None]:
# Can use unicode characters:
# \u00C0-\u017F adds the code points U+00C0 through U+017F to the set
re.findall('[a-z\u00C0-\u017F]',str_with_accent)

In [None]:
# Or can type the accented characters directly as the endpoints of a range
re.findall('[a-zá-ź]',str_with_accent)

In [None]:
# Or \w will capture them: for str patterns, \w matches Unicode word characters.
# Raw string so Python does not warn about the escape \w.
re.findall(r'\w', str_with_accent)

## Repetitions

In [None]:
# Sample sentence searched by the repetition examples
string = "She sells seashells by the seashore."

In [None]:
# get patterns that start with s and end with a, with exactly one character
# (any character) in between
re.findall('s.a',string)

In [None]:
# get patterns that start with s and end with s, with exactly one character
# (any character) in between
re.findall('s.s',string)

In [None]:
# get patterns that start with s and end with s with any 2 characters in between;
# {2} repeats the preceding . exactly twice
re.findall('s.{2}s',string)

In [None]:
# another way: write the dot twice instead of using {2}
re.findall('s..s',string)

In [None]:
# exactly three characters in between s and s
re.findall('s.{3}s',string)

In [None]:
# between 1 and 4 characters between s and s
# (greedy: tries the longest allowed run first)
re.findall('s.{1,4}s',string)

In [None]:
# same greedy pattern, but searching only the part of the string that comes
# after the first lowercase s (string.find('s') gives that s's index)
re.findall('s.{1,4}s',string[string.find('s')+1:])

In [None]:
# between 1 and 4 characters between s and s
# (lazy: the ? after {1,4} tries the shortest allowed run first)
re.findall('s.{1,4}?s',string[string.find('s')+1:])

In [None]:
# between 1 and 4 characters (except s) between s and s;
# a caret (^) at the start of [] negates the set
re.findall('s[^s]{1,4}s',string[string.find('s')+1:])

In [None]:
# between 1 and 4 characters between s and s (not including the last s):
# POSITIVE LOOK AHEAD (?=s) requires an s to come next without consuming it
re.findall('s.{1,4}(?=s)',string)

In [None]:
# between 1 and 4 characters between s and s (not including the last s),
# lazy version: stops at the nearest following s
re.findall('s.{1,4}?(?=s)',string)

In [None]:
# 2 or more characters between s and s (greedy: {2,} has no upper bound,
# so the match stretches to the last s it can reach)
re.findall('s.{2,}s',string)

In [None]:
# 2 or more characters between s and s (lazy: stops at the nearest s it can)
re.findall('s.{2,}?s',string)

## Grouping 

In [None]:
# Sample text for the grouping examples (rebinds the name text)
text = 'apple banana appleappleapple applee orange'

In [None]:
# + applies only to the single character immediately to its left,
# so here it repeats the final e, not the whole word
re.findall('apple+',text)

In [None]:
# If we want + to repeat a whole word, we wrap the word in a group
re.findall('(apple)+',text)

In [None]:
# note that findall only returns the text captured by the group
# (here the part before the @; the @ itself is matched but not returned)
email = 'myusername123@email.com'
re.findall('([a-z0-9]+)@',email)

In [None]:
# another example: using grouping to pull several pieces of info out of each match
statement = 'Mary has 3 cats. Ben had 2 dogs. Maya has 14 chickens, and April has 1 alpaca.'

In [None]:
# get all the statements in the form "person has or had # pets":
# word, whitespace, word, whitespace, digits, whitespace, word.
# Raw string so Python does not warn about the escapes \s and \d.
re.findall(r'[A-Za-z]+\s[A-Za-z]+\s\d+\s[A-Za-z]+', statement)

In [None]:
# if I only care about the people, put a group around the first word;
# findall then returns just that group for each match.
# Raw string so Python does not warn about the escapes \s and \d.
re.findall(r'([A-Za-z]+)\s[A-Za-z]+\s\d+\s[A-Za-z]+', statement)

In [None]:
# if I care about the people and the number of pets they have or had,
# group both parts; findall returns one tuple per match.
# Raw string so Python does not warn about the escapes \s and \d.
re.findall(r'([A-Za-z]+)\s[A-Za-z]+\s(\d+)\s[A-Za-z]+', statement)

In [None]:
# if I care about the people and the number of pets and the type of pet,
# group all three parts.
# Raw string so Python does not warn about the escapes \s and \d.
re.findall(r'([A-Za-z]+)\s[A-Za-z]+\s(\d+)\s([A-Za-z]+)', statement)

In [None]:
# Capture a (name, count, pet) tuple for every "<name> has/had <n> <pets>" phrase.
# Raw string so Python does not warn about the escapes \s and \d.
important_info = re.findall(r'([A-Za-z]+)\s[A-Za-z]+\s(\d+)\s([A-Za-z]+)', statement)
# The first item of each tuple is the person's name.
names = [person for person, _count, _pet in important_info]
names

In [None]:
# The third item of each captured tuple is the type of pet.
pets = [animal for _, _, animal in important_info]
pets

## Searching for special characters

In [None]:
# Sample sentence that ends in a regex special character (?)
question = "What is the meaning of life?"

In [None]:
# To search for specific special characters
# NOTE: a bare ? is a quantifier with nothing before it to repeat, so this call
# raises re.error -- presumably left in on purpose to motivate escaping.
re.findall('?',question)

In [None]:
# We have to escape the ? with a backslash so it is treated as a literal character.
# Raw string so Python does not warn about the escape \? and passes the
# backslash to the regex engine unchanged.
re.findall(r'\?', question)

In [None]:
# We don't have to escape if it's inside of a set:
# within [] the ? is just one of the characters to match
re.findall('[li?]',question)

## Raw strings

In [None]:
# A string in python: \t is interpreted as a tab character
print('\tTab')

In [None]:
# A raw string in python: the backslash is kept literally, so \t prints as written
print(r'\tTab')

In [None]:
# In a normal string \n is a newline, so this prints on two lines
print('Line1\nLine2')

In [None]:
# In a raw string \n stays a backslash followed by n, so this prints on one line
print(r'Line1\nLine2')

In [None]:
# Raw string: without the r prefix, Python would try to read \u and \b
# in this path as escape sequences
example_file_path = r"C:\users\blah.txt"

In [None]:
# In a raw string, \\ reaches the regex engine as two backslashes,
# which is the regex for one literal backslash
re.findall(r"users\\",example_file_path)

In [None]:
# Without a raw string every backslash must be doubled again: the four backslashes
# here become two in the pattern, which match one literal backslash
re.findall('users\\\\',example_file_path)

## Activities

1. Extract all the words that contain two or more vowels in a row from the quote below. 

In [None]:
# Text to search for activity 1
quote = "The best and most beautiful things in the world cannot be seen or even touched - they must be felt with the heart. -Helen Keller"

2. Extract all the emoticons like `:)` or `:-)` etc. 

In [None]:
# Text to search for activity 2
greeting = "Hi! :D It is so nice to meet you! :-) I wish I could stay and chat :P but I have to go. :( See ya. D,:"

3. Extract the year, month, and day for each date in the list.

In [None]:
# Dates for activity 3; every entry has the shape NN-NN-NNNN
dates = ['01-01-2001','02-02-2002','03-03-2003','04-04-2004','05-05-2005','06-06-2006',
         '07-07-2007','08-08-2008','09-09-2009','10-10-2010','11-11-2011','12-12-2012']

# Placeholders (Ellipsis): replace each ... with your extracted values
year = ...
month = ...
day = ...