# Regular Expressions Exercises

In [1]:
# import regex and pandas
import re
import pandas as pd

### Exercise I
Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [2]:
def is_vowel(string):
    
    '''
    This function returns bool True if re determines the passed string is a single vowel.
    '''
    
    return bool(re.search(r'^[aeiou]{1}$', string, re.IGNORECASE))

In [3]:
# test function
if __name__ == '__main__':
    print(is_vowel('n'))
    print(is_vowel('E'))
    print(is_vowel('EEEEEEeeeee'))

False
True
False


### Exercise II
Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.


* is_valid_username('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
    * False
*  is_valid_username('codeup')
    * True
*  is_valid_username('Codeup')
    * False
*  is_valid_username('codeup123')
    * True
*  is_valid_username('1codeup')
    * False

In [4]:
def is_valid_username(string):
    
    '''
    This function returns bool True if re determines an eligible match found for something that starts with any lower
    case letter, and any alphanumeric character or underscore for the next 31 characters.
    ''' 
     
    return bool(re.search(r'^[a-z]\w{,31}$', string))

In [5]:
# test functionality

if __name__ == '__main__':
    print(is_valid_username('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'))
    print(is_valid_username('codeup'))
    print(is_valid_username('Codeup'))
    print(is_valid_username('codeup123'))
    print(is_valid_username('1codeup'))

False
True
False
True
False


### Exercise III
Write a regular expression to capture phone numbers. It should match all of the following:

* (210) 867 5309
* +1 210.867.5309
* 867-5309
* 210-867-5309

In [6]:
def return_phone_number(string):
    
    '''
    This function accepts a string, a phone number, and returns the string of the phone number (and more)
    if re determines that it is a phone number.
    '''
    
    return re.search(r'.?\d{3}.?\d{4}|.*?\d{3}.*?\d{3}.*?\d{4}', string)

In [7]:
# test functionality
if __name__ == '__main__':
    print(return_phone_number('(210) 867 5309'))
    print(return_phone_number('+1 210.867.5309'))
    print(return_phone_number('867-5309'))
    print(return_phone_number('210-867-5309'))

<re.Match object; span=(0, 14), match='(210) 867 5309'>
<re.Match object; span=(0, 15), match='+1 210.867.5309'>
<re.Match object; span=(0, 8), match='867-5309'>
<re.Match object; span=(0, 12), match='210-867-5309'>


### Alternate method using both pandas and regex provided by Orsinger

In [8]:
# design phone search algorithm
phone_search = re.compile(
'''
^(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
''', re.VERBOSE)

'''
This algorithm searches for numbers that may (optional) start with a country code; it searches for and 
identifies a country code by searching for a literal "+" (\+) sign and then one or more (+) of any digit (\d). 
Optional (?). It then searches for anything that isn't a digit (\D) immediately after searching for area code 
(white-space odds). Optional (?). It then searches for an area code by looking for {3} digits (\d). Optional (?).  
It then checks for optional white space again. It then searches for an exchange code, {3} digits (\d). It then 
checks for optional white space again. It then searches for a line_number of {4} digits (\d). It then declares this code has freedom of speech, and should not be taken ligthly. End journal.
'''

df = pd.DataFrame()    # create empty dataframe
df['number'] = [    # assign number column to baby dataframe
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '210-867-5309',
    '2108675309',
]

df   # watch baby dataframe grow up

Unnamed: 0,number
0,(210) 867 5309
1,+1 210.867.5309
2,867-5309
3,210-867-5309
4,2108675309


In [9]:
# extract text from number column using defined phone search algorithm above and concatenate with baby df
df = pd.concat([df, df.number.str.extract(phone_search)], axis = 1)

# watch dataframe succeed in life
df

Unnamed: 0,number,country_code,area_code,exchange_code,line_number
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309
4,2108675309,,210.0,867,5309


### Exercise IV
Use regular expressions to convert the dates below to the standardized year-month-day format.

* 02/04/19
* 02/05/19
* 02/06/19
* 02/07/19
* 02/08/19
* 02/09/19
* 02/10/19

In [10]:
# assign variable to string for testing
string = '02/04/19'

# find all instances of 2 digits
re.findall(r'\d{2}', string)

['02', '04', '19']

In [11]:
# split string on '/'
re.split(r'/', string)

['02', '04', '19']

In [12]:
# substitute 4 digit years for 2 digit years
re.sub(r'\d{2}$', '2019', string)

'02/04/2019'

In [13]:
new_dates = []
dates = ['02/04/19', '02/05/19', '02/06/19', '02/07/19', '02/08/19', '02/09/19', '02/10/19']
for string in dates:
    new_dates.append(re.sub(r'\d{2}$', '2019', string))

new_dates

['02/04/2019',
 '02/05/2019',
 '02/06/2019',
 '02/07/2019',
 '02/08/2019',
 '02/09/2019',
 '02/10/2019']

In [14]:
late_dates = pd.DataFrame(dates)
late_dates['new_date'] = new_dates
late_dates.columns = ['original', 'new_date']
late_dates

Unnamed: 0,original,new_date
0,02/04/19,02/04/2019
1,02/05/19,02/05/2019
2,02/06/19,02/06/2019
3,02/07/19,02/07/2019
4,02/08/19,02/08/2019
5,02/09/19,02/09/2019
6,02/10/19,02/10/2019


In [15]:
date_search = re.compile(
r'''
(?P<month>\d{2})/
(?P<day>\d{2})/
(?P<year>\d{4})
''', re.VERBOSE)

dates = pd.concat([late_dates, late_dates.new_date.str.extract(date_search)], axis = 1)
dates['standardized_format'] = dates.year + '-' + dates.month + '-' + dates.day
dates

Unnamed: 0,original,new_date,month,day,year,standardized_format
0,02/04/19,02/04/2019,2,4,2019,2019-02-04
1,02/05/19,02/05/2019,2,5,2019,2019-02-05
2,02/06/19,02/06/2019,2,6,2019,2019-02-06
3,02/07/19,02/07/2019,2,7,2019,2019-02-07
4,02/08/19,02/08/2019,2,8,2019,2019-02-08
5,02/09/19,02/09/2019,2,9,2019,2019-02-09
6,02/10/19,02/10/2019,2,10,2019,2019-02-10


### Exercise V
Write a regex to extract the various parts of these logfile lines:


GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58

POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58

GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58

In [16]:
# define list 'lines' with logfile line data 
lines = [
    """GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58""",
    """POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58""",
    """GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58"""
]

lines    # view lines

['GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
 'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
 'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58']

In [17]:
# courtesy of the mythic Orsinger
log_search = re.compile(r'''
(?P<method>GET|POST)
\s
(?P<path>[/\w\-\?=]+)
\s
\[(?P<timestamp>.+)\]
\s
(?P<http_version>HTTP/\d+\.\d+)
\s
\{(?P<status_code>\d+)\}
\s
(?P<bytes>\d+)
\s
"(?P<user_agent>.+)"
\s
(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.d{1,3})
$
''', re.VERBOSE)
                        

In [18]:
rows = [re.search(log_search, line).groupdict() for line in lines]
rows

AttributeError: 'NoneType' object has no attribute 'groupdict'

### Bonus Exercise

You can find a list of words on your mac at /usr/share/dict/words. Use this file to answer the following questions:


- How many words have at least 3 vowels?
- How many words have at least 3 vowels in a row?
- How many words have at least 4 consonants in a row?
- How many words start and end with the same letter?
- How many words start and end with a vowel?
- How many words contain the same letter 3 times in a row?
- What other interesting patterns in words can you find?