# Regex

In [100]:
import pandas as pd
import re

## Exercise 1

Write a function named `is_vowel`. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of `re.search` as a boolean value that indicates whether or not the regular expression matches the given string.

In [101]:
def is_vowel(subject):
    '''
    This function takes the first letter of a string and returns True if that string is a vowel.
    '''
    regexp = r'[AEUOaeiou]'
    
    if re.match(regexp, subject):
        return True
    else:
        return False

In [83]:
is_vowel('a')

True

In [84]:
is_vowel('b')

False

## Exercise 2 

Write a function named `is_valid_username` that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either `True` or `False` depending on whether the passed string is a valid username.

In [139]:
def is_valid_username(username):
    '''
    This function verifies whether or not a username is in the valid format.
    '''
    #Note that \w indicates any alphanumeric character, including the underscore _
    rexexp = '^[a-z]\w{1,31}$'
    
    if re.match(regexp, username):
        return True
    else:
        return False

In [140]:
#Begins with an underscore
is_valid_username("_username")

False

In [141]:
#Capital letter
is_valid_username("MY_username")

False

In [142]:
#Begins with number
is_valid_username("10dollars")

False

In [143]:
#Invalid character
is_valid_username("dollars@")

False

In [144]:
#Too long
is_valid_username("ThisismyusernameitstoolongbutIlikelongthings")

False

In [145]:
#Valid 
is_valid_username("user_99_")

True

## Exercise 3

Write a regular expression to capture phone numbers. It should match all of the following:


- (210) 867 5309
- +1 210.867.5309
- 867-5309
- 210-867-5309

In [185]:
regexp = r'''
(?P<country>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<subscriber_number>\d{4})
'''

In [186]:
match = re.search(regexp, "(210) 867 5309", re.VERBOSE)
match.groupdict()

{'country': None,
 'area_code': '210',
 'exchange_code': '867',
 'subscriber_number': '5309'}

In [187]:
match = re.search(regexp, "+1 210.867.5309", re.VERBOSE)
match.groupdict()

{'country': '+1',
 'area_code': '210',
 'exchange_code': '867',
 'subscriber_number': '5309'}

In [188]:
match = re.search(regexp, "867-5309", re.VERBOSE)
match.groupdict()

{'country': None,
 'area_code': None,
 'exchange_code': '867',
 'subscriber_number': '5309'}

In [189]:
match = re.search(regexp, "210-867-5309", re.VERBOSE)
match.groupdict()

{'country': None,
 'area_code': '210',
 'exchange_code': '867',
 'subscriber_number': '5309'}

In [269]:
regexp = r'(\+\d+)?\D*?(\d{3})?\D*?(\d{3})\D*?(\d{4})'

In [270]:
sentence = '''
Here are some phone number formats: (210) 867 5309, +1 210.867.5309, 867-5309, and, of course, 210-867-5309. 
'''.strip()

In [271]:
match = re.findall(regexp, sentence)
match

[('', '210', '867', '5309'),
 ('+1', '210', '867', '5309'),
 ('', '', '867', '5309'),
 ('', '210', '867', '5309')]

## Exercise 4 

Use regular expressions to convert the dates below to the standardized year-month-day format.

- 02/04/19
- 02/05/19
- 02/06/19
- 02/07/19
- 02/08/19
- 02/09/19
- 02/10/19

In [190]:
regexp = r'''
(\d{1,2})/(\d{1,2})/(\d{2})
'''

In [200]:
def convert_to_year_month_day(date): 
    return re.sub(r'(\d{1,2})/(\d{1,2})/(\d{2})', '20\\3-\\1-\\2', date)

In [201]:
convert_to_year_month_day("02/04/19")

'2019-02-04'

In [202]:
convert_to_year_month_day("02/05/19")

'2019-02-05'

## Exercise 5

Write a regex to extract the various parts of these logfile lines:

In [131]:
lines = '''
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
'''

In [6]:
logfile1 = '''
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
'''

In [7]:
logfile2 = '''
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
'''

In [8]:
logfile3 = '''
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
'''

In [127]:
regexp = r'''
(?P<action>[A-Z]+)
\s
(?P<path>.*)?
\s
(?P<timestamp>.*)?
\s
(?P<hypertext_version>.*)?
\s
(?P<status_code>.*)?
\s
(?P<bytes>.*)?
\s
"(?P<request>.*)"
\s
(?P<ip_address>.*)
'''

In [128]:
match1 = re.search(regexp, logfile1, re.VERBOSE)
match1.groupdict()

{'action': 'GET',
 'path': '/api/v1/sales?page=86',
 'timestamp': '[16/Apr/2019:193452+0000]',
 'hypertext_version': 'HTTP/1.1',
 'status_code': '{200}',
 'bytes': '510348',
 'request': 'python-requests/2.21.0',
 'ip_address': '97.105.19.58'}

In [129]:
match2 = re.search(regexp, logfile2, re.VERBOSE)
match2.groupdict()

{'action': 'POST',
 'path': '/users_accounts/file-upload',
 'timestamp': '[16/Apr/2019:193452+0000]',
 'hypertext_version': 'HTTP/1.1',
 'status_code': '{201}',
 'bytes': '42',
 'request': 'User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
 'ip_address': '97.105.19.58'}

In [130]:
match3 = re.search(regexp, logfile3, re.VERBOSE)
match3.groupdict()

{'action': 'GET',
 'path': '/api/v1/items?page=3',
 'timestamp': '[16/Apr/2019:193453+0000]',
 'hypertext_version': 'HTTP/1.1',
 'status_code': '{429}',
 'bytes': '3561',
 'request': 'python-requests/2.21.0',
 'ip_address': '97.105.19.58'}

In [139]:
logfile_df = pd.DataFrame([re.search(regexp, line, re.VERBOSE).groupdict() for line in lines.strip().split('\n')])
logfile_df

Unnamed: 0,action,path,timestamp,hypertext_version,status_code,bytes,request,ip_address
0,GET,/api/v1/sales?page=86,[16/Apr/2019:193452+0000],HTTP/1.1,{200},510348,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,[16/Apr/2019:193452+0000],HTTP/1.1,{201},42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,[16/Apr/2019:193453+0000],HTTP/1.1,{429},3561,python-requests/2.21.0,97.105.19.58


In [140]:
logfile_df.T

Unnamed: 0,0,1,2
action,GET,POST,GET
path,/api/v1/sales?page=86,/users_accounts/file-upload,/api/v1/items?page=3
timestamp,[16/Apr/2019:193452+0000],[16/Apr/2019:193452+0000],[16/Apr/2019:193453+0000]
hypertext_version,HTTP/1.1,HTTP/1.1,HTTP/1.1
status_code,{200},{201},{429}
bytes,510348,42,3561
request,python-requests/2.21.0,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,python-requests/2.21.0
ip_address,97.105.19.58,97.105.19.58,97.105.19.58


## Bonus 

You can find a list of words on your mac at /usr/share/dict/words. Use this file to answer the following questions:


- How many words have at least 3 vowels?
- How many words have at least 3 vowels in a row?
- How many words have at least 4 consonants in a row?
- How many words start and end with the same letter?
- How many words start and end with a vowel?
- How many words contain the same letter 3 times in a row?
- What other interesting patterns in words can you find?

In [54]:
words = pd.read_csv("/usr/share/dict/words")

In [60]:
words.head()

Unnamed: 0,A
0,a
1,aa
2,aal
3,aalii
4,aam


In [62]:
word_list = words.A.to_list()

In [66]:
word_list[0:20]

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'aba',
 'Ababdeh',
 'Ababua',
 'abac',
 'abaca']

In [108]:
three_vowels = [ ]
for word in word_list:
    rx = r"\b(?:\w*[aeiyouAEIYOU]){3,}\w*"
    if re.findall(rx, str(word)):
        three_vowels.append(word)

In [109]:
print("There are", len(three_vowels), "words with three vowels in our dictionary")

There are 200494 words with three vowels in our dictionary


In [110]:
three_vowels_in_a_row = [ ]
for word in word_list:
    rx = r"\b(?=[a-z]*[AEIOUaeiou]{3,})[a-z]+\b"
    if re.findall(rx, str(word)):
        three_vowels_in_a_row.append(word)

In [113]:
print("There are", len(three_vowels_in_a_row), "words with three or more consecutive vowels in our dictionary")

There are 4761 words with three or more consecutive vowels in our dictionary


In [114]:
four_consonants_in_a_row = [ ]
for word in word_list:
    rx = r"\b(?=[a-z]*[^AEIOUaeiou]{4,})[a-z]+\b"
    if re.findall(rx, str(word)):
        four_consonants_in_a_row.append(word)

In [117]:
print("There are", len(four_consonants_in_a_row), "words with four or more consecutive consonants in our dictionary")

There are 17512 words with four or more consecutive consonants in our dictionary


In [120]:
same_letter_start_end = [ ]
for word in word_list:
    rx = r'^[a-z]$|^([a-z]).*\1$'
    if re.findall(rx, str(word)):
        same_letter_start_end.append(word)

In [122]:
print("There are", len(same_letter_start_end ), "words that start and end with the same letter in our dictionary.")

There are 9944 words that start and end with the same letter in our dictionary.


In [124]:
vowel_start_end = [ ]
for word in word_list:
    rx = r'^[AEIOUaeiou]\w*[AEIOUaeiou]$'
    if re.findall(rx, str(word)):
        vowel_start_end.append(word)

In [126]:
print("There are", len(vowel_start_end), "words that start and end with a vowel in our dictionary.")

There are 14657 words that start and end with a vowel in our dictionary.
