### ``Exercises: REGULAR EXPRESSION ("regex")``

    29AUGUST2022

----

In [64]:
# notebook dependencies 
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# regular expression import
import re

### ``Exercises Number 1: Write a function named is_vowel``

**It should accept a string as input and use a regular expression to determine if the passed string is a vowel.** 

*While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.*

In [65]:
# function created to check if string/character is a vowel
# if string/character is vowel, function returns True
# if string/character != vowel, function returns False

vowel = ["a", "e", "i", "o", "u"]

def is_vowel(x):
    
    search_result = re.search(r"^[aeiou]{1}$", x, re.IGNORECASE)

    if search_result != None:

        return True

    else:

        return False

In [66]:
# checking the function

is_vowel("p")

False

In [67]:
# checking the function

is_vowel("o")

True

----

### ``Exercise Number 2: Write a function named is_valid_username that accepts a string as input.``

**A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character.**

- It should also be no longer than 32 characters.
- The function should return either True or False depending on whether the passed string is a valid username.

In [68]:
# function to check if username is valid
# username must be equal to, or less than 32 characters
# username cannot contain uppercase letters 
# username cannot contain non-alphanumeric characters (e.g., @, #, %, etc.)

def is_valid_username(username):
    
    # verifying the username is 32 or less characters
    if len(username) <= 32:

        '''RegEx parses thru text/username, finds any character
        not in expression: [a-z0-9_]. Any found characters are saved to a list variable.
        If variable list is empty then username is valid.'''

        search_result = re.search(r'^[a-z][a-z0-9_]$', username)

        if search_result != None:

            return print("Valid Username.")

        else:

            return print("Invalid username. Try again.")
    else:
        print("Invalid username. Length of username must be <= 32 characters. Try again.")

In [69]:
# checking the function -- username should be correct!

is_valid_username("baseball_12") # checks out!

Invalid username. Try again.


In [70]:
# checking the function -- where the "B" is capitalized

is_valid_username("Baseball_12") # checks out!

Invalid username. Try again.


In [71]:
# checking the function -- where there is a "@" in the username

is_valid_username("bas@seball_12") # checks out!

Invalid username. Try again.


----
### ``Exercise Number 3: Write a regular expression to capture phone numbers. It should match all of the following:``

**<u>Sample Phone Numbers:</u>**

* (210) 867 5309
* +1 210.867.5309
* 867-5309
* 210-867-5309

In [72]:
# let's create a list of phone numbers to iterate through

numbers = [
        '(210) 867 5309', \
        '+1 210.867.5309', \
        '867-5309', \
        '210-867-5309', \
        '678!09', \
        '+19, (222)'
]

def is_phone_number(phone_number):
    
    search_result = re.search(r'''
    # where first line is expressing: check if string begins with "+", if so - check the 0-3 characters characters that come after it (optional)
        (^\+?\d{0,3})

        # check if following characters are present (optional)
                ?\s?(\(?[0-9]{3}\)?)

                # check if following characters are present (optional)
                    ?.?[0-9]{3}.

                    # check if following characters are present (optional)
                        ?[0-9]{4}''', phone_number, re.VERBOSE)

    # if re.search object is not empty, then it is a valid phone number, otherwise...it's not a valid number
    if search_result != None:

        return print(f'[{phone_number}].....VALID phone number.')

    else: return print(f'[{phone_number}].....NOT VALID phone number.')


# note to self:
# I'm not really sure why most/all of these "chuncks are optional"
# if all chuncks are optional then what is the purpose of expressing the syntax for phone numbers/needed results?

In [73]:
# checking the function with loop through numbers

for number in numbers:
    
    is_phone_number(number) # checks out!

[(210) 867 5309].....VALID phone number.
[+1 210.867.5309].....VALID phone number.
[867-5309].....VALID phone number.
[210-867-5309].....VALID phone number.
[678!09].....NOT VALID phone number.
[+19, (222)].....NOT VALID phone number.


----

### ``Exercise Number 4: Use regular expressions to convert the dates below to the standardized year-month-day format.``

**<u>Sample Text:</u>**

* 02/04/19
* 02/05/19
* 02/06/19
* 02/07/19
* 02/08/19
* 02/09/19
* 02/10/19

In [74]:
# i want the function to take in a date
# function should transform/normalize the input date
# output date should be in the form of: year-month-day or YYYY-MM-DD

# in looking at the sample text i notice a few things:
# 1. string/date is the format of MM-DD-YY
# 2. I may need the datetime library.module to assist in the date cleaning process

from datetime import datetime

# let's create a list of sample dates to check 
date_lst = [ 
            '02/04/19',
            '02/05/19',
            '02/06/19',
            '02/07/19',
            '02/08/19',
            '02/09/19',
            '02/10/19',
            '2022-08-29' # would need to return to this line: function doesn't capture this correctly
            ]


# regex patterns to identify the type of date format
patterns = {

    '%m/%d/%y': r'^\s*\d{1,2}\/\d{1,2}\/\d{1,2}\s*$',
    '%Y/%m/%d': r'^\s*\d{4}\/\d{1,2}\/\d{1,2}\s*$',
    '%d/%m/%Y': r'^\s*\d{1,2}\/\d{1,2}\/\d{4}\s*$',
    '%Y-%m-%d': r'^\s*\d{4}-\d{1,2}-\d{1,2}\s*$',
    '%d-%m-%Y': r'^\s*\d{1,2}-\d{1,2}-\d{4}\s*$',
}


def normalize_dates(input_date):
    """
    Identify the format of input date and convert to datetime.
    Along with that track the status of conversion
    """
    try:

        for expected_format, pattern in patterns.items():

            if re.match(pattern, input_date):

                return 'CONVERTED', datetime.strptime(input_date, expected_format).date()

    except Exception as e:

        return 'EXCEPTION', datetime.strptime(input_date, '%Y-%m-%d').date()

    return 'NOT CONVERTED'

In [75]:
# ok, let's try the function returning the status of the date object passed and any conversion made
# expected outcome should be YYYY-MM-DD format

if __name__ == '__main__':
    
    for date in date_lst:

        status, date_obj = normalize_dates(date)
        
        print(f'Input Date: [{date}] [{status}] to [{date_obj}]')

Input Date: [02/04/19] [CONVERTED] to [2019-02-04]
Input Date: [02/05/19] [CONVERTED] to [2019-02-05]
Input Date: [02/06/19] [CONVERTED] to [2019-02-06]
Input Date: [02/07/19] [CONVERTED] to [2019-02-07]
Input Date: [02/08/19] [CONVERTED] to [2019-02-08]
Input Date: [02/09/19] [CONVERTED] to [2019-02-09]
Input Date: [02/10/19] [CONVERTED] to [2019-02-10]
Input Date: [2022-08-29] [CONVERTED] to [2022-08-29]


In [76]:
# alternative method borrowed from Codeup
# first, convert sample dates to a Pandas Series

dates = pd.Series(date_lst)
dates

0      02/04/19
1      02/05/19
2      02/06/19
3      02/07/19
4      02/08/19
5      02/09/19
6      02/10/19
7    2022-08-29
dtype: object

In [82]:
# next, use Pandas and regex to parse the dates and clean them

dates = dates.str.replace(r'(\d{2,4})/(\d{2})/(\d{2})', r'20\3-\1-\2', regex = True)
dates # all but one, checks out. return back to this for corrections

0    2019-02-04
1    2019-02-05
2    2019-02-06
3    2019-02-07
4    2019-02-08
5    2019-02-09
6    2019-02-10
7    2022-08-29
dtype: object

----
#### ``Exercise Number 5: Write a regex to extract the various parts of these logfile lines:``

* GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
* POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
* GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58


In [78]:
# let's convert the logfile lines into a string text

logs = [
'GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58', 
'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58', 
'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58']

In [79]:
# let's use the re.complile() function to help with parsing the log data

# what type of request is it? e.g., Get or Post?
# borrowing code block from Codeup

logfiles_re = r'''

^(?P<method>GET|POST)\s+    # checking the call type (optional)
    (?P<path>.*?)\s+    # checking the path/endpoint after the call type (optional)
        \[(?P<timestamp>.*?)\]\s+   # checking when the log/call was recorded (optional)
            (?P<http_version>.*?)\s+    # checking the http version for data transfer (optional)
                \{(?P<status>\d+)\}\s+  # checking the call status (optional)
                    (?P<bytes>\d+)\s+   # checking the file size in bytes (optional)
                        "(?P<user_agent>.*)"\s+     # checking user_agent or software/platform used to retrieve the data (optional)
                            (?P<ip>.*)$     # checking the IP address (optional)
'''

In [80]:
# let's now convert the logs into a pandas series

logfiles = pd.Series(logs)
logfiles

0    GET /api/v1/sales?page=86 [16/Apr/2019:193452+...
1    POST /users_accounts/file-upload [16/Apr/2019:...
2    GET /api/v1/items?page=3 [16/Apr/2019:193453+0...
dtype: object

In [81]:
# next, we'll use both pandas and regex to parse the logs data

logfiles = logfiles.str.extract(logfiles_re, re.VERBOSE)
logfiles # checks out!

Unnamed: 0,method,path,timestamp,http_version,status,bytes,user_agent,ip
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58
