# REGEX - Regular expressions



In [1]:
import re
import pandas as pd
import time

In [2]:
text = "The quick brown fox jumps over the lazy dog. The dog barks."

In [3]:
# 1. re.findall(): collect every word that contains the letter "o".
# \b marks a word boundary and \w* allows any run of word characters on
# either side of the "o", so the whole surrounding word is returned.
words_with_o = re.compile(r'\b\w*o\w*\b').findall(text)
print(words_with_o)

['brown', 'fox', 'over', 'dog', 'dog']


In [5]:
# 2. re.search(): locate the first standalone occurrence of "dog".
# re.search() returns a Match object on success and None otherwise.
match = re.search(r'\bdog\b', text)
if match is None:
    print("No match found.")
else:
    # Match.start() is the index of the first matched character.
    print("Found 'dog' at position:", match.start())

Found 'dog' at position: 40


In [6]:
match

<re.Match object; span=(40, 43), match='dog'>

In [7]:
# 3. re.split(): break the text apart at every literal full stop.
# The separators are dropped but nothing else is trimmed, so the pieces keep
# their leading spaces and the final full stop yields a trailing empty string.
sentences = re.split(pattern=r'\.', string=text)
print(sentences)

['The quick brown fox jumps over the lazy dog', ' The dog barks', '']


In [8]:
# 4. re.sub(): swap every standalone word "dog" for "cat".
# The \b anchors keep the replacement from touching longer words that merely
# contain "dog".
new_text = re.compile(r'\bdog\b').sub('cat', text)
print(new_text)

The quick brown fox jumps over the lazy cat. The cat barks.


### Validating Indian Mobile Numbers:

In [10]:
# Pattern for an Indian mobile number written as "+91" followed by exactly
# 10 digits, the first of which must be 7, 8 or 9.
# NOTE(review): in Python, `$` also matches just before a single trailing
# newline, so "+917890123456\n" would be accepted by re.match(); use
# re.fullmatch() or \Z if the input may carry a newline.
# NOTE(review): number series starting with 6 may also be valid today --
# confirm against the current numbering plan before relying on [789].
indian_mobile_pattern = r'^\+91[789]\d{9}$'

In [11]:
# Candidate numbers: three well-formed ones followed by three that should fail.
numbers = [
    "+917890123456",
    "+917654321098",
    "+919876543210",
    "+911234567890",  # Incorrect, starts with +91 but the next digit is not 7, 8, or 9
    "+9176543210",    # Incorrect, does not have 10 digits after +91
    "+9187654321090"  # Incorrect, has more than 10 digits after +91
]

# Report the verdict for each candidate; re.match() returns None on failure.
for number in numbers:
    if re.match(indian_mobile_pattern, number) is None:
        print(f"{number} is NOT a valid Indian mobile number.")
        continue
    print(f"{number} is a valid Indian mobile number.")


+917890123456 is a valid Indian mobile number.
+917654321098 is a valid Indian mobile number.
+919876543210 is a valid Indian mobile number.
+911234567890 is NOT a valid Indian mobile number.
+9176543210 is NOT a valid Indian mobile number.
+9187654321090 is NOT a valid Indian mobile number.


Breaking Down the REGEX



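1. Start of String (^): The ^ anchors the match to the beginning of the string, so the phone number must start with +91.
2. End of String (\$): The \$ anchors the match to the end of the string, so the phone number must end after exactly 10 digits following +91.
3. \+91: Matches the literal "+91" (country code); the backslash escapes the + so that it is not treated as a quantifier.
4. [789]: Matches a single digit 7, 8, or 9 (the first digit of the mobile number).
5. \d{9}: Matches exactly 9 more digits, giving 10 digits in total after +91.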

The loop above checks whether each phone number in the list matches this pattern and prints the verdict for each one.

### Finding Domain from Website URL

In [12]:
# Function to extract domain from a given row in the DataFrame
def find_domain(row):
    """Return the first domain name found in ``row['Description']``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Description'`` (for example a DataFrame row
        passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str or None
        The domain without the optional scheme and ``www.`` prefix, or
        ``None`` when the description is not a string (e.g. a missing value)
        or contains nothing that looks like a domain.
    """
    # Group 1: optional scheme, group 2: optional "www.", group 3: the domain.
    # NOTE: at most two dot-suffixes are captured ({1,2}), so a host such as
    # "blog.example.co.uk" comes back as "blog.example.co".
    pattern = r'(https?://)?(www\.)?([\w-]+(\.[a-z]{2,}){1,2})'

    description = row['Description']
    if not isinstance(description, str):
        # Missing values (None / NaN) cannot be searched; report "no domain".
        return None

    result = re.search(pattern, description)
    if result is None:
        # No URL-like text in this description.
        return None
    return result.group(3)


In [13]:
# Sample free-text descriptions, each embedding one URL to pull a domain from.
data = {
    'Description': [
        'Check out the latest news on https://www.bbc.com/news/world',
        'Read our blog at http://blog.example.co.uk for more insights.',
        'Visit our corporate site at https://www.corporate-site.com/about-us',
        'For tech articles, see https://medium.com/tech-stories/article-1',
        'Our portfolio is showcased at http://portfolio.mywork.net',
        'Contact us via our page https://www.contact-page.org/contact',
        'Latest updates available at https://updates.example.com',
        'Watch videos on https://www.youtube-videos.io',
        'Check out recipes on http://www.cooking-today.tv',
        'Educational resources available at https://www.learn-more.edu/resources',
    ],
}

# One column ('Description'), one row per sample text.
df = pd.DataFrame.from_dict(data)

In [16]:
df

Unnamed: 0,Description
0,Check out the latest news on https://www.bbc.c...
1,Read our blog at http://blog.example.co.uk for...
2,Visit our corporate site at https://www.corpor...
3,"For tech articles, see https://medium.com/tech..."
4,Our portfolio is showcased at http://portfolio...
5,Contact us via our page https://www.contact-pa...
6,Latest updates available at https://updates.ex...
7,Watch videos on https://www.youtube-videos.io
8,Check out recipes on http://www.cooking-today.tv
9,Educational resources available at https://www...


In [17]:
# Run find_domain once per row and store the result in a new 'Domain' column.
# find_domain indexes its argument with 'Description', so it needs the whole
# row (axis='columns') rather than the individual cell values.
df['Domain'] = df.apply(find_domain, axis='columns')

In [18]:
df

Unnamed: 0,Description,Domain
0,Check out the latest news on https://www.bbc.c...,bbc.com
1,Read our blog at http://blog.example.co.uk for...,blog.example.co
2,Visit our corporate site at https://www.corpor...,corporate-site.com
3,"For tech articles, see https://medium.com/tech...",medium.com
4,Our portfolio is showcased at http://portfolio...,portfolio.mywork.net
5,Contact us via our page https://www.contact-pa...,contact-page.org
6,Latest updates available at https://updates.ex...,updates.example.com
7,Watch videos on https://www.youtube-videos.io,youtube-videos.io
8,Check out recipes on http://www.cooking-today.tv,cooking-today.tv
9,Educational resources available at https://www...,learn-more.edu


In [10]:
# Function to create a DataFrame and extract domains
def func2():
    """Build the sample descriptions table, add a 'Domain' column and print it.

    NOTE: a later cell in this notebook defines another ``func2``; once that
    cell has run, it replaces this definition in the kernel namespace.
    """
    descriptions = [
        'Check out the latest news on https://www.bbc.com/news/world',
        'Read our blog at http://blog.example.co.uk for more insights.',
        'Visit our corporate site at https://www.corporate-site.com/about-us',
        'For tech articles, see https://medium.com/tech-stories/article-1',
        'Our portfolio is showcased at http://portfolio.mywork.net',
        'Contact us via our page https://www.contact-page.org/contact',
        'Latest updates available at https://updates.example.com',
        'Watch videos on https://www.youtube-videos.io',
        'Check out recipes on http://www.cooking-today.tv',
        'Educational resources available at https://www.learn-more.edu/resources',
    ]
    frame = pd.DataFrame({'Description': descriptions})

    # find_domain receives one row at a time and returns that row's domain.
    frame = frame.assign(Domain=frame.apply(find_domain, axis=1))

    print(frame)


In [11]:
func2()

                                         Description                Domain
0  Check out the latest news on https://www.bbc.c...               bbc.com
1  Read our blog at http://blog.example.co.uk for...       blog.example.co
2  Visit our corporate site at https://www.corpor...    corporate-site.com
3  For tech articles, see https://medium.com/tech...            medium.com
4  Our portfolio is showcased at http://portfolio...  portfolio.mywork.net
5  Contact us via our page https://www.contact-pa...      contact-page.org
6  Latest updates available at https://updates.ex...   updates.example.com
7      Watch videos on https://www.youtube-videos.io     youtube-videos.io
8   Check out recipes on http://www.cooking-today.tv      cooking-today.tv
9  Educational resources available at https://www...        learn-more.edu


Understanding the Pattern

1. (https?://)?: Optionally matches "http://" or "https://".
2. (www\.)?: Optionally matches "www.".
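3. ([\w-]+(\.[a-z]{2,}){1,2}): Matches the domain and its extensions: a run of word characters or hyphens, followed by one or two dot-separated lowercase suffixes of at least two letters each. Because at most two suffixes are captured, a host such as blog.example.co.uk is reported as blog.example.co.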

# Vectorization

In [21]:
import numpy as np

In [22]:
def add(a, b):
    """Return ``a + b`` for any two operands that support the ``+`` operator."""
    total = a + b
    return total

def func1():
    """Print a few examples of element-wise (vectorized) NumPy operations.

    Shows array addition, square root, exponential, and finally a plain
    Python function wrapped with ``np.vectorize``.
    """
    # Two equally sized input arrays
    a = np.array([10, 25, 29, 256])
    b = np.array([5, 6, 7, 8])

    # Element-wise sum of the two arrays (no Python-level loop involved);
    # broadcasting in numpy is also a form of vectorization
    addition = np.add(a, b)
    print(addition)

    # Square root of every element
    square_root = np.sqrt(a)
    print(f"Square Root: {square_root}")

    # e raised to every element
    exponential = np.exp(b)
    print(exponential)

    # np.vectorize wraps a function that is not numpy-aware (e.g. one that
    # takes floats and returns a float) so that it can be called on, and
    # return, numpy arrays. otypes=[float] fixes the output dtype.
    vectorized_add = np.vectorize(add, otypes=[float])
    print(vectorized_add(1, 8))


In [23]:
func1()

[ 15  31  36 264]
Square Root: [ 3.16227766  5.          5.38516481 16.        ]
[ 148.4131591   403.42879349 1096.63315843 2980.95798704]
9.0


In [14]:
def func2():
    """Print two examples of vectorized pandas operations.

    The first adds two numeric columns; the second counts the words in each
    sentence of a Series through the ``.str`` accessor.
    """
    # Small numeric table to operate on
    frame = pd.DataFrame({
        'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
        'C': [9, 10, 11, 12],
    })

    # Column arithmetic is handed to the arrays backing each Series, so the
    # addition itself runs without a Python-level loop over the rows.
    frame = frame.assign(Sum=frame['A'] + frame['B'])
    print(frame)

    # By contrast, a non-vectorized approach calls a Python function for
    # every row, paying the interpreter overhead again and again.

    texts = pd.Series([
        "This is a sample sentence.",
        "Pandas is great for data manipulation.",
        "Vectorization makes operations faster.",
        "I love working with Python and Pandas.",
    ])

    # The .str accessor on a Series exposes vectorized string operations:
    # split each sentence on whitespace, then take the length of each list.
    tokens = texts.str.split()
    counts = tokens.str.len()

    # Put the sentences next to their word counts for display
    summary = texts.to_frame(name='Sentence').assign(**{'Word Count': counts})

    print(summary)

In [24]:
def count_loop(X, target):
    """Count how often ``target`` occurs in ``X["numbers"]`` with a Python loop.

    Parameters
    ----------
    X : mapping-like
        Object indexable by ``"numbers"`` that yields an iterable of values
        (for example a DataFrame with a ``numbers`` column).
    target : object
        Value compared against every element with ``==``.

    Returns
    -------
    tuple
        ``(count, elapsed_ms)`` -- the number of matches and the time the
        counting took, in milliseconds.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot go negative or collapse to zero because of wall-clock
    # adjustments or a coarse system clock, as it can with time.time().
    start_time = time.perf_counter()
    count = sum(x == target for x in X["numbers"])
    end_time = time.perf_counter()
    return count, (end_time - start_time) * 1000

def count_vectorized(X, target):
    """Count how often ``target`` occurs in ``X["numbers"]`` with one vectorized comparison.

    Parameters
    ----------
    X : mapping-like
        Object indexable by ``"numbers"`` whose value supports element-wise
        ``==`` and ``.sum()`` (for example a pandas Series or NumPy array).
    target : object
        Value compared against every element.

    Returns
    -------
    tuple
        ``(count, elapsed_ms)`` -- the number of matches and the time the
        counting took, in milliseconds.
    """
    # perf_counter() is monotonic and high-resolution; the vectorized path is
    # fast enough that time.time() can report 0 ms on a coarse system clock.
    start_time = time.perf_counter()
    count = (X["numbers"] == target).sum()
    end_time = time.perf_counter()
    return count, (end_time - start_time) * 1000

def func3():
    """Compare a Python-loop count with a vectorized count on one million integers.

    count_loop() and count_vectorized() both count the occurrences of a
    particular integer in a DataFrame column. Running this function prints
    both counts and both execution times, which shows count_vectorized()
    finishing faster than count_loop() because of vectorization.
    """
    # One million random integers drawn from 1..9 (the upper bound 10 is exclusive)
    numbers = np.random.randint(1, 10, size=1000000)
    df = pd.DataFrame({'numbers': numbers})

    # Count with an explicit Python-level loop
    count_loop_result, loop_time = count_loop(df, 2)
    print("Occurrences using loop:", count_loop_result)
    print("Execution time (loop): {:.3f} milliseconds".format(loop_time))

    # Count with a single vectorized comparison
    count_vectorized_result, vectorized_time = count_vectorized(df, 2)
    print("Occurrences using vectorized operation:", count_vectorized_result)
    print("Execution time (vectorized): {:.3f} milliseconds".format(vectorized_time))

func3()

Occurrences using loop: 111665
Execution time (loop): 172.830 milliseconds
Occurrences using vectorized operation: 111665
Execution time (vectorized): 1.988 milliseconds


In [None]:
def func4():
    """Create a one-column table of nine student labels and print its first rows."""
    # Labels Student_1 .. Student_9
    roster = pd.DataFrame({'Student': [f'Student_{i}' for i in range(1, 10)]})

    # head() keeps only the first five rows
    first_rows = roster.head()
    print(first_rows)


func4()