#### Regex in NLP

In [1]:
import re

In [18]:
# Sample chat snippets used as regex inputs.
# chat1: phone numbers, both as 10 plain digits and as (ddd)-ddd-dddd.
chat1 = 'sdfksjfk 1234567899 1134567899 (113)-456-7899'
# chat2: e-mail addresses.
chat2 = 'abc@xyz.com abX_82@xyz.com'
# chat3: three different ways of mentioning an order number.
chat3 = 'order # 41243443 order number 41243443 order 41243443'

In [14]:
# Phone numbers: either 10 consecutive digits or the (ddd)-ddd-dddd form.
# Raw string so that \d, \( and \) reach the regex engine untouched instead of
# being treated as (invalid) Python string escapes.
pattern = r"\d{10}|\(\d{3}\)-\d{3}-\d{4}"
matches = re.findall(pattern, chat1)
matches

['1234567899', '1134567899', '(113)-456-7899']

In [21]:
# E-mail addresses: local part (letters, digits, underscore) @ domain . tld
# - [a-zA-Z] instead of [aA-zZ]: the range A-z also covers the ASCII
#   characters between 'Z' and 'a' ([ \ ] ^ _ `), which is not intended.
# - '+' instead of '*': every part must contain at least one character, so a
#   bare "@." is no longer reported as an address.
pattern = r"[a-zA-Z0-9_]+@[a-zA-Z0-9]+\.[a-zA-Z]+"
emails = re.findall(pattern, chat2)
emails

['abc@xyz.com', 'abX_82@xyz.com']

In [22]:
# Order numbers: the word "order", any run of non-digits, then the digits.
# \D is the same as [^\d]; (\d+) requires at least one digit so that an
# "order" mention without a number does not produce an empty-string result.
pattern = r"order\D*(\d+)"
order = re.findall(pattern, chat3)
order

['41243443', '41243443', '41243443']

In [33]:
# Sample biography text (one field per line, some labels separated from their
# value by a tab). The following cells extract the age, name, birth date and
# birthplace from it with regular expressions.
text = '''Pichai in 2020
Born	Pichai Sundararajan
June 10, 1972 (age 50)
Madurai, Tamil Nadu, India
Citizenship	United States[1]
Education	IIT Kharagpur (BTech)
Stanford University (MS)
University of Pennsylvania (MBA)
Title	CEO of Alphabet and Google
Board member of	
Alphabet Inc.[2]
Magic Leap (2014–2018)[3]
Spouse(s)	Anjali Pichai
Children	2
Awards	IND Padma Bhushan BAR.png Padma Bhushan
Signature'''

In [27]:
# Age: the digits that follow the literal text "age ".
# Raw string so \d is passed to the regex engine rather than being an
# invalid Python string escape.
pattern = r"age (\d+)"
age = re.findall(pattern, text)
age

['50', '50']

In [28]:
# Name: everything on the same line after the label "Born"
# ('.' does not match a newline, so the capture stops at the end of the line).
pattern = "Born(.*)"
born_regex = re.compile(pattern)
name = born_regex.findall(text)
name

['\tPichai Sundararajan']

In [30]:
# Birth date: on the line after "Born ...", everything before "(age".
# Raw string so \( is a regex escape rather than an invalid Python string
# escape; \n is then interpreted by the regex engine and still means newline.
pattern = r"Born.*\n(.*)\(age"
date = re.findall(pattern, text)
date[0].strip()

'June 10, 1972'

In [35]:
# Birthplace: the whole line that follows the line containing "age".
# Written as a raw string like every other regex; the regex engine itself
# interprets \n as a newline, so the pattern matches exactly as before.
pattern = r"age.*\n(.*)"
place = re.findall(pattern, text)
place

['Madurai, Tamil Nadu, India']

In [40]:
def get_pattern_match(pattern, text):
    """Return the first ``re.findall`` result for *pattern* in *text*.

    The value has whatever shape ``re.findall`` produces for the pattern
    (the whole match, the single captured group, or a tuple of groups).
    Returns ``None`` when the pattern does not match anywhere.
    """
    found = re.findall(pattern, text)
    return found[0] if found else None

def get_personal_info(text):
    """Extract name, age, birth date and birthplace from biography text.

    Expects the layout used in this notebook: a ``Born<name>`` line, followed
    by a ``<birthdate> (age NN)`` line, followed by a line with the place.

    Returns a dict with the keys ``"name"``, ``"age"``, ``"birthdate"`` and
    ``"place"``. String fields are stripped of surrounding whitespace and
    ``"age"`` is an ``int``. A field whose pattern is not found in *text* is
    reported as ``None`` instead of raising.
    """
    # Raw strings keep \d and \( as regex escapes rather than (invalid)
    # Python string escapes.
    age = get_pattern_match(r"age (\d+)", text)
    name = get_pattern_match(r"Born(.*)", text)
    birthdate = get_pattern_match(r"Born.*\n(.*)\(age", text)
    place = get_pattern_match(r"age.*\n(.*)", text)

    def _clean(value):
        # get_pattern_match returns None when nothing matched; pass it through
        # instead of failing on None.strip().
        return value.strip() if value is not None else None

    return {
        "name": _clean(name),
        "age": int(age) if age is not None else None,
        "birthdate": _clean(birthdate),
        "place": _clean(place),
    }

In [39]:
# Quick check of the helper: the line that follows the "age" line.
get_pattern_match(pattern="age.*\n(.*)", text=text)

'Madurai, Tamil Nadu, India'

In [41]:
# Run the full extraction on the sample biography text.
get_personal_info(text=text)

{'name': 'Pichai Sundararajan',
 'age': 50,
 'birthdate': 'June 10, 1972',
 'place': 'Madurai, Tamil Nadu, India'}

#### Find Twitter handles

In [43]:
text = '''
Follow our leader Elon musk on twitter here: https://twitter.com/elonmusk, more information 
on Tesla's products can be found at https://www.tesla.com/. Also here are leading influencers 
for tesla related news,
https://twitter.com/teslarati
https://twitter.com/dummy_tesla
https://twitter.com/dummy_2_tesla
'''
# Capture the handle that follows "https://twitter.com/".
# - The dot is escaped so it only matches a literal '.'.
# - [a-zA-Z0-9_] instead of [aA-zZ0-9_]: the range A-z also matches the ASCII
#   punctuation between 'Z' and 'a' ([ \ ] ^ `).
# - '+' instead of '*': a URL with no handle no longer yields an empty string.
pattern = r'https://twitter\.com/([a-zA-Z0-9_]+)'

twitter_handle = re.findall(pattern, text)
twitter_handle

['elonmusk', 'teslarati', 'dummy_tesla', 'dummy_2_tesla']

#### Extract types of concentration of risk

In [44]:
text = '''
Concentration of Risk: Credit Risk
Financial instruments that potentially subject us to a concentration of credit risk consist of cash, cash equivalents, marketable securities,
restricted cash, accounts receivable, convertible note hedges, and interest rate swaps. Our cash balances are primarily invested in money market funds
or on deposit at high credit quality financial institutions in the U.S. These deposits are typically in excess of insured limits. As of September 30, 2021
and December 31, 2020, no entity represented 10% or more of our total accounts receivable balance. The risk of concentration for our convertible note
hedges and interest rate swaps is mitigated by transacting with several highly-rated multinational banks.
Concentration of Risk: Supply Risk
We are dependent on our suppliers, including single source suppliers, and the inability of these suppliers to deliver necessary components of our
products in a timely manner at prices, quality levels and volumes acceptable to us, or our inability to efficiently manage these components from these
suppliers, could have a material adverse effect on our business, prospects, financial condition and operating results.
'''
# Capture the risk type that follows "Concentration of Risk: " up to the end
# of that heading line.
# - Only letters and spaces are allowed in the capture; the previous \s also
#   matched newlines, so a following line made up solely of letters and spaces
#   would have been swallowed into the risk type.
# - [a-zA-Z] instead of [aA-zZ], whose A-z range includes ASCII punctuation.
# - Raw string so \n is handed to the regex engine as written.
pattern = r'Concentration of Risk: ([a-zA-Z ]+)\n'

re.findall(pattern, text)

['Credit Risk', 'Supply Risk']

In [46]:
text = '''
Tesla's gross cost of operating lease vehicles in FY2021 Q1 was $4.85 billion.
BMW's gross cost of operating vehicles in FY2021 S1 was $8 billion.
'''

# Capture "<year> <period>" after the "FY" prefix, where the period is a
# quarter (Q1-Q4) or a semester (S1-S2). The generic [A-Z][0-9] form would
# also accept impossible periods such as "X9" or "Q7".
pattern = r'FY(\d{4}\s(?:Q[1-4]|S[12]))'
matches = re.findall(pattern, text)
matches

['2021 Q1', '2021 S1']