In [1]:
import re

## (1) Regex in customer support

In [3]:
# Sample customer-support messages; each one mentions an order number using a different phrasing.
text_chat1='''
Hello I am having an issue with my order # 493792739292
I have a problem with order number 3983029809328
My order 412889912 is having an issue, I was charged 300$ when online it says 280$
'''

In [4]:
# Skip the non-digit text that follows "order" (" # ", " number ", a space, ...)
# and capture the digits after it.
# - Raw string: "\d" must reach the regex engine; in a normal string literal it is
#   an invalid string escape.
# - "\d+" instead of "[\d]*": an "order" that is never followed by a number no
#   longer produces an empty-string result.
pattern_chat1 = r'order[^\d]+(\d+)'
re.findall(pattern_chat1, text_chat1)

['493792739292', '3983029809328', '412889912']

In [9]:
# Sample chat replies; each contains a phone number (plain 10 digits or "(123)-567-8912" form) and an email address.
chat1 = 'codebasics: you ask lot of questions 😠  1235678912, abcA@Gmail.com'
chat2 = 'codebasics: here it is: (123)-567-8912, abcX_82@yahoo.co.in'
chat3 = 'codebasics: yes, phone: 1235678912 email: abc@Facebook.com'

In [13]:
# Phone number: either "(123)-567-8912" or exactly ten consecutive digits.
# - Raw string: "\(" and "\d" are regex escapes, not valid string escapes.
# - The lookarounds keep the ten-digit form from matching inside a longer run of
#   digits (for example a 12-digit order number).
pattern_phone = r'\(\d{3}\)-\d{3}-\d{4}|(?<!\d)\d{10}(?!\d)'
re.findall(pattern_phone, chat1)

['1235678912']

In [15]:
# Same phone pattern against chat2 (parenthesised format).
re.findall(pattern_phone, chat2)

['(123)-567-8912']

In [16]:
# Same phone pattern against chat3 (plain 10-digit format).
re.findall(pattern_phone, chat3)

['1235678912']

In [17]:
# Email address: local part, "@", then two or more dot-separated domain labels.
# - The dot is escaped, so it only matches a literal "." between labels.
# - Domain labels are limited to letters, digits and hyphens, so a match cannot
#   run through spaces or swallow trailing punctuation such as "," or ".".
# - "+" quantifiers require a non-empty local part (a bare "@name" is rejected).
pattern_email = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+'

In [18]:
# Email pattern against chat1.
re.findall(pattern_email, chat1)

['abcA@Gmail.com']

In [19]:
# Email pattern against chat2 (multi-part domain).
re.findall(pattern_email, chat2)

['abcX_82@yahoo.co.in']

In [20]:
# Email pattern against chat3.
re.findall(pattern_email, chat3)

['abc@Facebook.com']

## (2) Regex for Information Extraction

In [21]:
# Infobox-style biography text; the following cells pull the name, age, birth date and birth place out of it.
text='''
Born	Elon Reeve Musk
June 28, 1971 (age 50)
Pretoria, Transvaal, South Africa
Citizenship	
South Africa (1971–present)
Canada (1971–present)
United States (2002–present)
Education	University of Pennsylvania (BS, BA)
Title	
Founder, CEO and Chief Engineer of SpaceX
CEO and product architect of Tesla, Inc.
Founder of The Boring Company and X.com (now part of PayPal)
Co-founder of Neuralink, OpenAI, and Zip2
Spouse(s)	
Justine Wilson
​
​(m. 2000; div. 2008)​
Talulah Riley
​
​(m. 2010; div. 2012)​
​
​(m. 2013; div. 2016)
'''

In [22]:
# Digits that follow "age " (as in "(age 50)").
# Raw string: "\d" is a regex escape, not a valid string escape.
pattern_age = r'age (\d+)'
re.findall(pattern_age, text)

['50']

In [25]:
# The name is everything after "Born" up to the end of that line;
# strip() removes the whitespace that separates the label from the value.
pattern_name = 'Born([^\n]+)'
matches = re.findall(pattern_name, text)
matches[0].strip()

'Elon Reeve Musk'

In [28]:
# Date of birth: the text on the line after the "Born ..." line, up to "(age".
# Raw string: "\(" is a regex escape for a literal parenthesis, not a valid
# string escape.
pattern_dob = r'Born.*\n(.*)\(age'
re.findall(pattern_dob, text)

['June 28, 1971 ']

In [30]:
# Place of birth: the full line that comes right after the line containing "age".
pattern_birth_place = 'age.*\n(.*)'
re.findall(pattern_birth_place, text)

['Pretoria, Transvaal, South Africa']

In [35]:
def get_pattern_match(pattern, text):
    """Return the first ``re.findall`` result for ``pattern`` in ``text``.

    Returns ``None`` when the pattern does not match anywhere in ``text``.
    """
    found = re.findall(pattern, text)
    return found[0] if found else None

In [38]:
# Name: the rest of the "Born" line, with surrounding whitespace stripped.
get_pattern_match(r'Born(.*)\n', text).strip()

'Elon Reeve Musk'

In [37]:
# Age: the digits after "age " (returned as a string).
get_pattern_match(r'age (\d+)', text)

'50'

In [39]:
# Birth date: the line after "Born ...", up to "(age"; strip() drops the space left before the parenthesis.
get_pattern_match(r'Born.*\n(.*)\(age', text).strip()

'June 28, 1971'

In [40]:
# Birth place: the line that follows the "(age ...)" line.
get_pattern_match(r'\(age.*\n(.*)', text)

'Pretoria, Transvaal, South Africa'

In [41]:
def extract_personal_information(text):
    """Extract age, name, birth date and birth place from infobox-style text.

    The patterns expect a line starting with "Born" followed by the name, then
    a line of the form "<date> (age NN)", then the birth place on the next line.

    Returns a dict with keys 'age' (int), 'name', 'birth_date' and
    'birth_place' (whitespace-stripped strings). A field whose pattern does
    not match is returned as None instead of raising.
    """
    # Raw strings: "\d" and "\(" are regex escapes, not valid string escapes.
    age = get_pattern_match(r'age (\d+)', text)
    full_name = get_pattern_match(r'Born(.*)\n', text)
    birth_date = get_pattern_match(r'Born.*\n(.*)\(age', text)
    birth_place = get_pattern_match(r'\(age.*\n(.*)', text)

    def _clean(value):
        # get_pattern_match yields None when nothing matched; pass that through.
        return value.strip() if value is not None else None

    return {
        'age': int(age) if age is not None else None,
        'name': _clean(full_name),
        'birth_date': _clean(birth_date),
        'birth_place': _clean(birth_place)
    }

In [42]:
# All four fields in one call, using the current value of `text`.
extract_personal_information(text)

{'age': 50,
 'name': 'Elon Reeve Musk',
 'birth_date': 'June 28, 1971',
 'birth_place': 'Pretoria, Transvaal, South Africa'}

In [43]:
# Second infobox-style sample (day-month-year date, birth place continued on a second line) to try the extractor on different input.
text = '''
Born	Mukesh Dhirubhai Ambani
19 April 1957 (age 64)
Aden, Colony of Aden
(present-day Yemen)[1][2]
Nationality	Indian
Alma mater	
St. Xavier's College, Mumbai
Institute of Chemical Technology (B.E.)
Stanford University (drop-out)
Occupation	Chairman and MD, Reliance Industries
Spouse(s)	Nita Ambani ​(m. 1985)​[3]
Children	3
Parent(s)	
Dhirubhai Ambani (father)
Kokilaben Ambani (mother)
Relatives	Anil Ambani (brother)
Tina Ambani (sister-in-law)
'''

In [44]:
# Same extractor on the newly assigned `text`; only the first line after "(age ...)" is taken as the birth place.
extract_personal_information(text)

{'age': 64,
 'name': 'Mukesh Dhirubhai Ambani',
 'birth_date': '19 April 1957',
 'birth_place': 'Aden, Colony of Aden'}

### 1. Extract all Twitter handles from the following text. A Twitter handle is the text that appears after https://twitter.com/ and is a single word. It contains only alphanumeric characters, i.e. A-Z, a-z, 0 to 9, and underscore _

In [45]:
text = '''
Follow our leader Elon musk on twitter here: https://twitter.com/elonmusk, more information 
on Tesla's products can be found at https://www.tesla.com/. Also here are leading influencers 
for tesla related news,
https://twitter.com/teslarati
https://twitter.com/dummy_tesla
https://twitter.com/dummy_2_tesla
'''
# Handle = the run of letters, digits and underscores right after "https://twitter.com/".
# - Raw string without "\/": "/" needs no escaping in Python regexes, and "\/" in a
#   normal string literal is an invalid string escape.
# - "\." matches only a literal dot (a bare "." would accept any character).
# - "+" requires at least one character, so a bare "https://twitter.com/" does not
#   yield an empty handle.
pattern = r'https://twitter\.com/([a-zA-Z0-9_]+)'

re.findall(pattern, text)

['elonmusk', 'teslarati', 'dummy_tesla', 'dummy_2_tesla']

### 2. Extract Concentration Risk Types. A risk type is the text that appears after "Concentration of Risk:". In the example below, your regex should extract these two strings

(1) Credit Risk

(2) Supply Risk

In [46]:
# Excerpt with two "Concentration of Risk: <type>" headings, each followed by explanatory paragraphs.
text = '''
Concentration of Risk: Credit Risk
Financial instruments that potentially subject us to a concentration of credit risk consist of cash, cash equivalents, marketable securities,
restricted cash, accounts receivable, convertible note hedges, and interest rate swaps. Our cash balances are primarily invested in money market funds
or on deposit at high credit quality financial institutions in the U.S. These deposits are typically in excess of insured limits. As of September 30, 2021
and December 31, 2020, no entity represented 10% or more of our total accounts receivable balance. The risk of concentration for our convertible note
hedges and interest rate swaps is mitigated by transacting with several highly-rated multinational banks.
Concentration of Risk: Supply Risk
We are dependent on our suppliers, including single source suppliers, and the inability of these suppliers to deliver necessary components of our
products in a timely manner at prices, quality levels and volumes acceptable to us, or our inability to efficiently manage these components from these
suppliers, could have a material adverse effect on our business, prospects, financial condition and operating results.
'''
pattern = 'Concentration of Risk: ([^\n]*)' # capture the rest of the heading line after the "Concentration of Risk: " label

re.findall(pattern, text)

['Credit Risk', 'Supply Risk']

### 3. Companies in Europe report their financial numbers on a semi-annual basis, so you can have a document like this. To extract quarterly and semi-annual periods you can use a regex as shown below

Hint: you need to use (?:) here to match everything enclosed

In [47]:
text = '''
Tesla's gross cost of operating lease vehicles in FY2021 Q1 was $4.85 billion.
BMW's gross cost of operating vehicles in FY2021 S1 was $8 billion.
'''

# Capture "<year> <period>" after "FY", where the period is a quarter (Q1-Q4)
# or a half-year (S1-S2).
# - "(?:...)" groups the two alternatives without adding a second capture group,
#   so findall still returns one string per match.
# - The previous "[Q|S]" was a character class that also accepted a literal "|".
# - Raw string: "\d" is a regex escape, not a valid string escape.
pattern = r'FY(\d+ (?:Q[1-4]|S[12]))'
matches = re.findall(pattern, text)
matches

['2021 Q1', '2021 S1']