The following code shows Python's text handling capabilities 
such as formatting and regular expressions. 

In [3]:
import string, textwrap, re, difflib
import inspect

In [3]:
values = {'Kenya':'Nairobi', 'Uganda':'Kampala', 'Tanzania':'Dodoma',
          'Ethiopia':'Addis Ababa','China':'Beijing','Egypt':'Cairo',
          'AncientEgypt':'Memphis','AncientChina':['Luoyang','Changan'],
          'Turkey':'Istanbul'}

someString = string.Template("""
The country of Kenya has the capital $Kenya while the 
country of Uganda has the capital $Uganda, while the capital of Ethiopia is $Ethiopia.
Egypt's capital is ${Egypt} but its ancient capital was ${AncientEgypt}.
China's administrative capital is ${China} but the twin ancient capitals were
${AncientChina} and ${AncientChina}.
The capital of Turkey is ${Turkey} while the capital of Greece is ${Greece}.
""")



print('TEMPLATE: ', someString.safe_substitute(values))

TEMPLATE:  
The country of Kenya has the capital Nairobi while the 
country of Uganda has the capital Kampala, while the capital of Ethiopia is Addis Ababa.
Egypt's capital is Cairo but its ancient capital was Memphis.
China's administrative capital is Beijing but the twin ancient capitals were
['Luoyang', 'Changan'] and ['Luoyang', 'Changan'].
The capital of Turkey is Istanbul while the capital of Greece is ${Greece}.



In [4]:
def is_str(value):
    return isinstance(value, str)
for name, value in inspect.getmembers(string, is_str):
    if (name.startswith('_')): #Uncomment this to see the difference:
        continue
    print(f'{name} ==> {value}')
    

ascii_letters ==> abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
ascii_lowercase ==> abcdefghijklmnopqrstuvwxyz
ascii_uppercase ==> ABCDEFGHIJKLMNOPQRSTUVWXYZ
digits ==> 0123456789
hexdigits ==> 0123456789abcdefABCDEF
octdigits ==> 01234567
printable ==> 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

punctuation ==> !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
whitespace ==>  	



In [5]:
try:
    print('TEMPLATE: ', someString.substitute(values))
except KeyError as err:
    print("Error: ", str(err))

Error:  'Greece'


In [6]:
def should_indent(line):
    return (len(line.strip()) % 2 == 0)

In [7]:
print("TEMPLATE: ==> ", textwrap.fill(someString.safe_substitute(values), width=50))
#Text will be left-justified
print()
print()
dedented_text = textwrap.dedent(someString.safe_substitute(values))
filled = textwrap.fill(dedented_text, width = 85)
final = textwrap.indent(filled, '>> ',predicate=should_indent)
print('Final Formatted text(with textwrap): ')
print(final)

TEMPLATE: ==>   The country of Kenya has the capital Nairobi
while the  country of Uganda has the capital
Kampala, while the capital of Ethiopia is Addis
Ababa. Egypt's capital is Cairo but its ancient
capital was Memphis. China's administrative
capital is Beijing but the twin ancient capitals
were ['Luoyang', 'Changan'] and ['Luoyang',
'Changan']. The capital of Turkey is Istanbul
while the capital of Greece is ${Greece}.


Final Formatted text(with textwrap): 
 The country of Kenya has the capital Nairobi while the  country of Uganda has the
capital Kampala, while the capital of Ethiopia is Addis Ababa. Egypt's capital is
>> Cairo but its ancient capital was Memphis. China's administrative capital is Beijing
but the twin ancient capitals were ['Luoyang', 'Changan'] and ['Luoyang', 'Changan'].
The capital of Turkey is Istanbul while the capital of Greece is ${Greece}.


In [8]:
pattern = "None-existent"
pattern2 = "pattern"
text = "Looking for a pattern in this text! You won't find one."
text2 = "Looking for a pattern here. Well, you will definitely find a pattern."
match = re.search(pattern2 , text2) #This returns a Match Object
if (match == None):
    print("Found no pattern here!")
else:
    print(f"Found \"{match.re.pattern}\" in the text:\n\"{text}\"\nfrom index {match.start()} to index {match.end()}.")
    print(f"{text2[match.start() : match.end()]}")

print("\nNow using re.findall():")
for match in re.findall(pattern2, text2):
    if (match == None):
        print("Found no pattern here!")
    else:
        print(f"Found match: \"{match}\" in the text:\n\"{text}\"")

print("\nNow using re.finditer():")
for match in re.finditer(pattern2, text2):
    if (match == None):
        print("Found no pattern here!")
    else:
        print(f"Found match: \"{match.re.pattern}\" in the text:\n\"{text}\"")
        print(f"Indices : {match.start()} and {match.end()}")

Found "pattern" in the text:
"Looking for a pattern in this text! You won't find one."
from index 14 to index 21.
pattern

Now using re.findall():
Found match: "pattern" in the text:
"Looking for a pattern in this text! You won't find one."
Found match: "pattern" in the text:
"Looking for a pattern in this text! You won't find one."

Now using re.finditer():
Found match: "pattern" in the text:
"Looking for a pattern in this text! You won't find one."
Indices : 14 and 21
Found match: "pattern" in the text:
"Looking for a pattern in this text! You won't find one."
Indices : 61 and 68


In [9]:
alphanumeric_password = "IgnobleSurreptitiously1234^"
patterns_for_good_password = ['[a-z]+?','[A-Z]+?', r'\S', r'\d']
special_characters = [r'[*&@#$%^()!~+]{1,}']
patterns_for_good_password += special_characters
print(f'Pattern: {patterns_for_good_password}')
regexObjList = [re.compile(pattern) for pattern in patterns_for_good_password]
print(f'RegexObject: {regexObjList}')
for regex in regexObjList:
    for match in regex.finditer(alphanumeric_password):
        if (match == None):
            print("Password failed our password policy.\nReason: ")
            if (match.re.pattern == '[a-z]+'):
                print("lowercase letters needed in password.")
            elif (match.re.pattern) == '[A-Z]+':
                print("Uppercase Letters needed in password.")
            elif (match.re.pattern == r'\S'):
                print("You can't have whitespace in your password.")
            elif (match.re.pattern == r'\d'):
                print("Digits/Numerals/Numbers are required in the password.")
            else:
                print("No special characters present in your password.", end = " ")
                print("It is a requirement to have at least three special characters for a good password.")
        else:
            start = match.start()
            end = match.end()
            print(f"Match found: Pattern '{match.re.pattern}' found in text:\n'{alphanumeric_password}'", end = '\t')
            print(f" => `{alphanumeric_password[start : end]}`")

Pattern: ['[a-z]+?', '[A-Z]+?', '\\S', '\\d', '[*&@#$%^()!~+]{1,}']
RegexObject: [re.compile('[a-z]+?'), re.compile('[A-Z]+?'), re.compile('\\S'), re.compile('\\d'), re.compile('[*&@#$%^()!~+]{1,}')]
Match found: Pattern '[a-z]+?' found in text:
'IgnobleSurreptitiously1234^'	 => `g`
Match found: Pattern '[a-z]+?' found in text:
'IgnobleSurreptitiously1234^'	 => `n`
Match found: Pattern '[a-z]+?' found in text:
'IgnobleSurreptitiously1234^'	 => `o`
Match found: Pattern '[a-z]+?' found in text:
'IgnobleSurreptitiously1234^'	 => `b`
Match found: Pattern '[a-z]+?' found in text:
'IgnobleSurreptitiously1234^'	 => `l`
Match found: Pattern '[a-z]+?' found in text:
'IgnobleSurreptitiously1234^'	 => `e`
Match found: Pattern '[a-z]+?' found in text:
'IgnobleSurreptitiously1234^'	 => `u`
Match found: Pattern '[a-z]+?' found in text:
'IgnobleSurreptitiously1234^'	 => `r`
Match found: Pattern '[a-z]+?' found in text:
'IgnobleSurreptitiously1234^'	 => `r`
Match found: Pattern '[a-z]+?' found in text

In [10]:
somePattern = [('ajs', 'feywfeywfe kahskhak'), ('abba','abba stands for something')]
for pattern, desc in somePattern:
    print(f"{pattern} => {desc}")

ajs => feywfeywfe kahskhak
abba => abba stands for something


In [11]:
text = "abbaabbba"
patterns = ['a+b*?', 'a{1,}b+?']
regexes = [re.compile(pattern) for pattern in patterns]
for regex in regexes:
    print(f"\nSeeking the pattern ({regex.pattern}) in our text: {text}\n")
    for match in regex.finditer(text):
        if match == None:
            print("No match found here.")
        else:
            s = match.start()
            e = match.end()
            print(f'Match found: "{match.re.pattern}"\nin "{match.string}"\nfrom index {s} to index {e} => {text[s:e]}')
        


Seeking the pattern (a+b*?) in our text: abbaabbba

Match found: "a+b*?"
in "abbaabbba"
from index 0 to index 1 => a
Match found: "a+b*?"
in "abbaabbba"
from index 3 to index 5 => aa
Match found: "a+b*?"
in "abbaabbba"
from index 8 to index 9 => a

Seeking the pattern (a{1,}b+?) in our text: abbaabbba

Match found: "a{1,}b+?"
in "abbaabbba"
from index 0 to index 2 => ab
Match found: "a{1,}b+?"
in "abbaabbba"
from index 3 to index 6 => aab


In [12]:
html_text = "<em>Hello World</em>"
html_pattern = r'</*\w+>'
regexObj = re.compile(html_pattern)
for match in regexObj.finditer(html_text):
    if (match == None):
        print("No Match found!")
    else:
        s = match.start()
        e = match.end()
        print(f"Match found: Pattern '{match.re.pattern}' found in\n{match.string}")
        print(f"from index {s} to index {e}: => {html_text[s:e]}")
        

Match found: Pattern '</*\w+>' found in
<em>Hello World</em>
from index 0 to index 4: => <em>
Match found: Pattern '</*\w+>' found in
<em>Hello World</em>
from index 15 to index 20: => </em>


In [13]:
sampleText = 'This is some text -- with punctuation!'
pattern1 = r'\B\w+\B'
regex1 = re.compile(pattern)
for match in regex.finditer(sampleText):
    if (match == None):
        print(f"No match found for pattern")
        #print(f": {match.re.pattern}")
    else:
        s = match.start()
        e = match.end()
        print("Match found for pattern ", end = "")
        print(f"{match.re.pattern}. ==> {sampleText[s:e]}")

In [15]:
#********************************************#
m = re.match("None", sampleText)
print("Match: ", m)

s = re.search("This", sampleText)
print(f"Search:  {s};", end= " ")
print(f"Pattern == '{s.re.pattern}'")

fullmatch = re.fullmatch("This", sampleText)
print("Fullmatch: ", fullmatch)
#*******************************************#
pattern2 = '([A-Z]+[a-z]+)+'
regex2 = re.compile(pattern2)
print("Our text: ", sampleText)
for match in regex.finditer(sampleText):
    if (match == None):
        print("No match found for pattern : ", end="")
        print(match.re.pattern)
    else:
        print(f"Match found: {sampleText[match.start() : match.end()]}")
pattern3 = r'(\bs\w+)(\W+(\w+))+'
regex3 = re.compile(pattern3)
for match in regex3.finditer(sampleText):
    if (match == None):
        print("No match found.")
    else:
        print("  ", match.groups())

Our text:  This is some text -- with punctuation!
   ('some', ' punctuation', 'punctuation')


In [19]:
email_pattern = """
# An address is supposed to be as follows: <username>@<domain-name>.<top level domain>
# The username accepts alphanumeric values (ASCII/Unicode characters plus numerals)
# as well as special characters like period (.), hyphen (-) and others like
# !#$%&'*+/=?^_`{|}~
# The usename/local part should not have a period at the beginning or the end of the username
# and there should be no consecutive/adjacent periods.
# The domain name can have uppercase or lowercase Latin letters (though often lowercase)
# digits as well, and hyphen provided it isn't the first letter.
(?P<username> [\w\d!#$%&'*+/=?^_`{|}~.-])                    # Username
@                                                            # at symbol
(?P<domain>[\w\d-]+)                                         # The domain name
\.
(?P<top_level_domain>(com|edu|io|co\.ke|co\.uk|co\.za))      # The top level domain

""" 
regex = re.compile(pattern, re.VERBOSE)
candidates = [
    u'nicknjihia55@gmail.com',
    u'nicholasnjihian@protonmail.com',
    'nicholasnjihia89+35@jkuat.ac.ke',
    'sean+parker42@yahoo.co.za'
    'marcusaurelius.150ad.@gmail.co.uk'
]
for candidate in candidates:
    print("Candidate: ".format(candidate))
    for match in regex.search(candidate):
        if (match == None):
            print("None")
        else:
            print("Match name :  ", match.groupdict()['username'], end = "  ")
            print(match.groupdict()['domain'], end = " ")
            print(match.groupdict()['top_level_domain'], end=" ")
            print(f". Actual text: {candidate[match.start() : match.end()]}")
            

Candidate: 


TypeError: 'NoneType' object is not iterable

In [23]:
sampleText2 = "<html> paragraph </html>"
regex = re.compile(r'<(?P<closing_tag>/*)(?P<tag>[a-z]+)>')
print(regex.sub(r'<\g<closing_tag>p>', sampleText2))

<p> paragraph </p>


In [25]:
text = """
Paragraph One

Paragraph Two

Paragraph Three


Paragraph Four



Paragraph Five

Paragraph Six
"""
for num, paragraph in enumerate(re.split(r'\n{2,}', text)):
    print(num, repr(paragraph))

0 '\nParagraph One'
1 'Paragraph Two'
2 'Paragraph Three'
3 'Paragraph Four'
4 'Paragraph Five'
5 'Paragraph Six\n'


In [23]:
A = "abcdefg"
B = "abcdef  abcdefgh"
s1 = difflib.SequenceMatcher(None, A, B)
match1 = s1.find_longest_match(0, len(A), 0, len(B))
print("Match A == ", match1.a)
print("Match B", match1.b)
print("Size == ".format(match1.size))
i, j, k = match1
print(f"i == {i}; j == {j} and k == {k}")
print(' A[a:a + size] = {!r}'.format(A[i:i + k]))
print(' B[b:b + size] = {!r}'.format(B[j:j + k]))

Match A ==  0
Match B 8
Size == 
i == 0; j == 8 and k == 7
 A[a:a + size] = 'abcdefg'
 B[b:b + size] = 'abcdefg'
