<a href="https://colab.research.google.com/github/olcaykursun/Formal-Languages-and-Automata-Theory/blob/main/Spring23_important_regex_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Intro to regular expressions examples, CSCI6200, Dr. Kursun, Spring 2023, AUM Computer Science

#Some resources:
#https://youtu.be/K8L6KVGG-7o  Nice youtube video for starters 
#https://docs.python.org/3/howto/regex.html  
#https://web.mit.edu/hackl/www/lab/turkshop/slides/regex-cheatsheet.pdf
#https://developers.google.com/edu/python/regular-expressions
#https://github.com/CoreyMSchafer/code_snippets/tree/master/Python-Regular-Expressions


In [2]:
import re

# Target language: L = {aba, abb, abc, lambda (the empty string), ddd}.
# The empty alternative between "abc|" and "|ddd" is what admits the empty string.
pattern = re.compile(r'^(aba|abb|abc||ddd)$', re.MULTILINE)

# Strings that must be accepted, one per line (the blank line is the empty string).
members = '''aba
abb
abc

ddd'''

# Strings that must be rejected, one per line.
nonmembers = '''abcd
ac
aaaa
a
b
ca
aab'''

# Scan all candidate lines in one pass; re.MULTILINE lets ^ and $ anchor on every line.
corpus = '\n'.join([members, nonmembers])
matches = {found.group(0) for found in pattern.finditer(corpus)}

# The expression is correct only if the matched strings are exactly the members.
expected = set(members.split('\n'))
print(matches == expected)
print(matches)

True
{'', 'ddd', 'abc', 'aba', 'abb'}


In [3]:
import re

# Target language: strings over {a,b,c} that end with aaa and whose length
# is a multiple of 3 (any number of 3-symbol blocks followed by "aaa").
pattern = re.compile(r'^([abc]{3})*aaa$', re.MULTILINE)

# Strings that must be accepted, one per line.
members = '''abbcbcaaa
abaaaa
cccbbbaaa
cbcaaaaaa
aaa
ccbaaa
cccaaa'''

# Strings that must be rejected, one per line (the blank line is the empty string).
nonmembers = '''abc
ac
aaaa
a

cccbbbbbba
aaaab'''

# Scan all candidate lines in one pass; re.MULTILINE lets ^ and $ anchor on every line.
corpus = '\n'.join([members, nonmembers])
matches = {found.group(0) for found in pattern.finditer(corpus)}

# The expression is correct only if the matched strings are exactly the members.
expected = set(members.split('\n'))
print(matches == expected)
print(matches)

True
{'abbcbcaaa', 'cccbbbaaa', 'ccbaaa', 'cbcaaaaaa', 'aaa', 'abaaaa', 'cccaaa'}


In [4]:
import re

# Target language: all strings over the alphabet {a,b,c} whose first three
# symbols (prefix) equal the last three symbols (suffix) and whose length is
# a multiple of 3.
# Group 1 captures the prefix and \1 requires the same three symbols at the end;
# the trailing "?" also lets a bare 3-symbol string through, since it is its own
# prefix and suffix.
pattern = re.compile(r'^([abc]{3})(([abc]{3})*\1)?$', re.MULTILINE)

# Strings that must be accepted, one per line.
members = '''abbcbcabb
abaaba
cccbbbaaaccc
cbcaaaaaacbc
aaa
aba
abc
ccbccb
cccccc'''

# Strings that must be rejected, one per line (the blank line is the empty string).
nonmembers = '''abcc
ac
aaaa
a

cccbbbbbba
aaaab'''

# Scan all candidate lines in one pass; re.MULTILINE lets ^ and $ anchor on every line.
corpus = '\n'.join([members, nonmembers])
matches = {found.group(0) for found in pattern.finditer(corpus)}

# The expression is correct only if the matched strings are exactly the members.
expected = set(members.split('\n'))
print(matches == expected)
print(matches)

True
{'abaaba', 'ccbccb', 'cccbbbaaaccc', 'abc', 'aba', 'cbcaaaaaacbc', 'cccccc', 'abbcbcabb', 'aaa'}


In [5]:
import re

# Strings that must be accepted, one per line.
members = '''abbcbcabb
abaaba
cccbbbaaaccc
cbcaaaaaacbc
aaa
zzz
123aba123
___
ccbccb
___aaa___bcb___
cccccc'''

# Strings that must be rejected, one per line (the blank line is the empty string).
nonmembers = '''abcc
ac
aaaa
a

cccbbbbbba
aaaab'''

# All strings over the alphabet a-z, A-Z, 0-9, and underscore whose
# first three symbols (prefix) match the last three symbols (suffix)
# and whose length is a multiple of 3.
#
# re.ASCII restricts \w to exactly [a-zA-Z0-9_]. Without it, \w in a Python 3
# str pattern also matches non-ASCII word characters (e.g. accented letters),
# which are outside the alphabet stated above.
#
# The parentheses in (\w{3}) create "group 1", and \1 then forces the string
# to end with the same three symbols.
# The "?" makes the "(middle blocks + repeated prefix)" part optional: a string
# such as "abc" starts with "abc" and ends with "abc", so every length-3 string
# must be accepted by default, and "?" takes care of that.
pattern = re.compile(r'^(\w{3})((\w{3})*\1)?$', re.MULTILINE | re.ASCII)

# Let us check that only the member strings are matched by the expression.
# With re.MULTILINE, ^ and $ anchor on every line of the combined text.
matches = set([x.group(0) for x in pattern.finditer(members+'\n'+nonmembers)])
print(matches == set(members.split('\n')))
print(matches)


True
{'abaaba', 'ccbccb', '___', 'cccbbbaaaccc', '___aaa___bcb___', '123aba123', 'cbcaaaaaacbc', 'cccccc', 'abbcbcabb', 'aaa', 'zzz'}


In [6]:
#alphabet of {a,b,c} and it ends with aaa and the length is NOT a multiple of 3

import re

# Target language: strings over {a,b,c} that end with aaa and whose length
# is NOT a multiple of 3.
# Idea: any number of 3-symbol blocks, then one or two extra symbols, then "aaa".
#
# Each of the following two patterns works just as well:
#pattern = re.compile(r'^([abc]{3})*[abc]{1,2}aaa$', re.MULTILINE)
#pattern = re.compile(r'^([abc]{3})*(a|b|c|aa|ab|ac|ba|bb|bc|ca|cb|cc)aaa$', re.MULTILINE)
pattern = re.compile(r'^([abc]{3})*([abc]|[abc]{2})aaa$', re.MULTILINE)

# Strings that must be accepted, one per line.
members = '''abbcbcbaaa
abaccaaa
cccbbbcbaaa
cbcaaaaaaa
aaaaa
aaaa
caaa
cccbaaa'''

# Strings that must be rejected, one per line (the blank line is the empty string).
nonmembers = '''abc
ac
aaa
a

cccaaa
aaaaaa
daaa'''

# Scan all candidate lines in one pass; re.MULTILINE lets ^ and $ anchor on every line.
corpus = '\n'.join([members, nonmembers])
matches = {found.group(0) for found in pattern.finditer(corpus)}

# The expression is correct only if the matched strings are exactly the members.
expected = set(members.split('\n'))
print(matches == expected)
print(matches)


True
{'abaccaaa', 'cccbaaa', 'aaaaa', 'cccbbbcbaaa', 'aaaa', 'abbcbcbaaa', 'cbcaaaaaaa', 'caaa'}


In [7]:
#alphabet of {a,b,c} and it ends with aaa and the length is even

import re

# Target language: strings over {a,b,c} that end with aaa and have even length.
# "aaa" contributes 3 symbols, so the part before it must have odd length:
# any number of 2-symbol blocks plus exactly one more symbol.
pattern = re.compile(r'^([abc]{2})*[abc]aaa$', re.MULTILINE)

# Strings that must be accepted, one per line.
members = '''bbcbcaaa
abaaaa
ccbbbaaa
cbcaaaaa
aaaa
aaaaaa
ccbaaa
cccaaa'''

# Strings that must be rejected, one per line (the blank line is the empty string).
nonmembers = '''abc
ac
aaaaa
a

cccbbbbbba
aaaab'''

# Scan all candidate lines in one pass; re.MULTILINE lets ^ and $ anchor on every line.
corpus = '\n'.join([members, nonmembers])
matches = {found.group(0) for found in pattern.finditer(corpus)}

# The expression is correct only if the matched strings are exactly the members.
expected = set(members.split('\n'))
print(matches == expected)
print(matches)


True
{'bbcbcaaa', 'ccbbbaaa', 'ccbaaa', 'cbcaaaaa', 'aaaaaa', 'abaaaa', 'aaaa', 'cccaaa'}


In [8]:
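#Although {ww | w is an element of {a,b}*} is not a regular language, Python's regex engine can still recognize it,
#because backreferences such as \1 go beyond what formal regular expressions can express.
#We are implicitly accepting the fact that the regex engine will simply crash, in a sense, if we run out of memory.
#So, in a way, the length of w is bounded by a large number for the regex below.

#The regex below uses \w* for w, so the alphabet here is every word character (which includes a, b and c):
#members are exactly the strings of the form ww (the empty string included); everything else is a nonmember.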

import re

# (\w*) captures a candidate first half and \1 demands the very same text again,
# so a line matches only when it is some string of word characters written twice.
pattern = re.compile(r'^(\w*)\1$', re.MULTILINE)

# Strings that must be accepted, one per line (the blank line is the empty string).
members = '''abab
acbacb
cbaacbaa

aaaa
aaaaaa'''

# Strings that must be rejected, one per line.
nonmembers = '''a
bbb
aaaaa
aaabb
baaac
bc
bba
cccb'''

# Scan all candidate lines in one pass; re.MULTILINE lets ^ and $ anchor on every line.
corpus = '\n'.join([members, nonmembers])
matches = {found.group(0) for found in pattern.finditer(corpus)}

# The expression is correct only if the matched strings are exactly the members.
expected = set(members.split('\n'))
print(matches == expected)
print(matches)


True
{'', 'acbacb', 'cbaacbaa', 'aaaaaa', 'aaaa', 'abab'}
