# Regex 2

In [None]:
#import statements
import re
from subprocess import check_output
import pandas as pd

In [None]:
# Example strings
# from DS100 book...
def reg(regex, text):
    """
    Print ``text`` with every match of ``regex`` highlighted.

    Each non-overlapping match is wrapped in the ANSI escape codes
    ``ESC[1;30;43m`` ... ``ESC[m`` and the resulting string is printed.

    Args:
        regex: the regular expression pattern to search for.
        text: the string to search in.

    Returns:
        None; the highlighted string is written to stdout.
    """
    # Highlight the whole match (group 0) instead of wrapping the pattern in
    # an extra capturing group: the wrapper renumbers the caller's groups, so
    # backreferences such as r"(a)\1" and leading inline flags such as
    # r"(?i)dag" would fail to compile.
    def highlight(match):
        return f"\033[1;30;43m{match.group(0)}\033[m"

    print(re.sub(regex, highlight, text))
# s1: three sentences joined by single spaces into one line.
# The backslash in the shrug emoticon is written as "\\" because "\_" is not a
# valid escape sequence in a normal (non-raw) string literal.
s1 = " ".join([
    "A DAG is a directed graph without cycles.",
    "A tree is a DAG where every node has one parent (except the root, which has none).",
    "To learn more, visit www.example.com or call 1-608-123-4567. :) ¯\\_(ツ)_/¯",
])
print(s1)

# s2: one candidate per line; the lines tagged "(not a phone number)" are the
# ones a phone-number regex should leave unmatched.
s2 = """1-608-123-4567
a-bcd-efg-hijg (not a phone number)
1-608-123-456 (not a phone number)
608-123-4567
123-4567
1-123-4567 (not a phone number)
"""
print(s2)

# s3: a single-line sentence containing several numbers.
s3 = "In CS 320, there are 10 quizzes, 7 projects, 39 lectures, and 1000 things to learn.  CS 320 is awesome!"
print(s3)

# s4: a similar sentence, but with irregular runs of spaces and an embedded newline.
s4 = """In CS 320,  there are 14 quizzes,    7 projects,
41 lectures, and 1000 things to learn.  CS 320 is awesome!"""
print(s4)

In [None]:
print(s1)

### Regex is case sensitive

### Character classes

- Character classes can be mentioned within `[...]`
- `^` means `NOT` of a character class
- `-` enables us to mention range of characters, for example `[A-Z]`
- `|` enables us to perform `OR`

### Metacharacters

- predefined character classes
    - `\d` => digits
    - `\s` => whitespace (space, tab, newline)
    - `\w` => "word" characters (digits, letters, underscores, etc) --- helpful for variable name matches and whole word matches (as it doesn't match whitespace --- `\s`)
    - `.` => wildcard: anything except newline
- the capitalized version of a character class means `NOT`, for example `\D` => everything except digits

### REPETITION

- `<character>{<num matches>}` - for example: `w{3}`
- matches cannot overlap

### Variable length repetition operators

- `*` => 0 or more (greedy: match as many characters as possible)
- `+` => 1 or more (greedy: match as many characters as possible)
- `?` => 0 or 1
- `*?` => 0 or more (non-greedy: match as few characters as possible)
- `+?` => 1 or more (non-greedy: match as few characters as possible)

#### Find everything inside of parentheses.

In [None]:
# this doesn't work
# it captures everything because () have special meaning (coming up)
reg(r"(.*)", s1)

In [None]:
# How can we change this to not use special meaning of ()?  Escape them: \( and \)
# * is greedy: match as many characters as possible
reg(r"\(.*\)", s1)

In [None]:
# non-greedy: stop at the first possible spot instead of the last possible spot
reg(r"\(.*?\)", s1)

### Anchor characters
- `^` => start of string
    - `^` is overloaded --- what was the other usage?
- `$` => end of string

#### Find everything in the first sentence.

In [None]:
# doesn't work because, remember, regex finds all possible matches,
# so it matches every single sentence
# (even though we are doing a non-greedy match)
reg(r".*?\.", s1)

In [None]:
# ^ anchors the match to the start of the string, so only the first sentence is found
reg(r"^.*?\.", s1)

#### Find everything in the first two sentences.

In [None]:
# one "sentence" is a non-greedy run up to a period; {2} repeats it twice from the start
reg(r"^(.*?\.){2}", s1)

#### Find last "word" in the sentence.

In [None]:
# "word" here = the last run of non-whitespace characters; $ anchors it to the end of the string
reg(r"\S+$", s1)

### Case study: find all phone numbers.

In [None]:
print(s2)
# The country code (1) in the front is optional
# The area code (608) is also optional
# Doesn't make sense to match country code without area code though!

In [None]:
# Full US phone numbers: country code, area code, then the 3-digit and 4-digit parts
reg(r"\d-\d{3}-\d{3}-\d{4}", s2)

In [None]:
# The country code (1) in the front is optional
reg(r"(\d-)?\d{3}-\d{3}-\d{4}", s2)

In [None]:
# The area code (608) is also optional
# Doesn't make sense to have country code without area code though!
# (this version still accepts "1-123-4567" because the two parts are independently optional)
reg(r"(\d-)?(\d{3}-)?\d{3}-\d{4}", s2)

In [None]:
# Nesting the country code inside the optional area-code group means
# a country code can only appear together with an area code.
# This is good enough for 320 quizzes/tests
# But clearly, the last match is not correct
reg(r"((\d-)?\d{3}-)?\d{3}-\d{4}", s2)

Regex documentation link: https://docs.python.org/3/library/re.html.

In [None]:
# BONUS: negative lookbehind (I won't test this)
reg(r"(?<!\d\-)((\d-)?\d{3}-)?\d{3}-\d{4}", s2)

There is also a negative lookahead. For example, it can be used to avoid matching "608-123-4569" in "608-123-4569999" (see the cell below). You can explore this if you are interested.

In [None]:
reg(r"(?<!\d\-)((\d-)?\d{3}-)?\d{3}-\d{4}", "608-123-4569999")

### Testing your regex
- you could use `reg(...)` function
- another useful resource: https://regex101.com/

### `re` module
- `re.findall(<PATTERN>, <SEARCH STRING>)`: regular expression matches
    - returns a list of strings 
- `re.sub(<PATTERN>, <REPLACEMENT>, <SEARCH STRING>)`: regular expression match + substitution
    - returns a new string with the substitutions (remember strings are immutable)

In [None]:
msg = "In CS 320,\tthere are 40 lectures, 10 quizzes, 3 exams,\t7 projects, and 1000 things to learn. CS 320 is awesome!"
print(msg)

#### Find all digits.

### Groups
- we can capture matches using `()` => this is the special meaning of `()`
- returns a list of tuples, where length of the tuple will be number of groups

#### Find all digits and the word that comes after that.

In [None]:
# Two capturing groups => findall returns a list of (digits, word) tuples
matches = re.findall(r"(\d+) (\w+)", msg)
matches

#### Goal: make a dict (course component => count, like "projects" => 7)

In [None]:
# Map each component name to its count as an int
# (if a name repeats, the last occurrence wins)
course_dict = {component: int(count) for count, component in matches}
course_dict

### Unlike matches, groups can overlap

#### Find and group all digits and the word that comes after that.

In [None]:
# The outer group captures the whole "digits word" pair and overlaps with
# the two inner groups, so each result is a 3-tuple: (pair, digits, word)
re.findall(r"((\d+) (\w+))", msg)

#### Substitute all digits with "###".

In [None]:
# Replace every run of digits with the literal text "###"
re.sub(r"\d+", "###", msg)

#### Goal: normalize whitespace (everything will be a single space)

In [None]:
print(msg)

In [None]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space
re.sub(r"\s+", " ", msg)

### How to use groups in substitution?
- `\g<N>` gives you the result of the N'th grouping.

#### Substitute all course component counts with HTML bold tags.

In [None]:
# \g<1> re-inserts the text captured by group 1 between the bold tags
print(re.sub(r"(\d+)", r"<b>\g<1></b>", msg))

In CS <b>320</b>, there are <b>40</b> lectures, <b>10</b> quizzes, <b>3</b> exams, <b>7</b> projects, and <b>1000</b> things to learn. CS <b>320</b> is awesome!

### Git log example

#### Run `git log` as a shell command

In [None]:
!git log

In [None]:
# Run `git log` and decode the bytes it returns as UTF-8 text
git_log_output = check_output(["git", "log"]).decode("utf-8")
# Preview the first 500 characters
print(git_log_output[:500])

#### GOAL: find all the commit numbers

In [None]:
# Each entry starts with a line "commit <hash>"; anchoring at the start of a
# line (re.MULTILINE) avoids picking up the word "commit" inside messages
commits = re.findall(r"^commit ([0-9a-f]+)", git_log_output, flags=re.MULTILINE)
# recent 10 commit numbers
commits[:10]

#### What days of the week does Meena push things into this repo?

In [None]:
print(git_log_output[:500])

In [None]:
# The "Date:" line starts with the three-letter day of the week; capture just that
days = re.findall(r"^Date:\s+(\w{3})", git_log_output, flags=re.MULTILINE)
days

#### Count unique days

In [None]:
day_counts = pd.Series(days).value_counts()
day_counts

#### Sort by day of the week

In [None]:
# reindex (instead of .loc) lists all seven days in calendar order and fills
# days without any commits with 0 rather than raising a KeyError
week_days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
sorted_day_counts = day_counts.reindex(week_days, fill_value=0)
sorted_day_counts

#### Create a bar plot

In [None]:
ax = sorted_day_counts.plot.bar()
ax.set_ylabel("Commit counts")
ax.set_xlabel("Days of the week")

#### Find all commit author names.

In [None]:
# The "Author:" line has the form "Author: <name> <<email>>"; capture the name only
authors = re.findall(r"^Author: (.+?) <", git_log_output, flags=re.MULTILINE)
authors[0]

#### `git log` from projects repo

In [None]:
# Run `git log` with the working directory set to the projects repo and decode the output as UTF-8
# NOTE(review): the relative path assumes a checkout at ../../cs320-s23-projects — confirm it exists locally
git_log_output = str(check_output(["git", "log"], cwd="../../cs320-s23-projects"), encoding="utf-8")
# Preview the first 1000 characters
print(git_log_output[:1000])

In [None]:
# NOTE(review): the pattern is still empty, so this only produces empty-string matches;
# fill in the pattern for whatever should be extracted from the projects log
re.findall(r"", git_log_output)

### Emails example

In [None]:
s = """
Meena [Instructor] - ms (AT) cs.wisc.edu
Yiyin [Head TA] - yshen82 (AT) wisc.edu
Ivan [TA] - ivanjaenm (AT) cs.wisc.edu
Mahitha [TA] - pillodi (AT) wisc.edu
Jiaqi [Peer Mentor]: jxia53 (AT) wisc.edu
Garrison [Peer Mentor]: gwaugh (AT) wisc.edu
"""
print(s)

In [None]:
# Building blocks of the e-mail pattern
name = r"\w+"
# "@", or AT/at (any letter case) optionally wrapped in () or []
at = r"@|([\(\[]?[Aa][Tt][\)\]]?)"
# one or two dot-separated labels followed by a known top-level domain
domain = r"\w+\.(\w+\.)?(edu|com|org|net|io|gov)"

# rf-string: "f" interpolates the pieces, "r" keeps \s as a regex escape
# (in a plain f-string "\s" is an invalid escape sequence).
# In each findall tuple, index 1 is the name and index 4 is the domain.
full_regex = rf"(({name})\s*({at})\s*({domain}))"

re.findall(full_regex, s)

In [None]:
print("REGEX:", full_regex)
for match in re.findall(full_regex, s):
    print(match[1] + "@" + match[4])

### Self-practice

Q1: Which regex will NOT match "123"
1. r"\d\d\d"
2. r"\d{3}"
3. r"\D\D\D"
4. r"..."

Q2: What will r"^A" match?
1. "A"
2. "^A"
3. "BA"
4. "B"
5. "BB"

Q3: Which one can match "HH"?
1. r"HA+H"
2. r"HA+?H"
3. r"H(A+)?H"

Q4: Which string(s) will match r"^(ha)*$"
1. ""
2. "hahah"
3. "that"
4. "HAHA"

Q5: What is the type of the following? `re.findall(r"(\d) (\w+)", some_str)[0]`
1. list
2. tuple
3. string

Q6: What will it do?
```python
re.sub(r"(\d{3})-(\d{3}-\d{4})",
       r"(\g<1>) \g<2>",
       "608-123-4567")
```

I will post answers to self-practice questions as part of the next lecture notebook. Meanwhile, try to solve it using pen & paper. Then try to verify your answer using code.