# Python Regex Vignette

December 26, 2022

Vignette for Regex techniques.

@author: Oscar A. Trevizo

### References
1. https://docs.python.org/3/library/re.html
1. https://docs.python.org/3/howto/regex.html


# Library

In [1]:
import re

# Get data

In [3]:
# Sample text: lowercase letters plus one comma and one space.
s = "hello, world"

# Instantiate Regex rule

In [6]:
# Character class that matches exactly one ASCII letter (upper or lower case).
# Compiling once lets the same pattern object be reused on several strings.
reg_alpha = re.compile(r'[A-Za-z]')

# Test instantiated Regex rule

In [7]:
# findall returns every non-overlapping match as a list of strings; the
# pattern matches one letter at a time, so each letter is its own item and
# the comma and the space are skipped.
reg_alpha.findall(s)

['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd']

# Change the data and test it

In [8]:
# Rebind s to a noisier string (symbols, commas, digits, a dot). The same
# compiled pattern still returns only the ASCII letters.
s = "h###ello,,,,worl3.1416d$$$$"
reg_alpha.findall(s)

['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd']

# New rule, add `*` (zero or more repetitions) and test it

In [11]:
# `*` means "zero or more", so this pattern can also match the empty string:
# findall reports an empty match at every position that is not a letter, and
# one more at the end of the string. Runs of letters come back as one item.
# NOTE(review): the name says "plus" but the quantifier is `*`; `+` (one or
# more) would return only the letter runs, without the empty strings.
reg_alpha_plus = re.compile(r'[A-Za-z]*')
reg_alpha_plus.findall(s)

['h',
 '',
 '',
 '',
 'ello',
 '',
 '',
 '',
 '',
 'worl',
 '',
 '',
 '',
 '',
 '',
 '',
 'd',
 '',
 '',
 '',
 '',
 '']

# Instantiate new rule, numeric digits

In [14]:
# Character class that matches exactly one ASCII digit, 0 through 9.
reg_digits = re.compile(r'[0-9]')

# Test it

In [16]:
# Each digit is returned as its own one-character match; the '.' is not a
# digit, so it does not appear in the result.
n ='pi is 3.1416'

reg_digits.findall(n)

['3', '1', '4', '1', '6']

# Shorthand codes (predefined character classes) to simplify rules

<ul>
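  <li>\d Any decimal digit. For ASCII text this is 0 to 9; in Python 3 <code>str</code> patterns it also matches other Unicode digits unless <code>re.ASCII</code> is set.</li>
  <li>\w Any "word" character: a letter, a numeric digit, or the underscore character.</li>
  <li>\s Any whitespace character, such as a space, tab, or newline.</li>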
</ul>


In [12]:
# `\d*` is "zero or more digits", so it also matches the empty string: every
# non-digit position yields '', the digit run is returned whole, and there is
# one final empty match at the end of the string.
text ='My id is 75555688'
reg = re.compile(r'\d*')
reg.findall(text)

['', '', '', '', '', '', '', '', '', '75555688', '']

In [13]:
# `\w*` is "zero or more word characters": each word and the number come back
# whole, with an empty match at every space and one at the end of the string.
text ='My id is 75555688'
reg = re.compile(r'\w*')
reg.findall(text)

['My', '', 'id', '', 'is', '', '75555688', '']

In [14]:
# `\s` with no quantifier matches exactly one whitespace character, so the
# result has one item per space in the text.
text ='My id is 75555688'
reg = re.compile(r'\s')
reg.findall(text)

[' ', ' ', ' ']

In [15]:
# The '.' is not a digit, so `\d*` splits the price into '12' and '99' with an
# empty match at the dot; the other non-digit positions also yield ''.
text ='The price is 12.99'
reg = re.compile(r'\d*')
reg.findall(text)

['', '', '', '', '', '', '', '', '', '', '', '', '', '12', '', '99', '']

## My own example: replacing non-alphanumeric values with NaN in pandas

In [13]:
import pandas as pd
import numpy as np

In [23]:
# Small frame in which '#' stands in for a bad value in each column:
# 'n' mixes ints with one string, 's' holds only strings.
df = pd.DataFrame({'n':[1, '#', 3], 's':['a', 'b', '#']})

In [24]:
# head() returns the first five rows by default; the frame has three rows,
# so all of them are displayed.
df.head()

Unnamed: 0,n,s
0,1,a
1,#,b
2,3,#


In [27]:
# Replace every string cell in column 'n' that contains a character other
# than an ASCII letter or digit (here the '#') with NaN. Non-string cells
# such as the ints are left alone by the regex replacement.
# np.nan is the canonical spelling: the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
df['n'] = df['n'].replace('[^A-Za-z0-9]', np.nan, regex=True)
df.head()

Unnamed: 0,n,s
0,1.0,a
1,,b
2,3.0,#


In [28]:
# Same clean-up for column 's': any string cell containing a character other
# than an ASCII letter or digit (here the '#') becomes NaN.
# np.nan is the canonical spelling: the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
df['s'] = df['s'].replace('[^A-Za-z0-9]', np.nan, regex=True)
df.head()

Unnamed: 0,n,s
0,1.0,a
1,,b
2,3.0,
