# Text Wrangling and Regex
Working with text: applying string methods and regular expressions

In [169]:
# from google.colab import drive
# drive.mount('/content/drive')

In [170]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile
import pandas as pd

## Demo 1: Canonicalizing County Names

In [171]:
# Load the county/state table and the county/population table from CSV
states = pd.read_csv('datafiles/county_and_state.csv')
populations = pd.read_csv('datafiles/county_and_population.csv')

# display() renders each DataFrame in turn, so both can be shown from a single cell
display(states, populations)

Unnamed: 0,County,State
0,De Witt County,IL
1,Lac qui Parle County,MN
2,Lewis and Clark County,MT
3,St John the Baptist Parish,LS


Unnamed: 0,County,Population
0,DeWitt,16798
1,Lac Qui Parle,8067
2,Lewis & Clark,55716
3,St. John the Baptist,43044


Both of these DataFrames share a "County" column. Unfortunately, formatting differences mean that we can't directly merge the two DataFrames on their "County" columns.

In [172]:
# Attempt the merge on the raw county names
states.merge(populations, on='County')

Unnamed: 0,County,State,Population


To address this, we can **canonicalize** the "County" string data to apply a common formatting.

In [173]:
# String clean-up applied to the "County" column of each DataFrame
def canonicalize_county(county_series):
    """Return a canonical form of every county name in a Series of strings.

    Each name is lower-cased, then spaces are removed, '&' is spelled out as
    'and', periods are dropped, and the substrings 'county' and 'parish' are
    removed.

    Parameters
    ----------
    county_series : pd.Series
        Series of county-name strings.

    Returns
    -------
    pd.Series
        A new Series with the canonicalized names; the input is not modified.
    """
    # regex=False makes every pattern a literal substring. This matters most
    # for '.', which would match any character if it were read as a regex,
    # and it keeps the result independent of the pandas version's default.
    return (
        county_series
        .str.lower()
        .str.replace(' ', '', regex=False)
        .str.replace('&', 'and', regex=False)
        .str.replace('.', '', regex=False)
        .str.replace('county', '', regex=False)
        .str.replace('parish', '', regex=False)
    )

In [174]:
# Canonicalize each DataFrame's own "County" column.
states['County'] = canonicalize_county(states['County'])
# Use the populations names here: assigning the canonicalized *states* names
# would only look right because the two tables happen to list the counties in
# the same row order.
populations['County'] = canonicalize_county(populations['County'])

In [175]:
# Show both tables, one after the other
display(states, populations)

Unnamed: 0,County,State
0,dewitt,IL
1,lacquiparle,MN
2,lewisandclark,MT
3,stjohnthebaptist,LS


Unnamed: 0,County,Population
0,dewitt,16798
1,lacquiparle,8067
2,lewisandclark,55716
3,stjohnthebaptist,43044


Now, the merge works as expected!

In [176]:
# Merge both frames on their shared "County" column
states.merge(populations, on='County')

Unnamed: 0,County,State,Population
0,dewitt,IL,16798
1,lacquiparle,MN,8067
2,lewisandclark,MT,55716
3,stjohnthebaptist,LS,43044


## Demo 2: Extracting Log Data

In [177]:
# with open('datafiles/log.txt', 'r') as file:
#     # logfile = file.read()  # for storing the data to file
#     logdata = []
#     for line in file:
#         # print(line.strip())
#         logdata.append(line.strip()) # to have strings in list

# # print(logfile)
# logdata

In [178]:
# Alternative without an explicit for loop: read the whole file, then split it into lines
with open('datafiles/log.txt') as log_file:
    logdata = log_file.read().splitlines()
print(logdata)

['169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"', '193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"', '169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"']


Suppose we want to extract the day, month, year, hour, minutes, seconds, and timezone. Looking at the data, we see that these items are not in a fixed position relative to the beginning of the string. That is, slicing by some fixed offset isn't going to work.

In [179]:
# First line: locate the square brackets that enclose the timestamp
logdata[0].find('[')  # 19 -> the timestamp starts at index 20
logdata[0].find(']')  # 46 -> the timestamp ends just before index 46

# Slice with those fixed offsets (only this last expression is displayed)
logdata[0][20:46]

'26/Jan/2014:10:47:58 -0800'

In [180]:
# Second line: the same fixed offsets do not extract the correct data here,
# so we would have to find new indices for every entry
logdata[1][20:46]

'/Feb/2005:17:23:6 -0800] "'

Instead, we'll need to use some more sophisticated thinking. Let's focus on only the first line of the file.

In [181]:
# read 1st line only
logdata[0]

'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"'

In [None]:
# apply string operations and print day, month, year, and time

                                              # find the text enclosed in square brackets
                                              # split up the date/month/year
                                              # split up the hour:minute:second
                                              # split the timezone after the blank space
# day, month, year, hour, minute, seconds, time_zone

('26', 'Jan', '2014', '10', '47', '58', '-0800')

In [264]:
# # for single entry
# day = logdata[0].split('[')[1].split('/')[0]
# month = logdata[0].split('/')[1]
# rest = logdata[0].split('/')[2]
# year, hour, minute, rest = logdata[0].split('/')[2].split(':')
# seconds = logdata[0].split('/')[2].split(':')[3].split(' -')[0]
# timezone = logdata[0].split('/')[2].split(':')[3].split(' -')[1].split(']')[0]

# day, month, year , hour, minute, seconds, timezone

In [262]:
# Plain-string version: pull the date and time fields out of one log line
# using only string operations
def get_time_info(logdata):
    """Extract the timestamp fields from a single log line.

    The timestamp is expected inside the first pair of square brackets, in
    the form ``day/month/year:hour:minute:seconds timezone``.

    Parameters
    ----------
    logdata : str
        One line of the log file.

    Returns
    -------
    tuple of str
        ``(day, month, year, hour, minute, seconds, timezone)``.

    Raises
    ------
    IndexError
        If the line contains no ``[``.
    ValueError
        If the bracketed text does not have the expected layout.
    """
    # Isolate the bracketed text first, so a '/' or ':' elsewhere in the line
    # (for example in the host or the request path) cannot shift the fields.
    timestamp = logdata.split('[', 1)[1].split(']', 1)[0]

    # The time zone follows the blank space.
    date_and_time, timezone = timestamp.split(' ', 1)

    # day/month/year:hour:minute:seconds
    day, month, year_and_clock = date_and_time.split('/')
    year, hour, minute, seconds = year_and_clock.split(':')

    return day, month, year, hour, minute, seconds, timezone

In [265]:
get_time_info(logdata[0])

('26', 'Jan', '2014', '10', '47', '58', '-0800')

In [268]:
# Parse every log line; each tuple holds the fields extracted from one line
log_time_info = [get_time_info(entry) for entry in logdata]

log_time_info

[('26', 'Jan', '2014', '10', '47', '58', '-0800'),
 ('2', 'Feb', '2005', '17', '23', '6', '-0800'),
 ('3', 'Feb', '2006', '10', '18', '37', '-0800')]

In [259]:
# For dataframe and series:
# apply string operations and print day, month, year, and time
# def get_time_info(logdata):
#     day = logdata.str.split('[')[1].split('/')[0]
#     month = logdata.str.split('/')[1]
#     rest = logdata.str.split('/')[2]
#     year, hour, minute, rest = logdata.str.split('/')[2].split(':')
#     seconds = logdata.str.split('/')[2].split(':')[3].split(' ')[0]
#     timezone = logdata.str.split('/')[2].split(':')[3].split(' ')[1].split(']')[0]

#     return day, month, year , hour, minute, seconds, timezone


In [258]:
# # logdata
# log_df = pd.DataFrame({'Logs': logdata})
# log_df
# # log_df['Logs'] = get_time_info(log_df['Logs'])


In [257]:
# log_df['Logs'].str.split('[')[1]
# log_df['Logs'].str.split('[')[0].str.split(':')

This worked, but felt fairly "hacky" – the code above isn't particularly elegant. A much more sophisticated but common approach is to extract the information we need using a *regular expression*.


# Regular Expressions


## String Extraction with Regex

Python `re.findall` returns a list of all extracted matches:

In [270]:
# find all Social Security numbers in the text below
import re
text = "My social security number is 123-45-6789 bro, or actually maybe it's 321-45-6789."


<br/>

Now, let's see vectorized extraction in `pandas`:

 `.str.findall` returns a `Series` of lists of all matches in each record.

In [None]:
# convert the ['987-65-4321', 'forty', '123-45-6789 bro or 321-45-6789', '999-99-9999']
# into a DataFrame and extract all Social Security numbers


df_ssn

Unnamed: 0,SSN
0,987-65-4321
1,forty
2,123-45-6789 bro or 321-45-6789
3,999-99-9999


In [None]:
# -> Series of lists



# Apply str.findall() and remove empty strings from the result


df_ssn

Unnamed: 0,SSN
0,[987-65-4321]
1,[]
2,"[123-45-6789, 321-45-6789]"
3,[999-99-9999]


In [None]:
# find out AA or B from 'AA B B aaaabbbb'


['AA', 'B', 'B']

## Extraction Using Regex Capture Groups

The Python function `re.findall`, in combination with parentheses, returns specific substrings (i.e., **capture groups**) within each matched string, or **match**.

In [None]:
# find the hour, minute, and second
import re
text = """I will meet you at 08:30:00 pm tomorrow"""


[('08', '30', '00')]

<br/>

In `pandas`, we can use `.str.extract` to extract each capture group of **only the first match** of each record into separate columns.

In [None]:
# back to SSNs
df_ssn

Unnamed: 0,SSN
0,987-65-4321
1,forty
2,123-45-6789 bro or 321-45-6789
3,999-99-9999


In [None]:
# Will extract the first match of all groups
 # 3 groups


Unnamed: 0,0,1,2
0,987.0,65.0,4321.0
1,,,
2,123.0,45.0,6789.0
3,999.0,99.0,9999.0


Alternatively, `.str.extractall` extracts **all matches** of each record into separate columns. Rows are then MultiIndexed by original record index and match index.

In [None]:
# -> DataFrame, one row per match


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,987,65,4321
2,0,123,45,6789
2,1,321,45,6789
3,0,999,99,9999


## Canonicalization with Regex

In regular Python, canonicalize with `re.sub` (standing for "substitute"):

In [None]:
# find out Moo from given text
text = '<div><td valign="top">Moo</td></div>'


'Moo'

<br/>

In `pandas`, canonicalize with `Series.str.replace`.

In [None]:
# Example DataFrame with one HTML snippet per row
html_snippets = [
    '<div><td valign="top">Moo</td></div>',
    '<a href="http://ds100.org">Link</a>',
    '<b>Bold text</b>',
]
df_html = pd.DataFrame({'Html': html_snippets})
df_html

Unnamed: 0,Html
0,"<div><td valign=""top"">Moo</td></div>"
1,"<a href=""http://ds100.org"">Link</a>"
2,<b>Bold text</b>


In [None]:
# find out Moo, Link, and Bold text from each row of df_html dataframe


Unnamed: 0,Html
0,Moo
1,Link
2,Bold text



# Revisiting Text Log Processing using Regex

### Python `re` version

In [None]:
# `logdata` is the list of log lines read from datafiles/log.txt in Demo 2;
# the name `log_lines` is never defined in this notebook.
line = logdata[0]
display(line)

# One capture group per field of the bracketed timestamp:
# [day/month/year:hour:minute:second time_zone]
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
day, month, year, hour, minute, second, time_zone = re.findall(pattern, line)[0] # get first match
day, month, year, hour, minute, second, time_zone

'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'

('26', 'Jan', '2014', '10', '47', '58', '-0800')

### `pandas` version

In [None]:
# convert lines of above provided log.txt file into data frame and then find out date and time

Unnamed: 0,Log
0,169.237.46.168 - - [26/Jan/2014:10:47:58 -0800...
1,"193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] ""..."
2,"169.237.46.240 - """" [3/Feb/2006:10:18:37 -0800..."


Option 1: `Series.str.findall`

0    [(26, Jan, 2014, 10, 47, 58, -0800)]
1      [(2, Feb, 2005, 17, 23, 6, -0800)]
2     [(3, Feb, 2006, 10, 18, 37, -0800)]
Name: Log, dtype: object

<br/>

Option 2: `Series.str.extractall`

In [None]:
# apply extractall function and then some wrangling

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,26,Jan,2014,10,47,58,-800
1,0,2,Feb,2005,17,23,6,-800
2,0,3,Feb,2006,10,18,37,-800


Wrangling either of these two DataFrames into a nice format (like below) is left as an exercise for you! You will do a related problem on the homework.


||Day|Month|Year|Hour|Minute|Second|Time Zone|
|---|---|---|---|---|---|---|---|
|0|26|Jan|2014|10|47|58|-0800|
|1|2|Feb|2005|17|23|6|-0800|
|2|3|Feb|2006|10|18|37|-0800|


In [None]:
# your code here