# Regular Expressions


In [470]:
from nose.tools import assert_true, assert_false, \
    assert_almost_equal, assert_equal, assert_raises

In [471]:

import numpy as np
import locale
import os
import re
import pandas as pd
from textblob import TextBlob

In [472]:
# Relative location of the course data directory ("../Resources").
# NOTE(review): the loading cells further down read "./income.txt" and
# "./income_uk.txt" directly and never use DATADIR -- confirm which location is intended.
DATADIR = os.path.join("..","Resources")
# Last expression of the cell: shows whether that directory exists on this machine.
os.path.exists(DATADIR)
# Template placeholder for pointing DATADIR at a different directory.
#DATADIR = "PATH_TO_WHERE_YOU_KEEP_DATA"

False

**Problem 1 (10 points).** Write a function (`currencyToFloat`) that uses a regular expression to capture and return the **numeric value** of a currency string and its **units.** Note that you need to use the **[locale](http://docs.python.org/3/library/locale.html)** module because the strings may contain "," and "." separators.

In [473]:
# YOUR CODE HERE
def currencyToFloat(currency):
    """
    Split a currency string into its numeric value and its unit.

    A regular expression locates the first number in the string (digits with
    optional "." / "," separators); everything in front of it is the unit.

    Parameters
    ----------
    currency : str
        Text such as "$1,243,567.32" or "£19,469".

    Returns
    -------
    tuple
        ``(amount, unit)`` where ``amount`` is a float and ``unit`` is the
        text preceding the number, returned as-is (not stripped).

    Raises
    ------
    ValueError
        If the string contains no number, or ``locale.atof`` cannot parse
        the number that was found.
    """
    # Raw string, and the run must end on a digit so that punctuation around
    # the number ("Rs. 500", "5.00.") is not swallowed into it.
    number_re = re.compile(r'[0-9.,]*[0-9]')
    found = number_re.search(currency)
    if found is None:
        raise ValueError("no numeric value found in %r" % (currency,))

    unit = currency[:found.start()]
    digits = found.group(0)

    # locale.setlocale(category) without a second argument only *queries* the
    # locale, so it cannot be relied on to configure number parsing. Honour
    # whatever numeric locale is active; when that locale defines no grouping
    # symbol (the default "C" locale), strip "," ourselves so that atof does
    # not choke on "1,243,567.32".
    conventions = locale.localeconv()
    if not conventions['thousands_sep'] and conventions['decimal_point'] != ',':
        digits = digits.replace(',', '')

    amount = locale.atof(digits)
    return (amount, unit)

In [474]:
# Integer amount with a "," group separator and a non-ASCII unit symbol.
assert_equal(currencyToFloat("£19,469"), (19469.0, '£'))

In [475]:
# Several "," groups plus a "." decimal part.
assert_equal(currencyToFloat("$1,243,567.32"), (1243567.32, '$'))

**Problem 2 (14 points):** Read the contents of income.txt into a Pandas data frame.

Write a second function ``extract_income`` that takes as positional arguments a Pandas data frame and a column name from which to extract income. The function should return a new data frame that has columns consisting of the numeric values and units extracted by the ``currencyToFloat`` function. The names of the columns in the new data frame should be derived from the input column (e.g. "per capita income" would create two columns named "per capita income value" and "per capita income unit"). Test this with income.txt and income_uk.txt located in the Resources directory.

#### Hints

1. ``SOME_PANDAS_SERIES.apply(pd.Series)`` will split the series (column) `SOME_PANDAS_SERIES` with tuple values into a data frame with each element of the tuples forming a column in the new dataframe with column names 0, 1, 2, etc. See [this](http://stackoverflow.com/questions/29550414/how-to-split-column-of-tuples-in-pandas-dataframe) Stackoverflow question/answer.
1. Use the [``rename``](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html) method to rename the columns of the generated DataFrame.
1. Pay attention to the presence/absence of headers. If the file does not have a header, use ``rename`` to provide meaningful column names.
1. Since strings are unicode, you can paste a British pound symbol into your regular expression.
1. Use the ``encoding`` keyword argument for ``read_table``.

### Your solutions should look something like this:

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Income value</th>
      <th>Income unit</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>21109.0</td>
      <td>£</td>
    </tr>
    <tr>
      <th>1</th>
      <td>19603.0</td>
      <td>£</td>
    </tr>
    <tr>
      <th>2</th>
      <td>19469.0</td>
      <td>£</td>
    </tr>
    <tr>
      <th>3</th>
      <td>19282.0</td>
      <td>£</td>
    </tr>
    <tr>
      <th>4</th>
      <td>19236.0</td>
      <td>£</td>
    </tr>
  </tbody>
</table>

In [476]:
# YOUR CODE HERE
def extract_income(df, col, parser=None):
    """
    Build a new data frame holding the value and unit parsed from ``df[col]``.

    The input frame is left untouched (no helper column is added to it).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the currency strings.
    col : str
        Name of the column to parse; also the stem of the output column names.
    parser : callable, optional
        Function mapping one cell to an ``(amount, unit)`` pair. Defaults to
        ``currencyToFloat``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``"<col> value"`` and ``"<col> unit"``, sharing the
        index of ``df``.
    """
    if parser is None:
        parser = currencyToFloat

    # Parse straight from the column; building the frame from the list of
    # pairs avoids both the row-wise apply and the per-row pd.Series objects.
    pairs = [parser(cell) for cell in df[col]]
    return pd.DataFrame(pairs,
                        index=df.index,
                        columns=[col + " value", col + " unit"])

In [477]:
# Read both income tables from the notebook's working directory.
income = pd.read_table("./income.txt")

# The UK file is read without a header row, so its two positional columns
# are labelled right after loading.
income_uk = pd.read_table("./income_uk.txt", header=None, encoding='utf-8')
income_uk = income_uk.rename(columns={0: "Region", 1: "Income"})

# Last expression of the cell: display the extracted value/unit frame.
extract_income(income_uk, "Income")

Unnamed: 0,Income value,Income unit
0,21109.0,£
1,19603.0,£
2,19469.0,£
3,19282.0,£
4,19236.0,£
5,18801.0,£
6,18629.0,£
7,18614.0,£
8,18321.0,£
9,17651.0,£


In [478]:
# Reload both tables so this check does not depend on earlier cells.
income = pd.read_table("./income.txt")
income_uk = pd.read_table("./income_uk.txt", header=None, encoding='utf-8')
income_uk = income_uk.rename(columns={0: "Region", 1: "Income"})

# Row 5 of the UK result is the one being checked.
row = extract_income(income_uk, "Income").iloc[5]
assert_almost_equal(row["Income value"], 18801)
assert_equal(row["Income unit"], "£")

In [479]:
# Reload both tables so this check does not depend on earlier cells.
income = pd.read_table("./income.txt")
income_uk = pd.read_table("./income_uk.txt", header=None, encoding='utf-8')
income_uk = income_uk.rename(columns={0: "Region", 1: "Income"})

# Row 5 of the result for the "median household income" column of income.txt.
row = extract_income(income, "median household income").iloc[5]
assert_almost_equal(row["median household income value"], 63636)
assert_equal(row["median household income unit"], "$")

**Problem 3 (20 Points):**

Write a regular expression (`regname`) to match physician names in reports. You can assume that any name in the text is either preceded by "Dr." or followed by "MD" (or "M.D.", "D.O.", "DO"). Use the following named groups:

* "prefix"
* "fname"
* "mname"
* "lname"
* "suffix"

Match at least the following name variations:
    * Dr. Vivian Lee
    * Dr. Vivian S Lee
    * Vivian Lee, MD
    * Vivian S Lee, MD
    
#### Hints

1. Be careful in how you deal with white spaces. Regular expressions are a balancing act between sensitivity and specificity.
1. What parts in the names are optional?

In [480]:
# Pattern for personal names of the form
#   [Dr. ]First [Middle ]Last[, CREDENTIAL]
# Neither the title nor the credential is required, so any "Capitalised
# Capitalised" pair is a candidate; the notebook's checks rely on that
# (they expect names followed by PhD / ScD to be matched as well).
# The fname, mname and prefix groups keep their trailing whitespace.
regname = re.compile(r"""
    (?P<prefix>Dr\.\s)?                        # literal "Dr." -- not \D, which means "any non-digit"
    (?P<fname>[A-Z][a-z]*\s)                   # first name or initial
    (?P<mname>[A-Z][a-z]*\.?\s)?               # optional middle name / initial, with optional "."
    (?P<lname>[A-Z][a-z]+)                     # needs a lowercase letter, so the pronoun "I" is
                                               # rejected while surnames starting with I are allowed
    (?P<suffix>,?\s?[A-Z]?[a-z]?\.?[A-Z])?     # e.g. ", MD"  ", Ph.D"  ", ScD"
""", re.VERBOSE)

In [481]:
# Check the first hit in the sample report: "Dr. Barry Stults"
# (title present, no middle name, no credential suffix).
tst = \
"""
My physician is Dr. Barry Stults while my mentor is 
Dr. Matthew H Samore.
Vivian Lee, MD hired me a few years after she was hired by
A Lorris Betz, MD.
David W. Pershing, Ph.D. is the university president and a lot of people hope that Ruth V Watkins, PhD is hired as the next
univeersity president. When I was in Pittsburgh I worked with David Gur, ScD and Dr. Jules Henry Sumkin. My favorite
 philosopher is Alfred North Whitehead, Ph.D.


"""
# Hits are collected in text order, so the index below selects by position.
matches = list(regname.finditer(tst))
m = matches[0]
# Groups are stripped before comparison, so captured surrounding whitespace is tolerated.
assert_equal(m.group("prefix").strip(), "Dr.")
assert_equal(m.group("fname").strip(), "Barry")
assert_equal(m.group('mname'), None)
assert_equal(m.group("lname").strip(), "Stults")
assert_equal(m.group("suffix"), None)

In [482]:
# Check the second hit: "Dr. Matthew H Samore" (title plus a middle initial).
# This variant of the text also contains "Dr. Betz's", which has no first name.
tst = \
"""
My physician is Dr. Barry Stults while my mentor is 
Dr. Matthew H Samore.
Vivian Lee, MD hired me a few years after she was hired by
A Lorris Betz, MD.
David W. Pershing, Ph.D. is Dr. Betz's boss and a lot of people hope that Ruth V Watkins, PhD is hired as the next
univeersity president. When I was in Pittsburgh I worked with David Gur, ScD and Dr. Jules Henry Sumkin. My favorite
 philosopher is Alfred North Whitehead, Ph.D.


"""
# Hits are collected in text order, so the index below selects by position.
matches = list(regname.finditer(tst))
m = matches[1]
assert_equal(m.group("prefix").strip(), "Dr.")
assert_equal(m.group("fname").strip(), "Matthew")
assert_equal(m.group('mname').strip(), "H")
assert_equal(m.group("lname").strip(), "Samore")
assert_equal(m.group("suffix"), None)

In [483]:
# Check the third hit: "Vivian Lee, MD" (no title, credential suffix present).
tst = \
"""
My physician is Dr. Barry Stults while my mentor is 
Dr. Matthew H Samore.
Vivian Lee, MD hired me a few years after she was hired by
A Lorris Betz, MD.
David W. Pershing, Ph.D. is the university president and a lot of people hope that Ruth V Watkins, PhD is hired as the next
univeersity president. When I was in Pittsburgh I worked with David Gur, ScD and Dr. Jules Henry Sumkin. My favorite
 philosopher is Alfred North Whitehead, Ph.D.


"""
# Hits are collected in text order, so the index below selects by position.
matches = list(regname.finditer(tst))
m = matches[2]
assert_equal(m.group("prefix"), None)
assert_equal(m.group("fname").strip(), "Vivian")
assert_equal(m.group('mname'), None)
assert_equal(m.group("lname").strip(), "Lee")
# The suffix group is expected to include the leading comma.
assert_equal(m.group("suffix").strip(), ", MD")


In [484]:
# Check the seventh hit: "David Gur, ScD".
# NOTE(review): for this name to sit at index 6, the six names before it in the
# text (Stults, Samore, Lee, Betz, Pershing, Watkins) presumably all have to
# match, including the ones followed by Ph.D. / PhD rather than MD.
tst = \
"""
My physician is Dr. Barry Stults while my mentor is 
Dr. Matthew H Samore.
Vivian Lee, MD hired me a few years after she was hired by
A Lorris Betz, MD.
David W. Pershing, Ph.D. is the university president and a lot of people hope that Ruth V Watkins, PhD is hired as the next
univeersity president. When I was in Pittsburgh I worked with David Gur, ScD and Dr. Jules Henry Sumkin. My favorite
 philosopher is Alfred North Whitehead, Ph.D.


"""
# Hits are collected in text order, so the index below selects by position.
matches = list(regname.finditer(tst))
m = matches[6]
assert_equal(m.group("prefix"), None)
assert_equal(m.group("fname").strip(), "David")
assert_equal(m.group('mname'), None)
assert_equal(m.group("lname").strip(), "Gur")
# The suffix group is expected to include the leading comma.
assert_equal(m.group("suffix").strip(), ", ScD")


**Problem 4 (24 points):**

* Write a regular expression to match dates in the following form:
    * 2011-03-13
    

* Write a second regular expression to match dates in the following formats:
    * 01/28/1947
    * 03/17/75
    

* Write a third regular expression to match dates in the following formats.
    * January 18, 2015
    * Jan 1931
    * Jan 8, 47
    

Put all the regular expressions in a list named `redates`. 

In [485]:
import re

# Three date patterns, each exposing the named groups year / month / day.
# Digit counts are bounded and guarded with (?<!\d) / (?!\d) so that a date is
# never carved out of a longer run of digits.
redates = []

# ISO style: 2011-03-13
redates.append(re.compile(
    r'(?<!\d)(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})(?!\d)'))

# US style with a 4- or 2-digit year: 01/28/1947, 03/17/75
redates.append(re.compile(
    r'(?<!\d)(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{4}|\d{2})(?!\d)'))

# Month name (full or abbreviated), optional day, 4- or 2-digit year:
# January 18, 2015 / Jan 1931 / Jan 8, 47
redates.append(re.compile(r"""
    \b(?P<month>
        Jan(?:uary)? | Feb(?:ruary)? | Mar(?:ch)? | Apr(?:il)? | May | June? |
        July? | Aug(?:ust)? | Sep(?:t(?:ember)?)? | Oct(?:ober)? |
        Nov(?:ember)? | Dec(?:ember)?
    )
    \s
    (?P<day>\d{1,2},)?          # the comma stays inside the group: the notebook's checks expect '18,'
    \s?
    (?P<year>\d{4}|\d{2})
    (?!\d)
""", re.VERBOSE))

In [486]:
# Check the ISO-style date "2011-03-13".
tst = \
"""
    * 2011-03-13
    * 01/08/1947
    * 03/01/75
    * January 18, 2015
    * Jan 1931
    * Jan 8, 47
"""
# Hits are concatenated pattern by pattern, in the order the patterns appear in
# redates, so index 0 is the first hit of redates[0].
matches = [m for r in redates for m in list(r.finditer(tst)) ]
m = matches[0]
assert_equal(m.group("year"), "2011")
assert_equal(m.group("month"), "03")
assert_equal(m.group("day"), "13")

In [487]:
# Check the slash-style date "01/08/1947".
tst = \
"""
    * 2011-03-13
    * 01/08/1947
    * 03/01/75
    * January 18, 2015
    * Jan 1931
    * Jan 8, 47
"""
# Hits are concatenated pattern by pattern; expecting this date at index 1
# presumes redates[0] yields exactly one hit in this text.
matches = [m for r in redates for m in list(r.finditer(tst)) ]
m = matches[1]
assert_equal(m.group("year"), "1947")
assert_equal(m.group("month"), "01")
assert_equal(m.group("day"), "08")

In [488]:
# Check the month-name date "January 18, 2015".
tst = \
"""
    * 2011-03-13
    * 01/08/1947
    * 03/01/75
    * January 18, 2015
    * Jan 1931
    * Jan 8, 47
"""
# Hits are concatenated pattern by pattern; index 3 presumes three hits in
# total from the two numeric patterns before the month-name pattern's first hit.
matches = [m for r in redates for m in list(r.finditer(tst)) ]
m = matches[3]
assert_equal(m.group("year"), "2015")
assert_equal(m.group("month"), "January")
# The day group is expected to carry its trailing comma.
assert_equal(m.group("day").strip(), "18,")

In [489]:
# Check the day-less month-name date "Jan 1931".
tst = \
"""
    * 2011-03-13
    * 01/08/1947
    * 03/01/75
    * January 18, 2015
    * Jan 1931
    * Jan 8, 47
"""
# Hits are concatenated pattern by pattern, in the order the patterns appear in redates.
matches = [m for r in redates for m in list(r.finditer(tst)) ]
m = matches[4]
assert_equal(m.group("year"), "1931")
assert_equal(m.group("month"), "Jan")
# With no day in the text, the optional day group must not participate in the match.
assert_equal(m.group("day"), None)