# Date Mining Assignment

In [394]:
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, Latex
pd.set_option('max_colwidth', 1000)

import re, os
for d in ["src","data"]: os.makedirs(d, exist_ok=True)

## Load Dataset

In [395]:
URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/08-Text_Mining/20-Mining_Dates/files/"

for filename, dest in [("public.csv", "src")]:
    
    source = f"{URL}/{filename}"
    target = f"{dest}/{filename}"

    if not os.path.isfile(target):
        print (f"Downloading remote file {filename} to folder {dest}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}")

Using local copy of public.csv in folder src


In [396]:
df = pd.read_csv("src/public.csv")
df.head()

Unnamed: 0,Code,Raw
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester."
1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it."
2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward."
3,1039963589,"October 7, 01 [report_end]"
4,1048901075,"July, 4, 01 Primary Care Doctor:"


## Outline of Solution Pipeline

 * First perform a cleaning step. This greatly reduces the number of regular expressions needed.
     - you to figure out specifics here
   
 * Define two helper functions to simply application of regular expression and to display which parts of dataset are matched / not matched.
 * Build a sequence of regular expressions (starting from most restrictive to avoid false positives) to apply to dataset.
 * At each match/iteration use examples of unmatched rows to determine next possible regular expression.

---

## Cleaning Step

As a result of this step, the dataframe will have columns

 * __Code__ unchanged
 * __Raw__ unchanged
 * __Data__ cleaned version of __Raw__
 * __Iter__ number of regex that matched this row (for development purposes)
 * __Match__ regex object result for this row (for development purposes, you might not use)
 * __Day__ day (or zero if not set)
 * __Month__ month (or zero if not set)
 * __Year__ year (or zero if not set)

### 1. Removing uneeded characters

In [408]:
df['Data'] = df["Raw"].replace({'\(':'', '\)':'', '\.':'', '\,':'', '\:':'', '\+':'', '\s+':' '}, regex=True)
df["Iter"] = 0        
df["Match"] = 0     
df["Day"] = 0
df["Month"] = 0
df["Year"] = 0

df.head()

Unnamed: 0,Code,Raw,Data,Iter,Match,Day,Month,Year
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",12 Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor Mcalester,0,0,0,0,0
1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",April 5 97 made a phone call to Mom and Mom commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue s/p partial glossectomy and neck dissection in 8/1974 referred to psycho-oncology for assistance with adjustment issues following recovery The patient does not meet criteria for a major mood or anxiety disorder She is not at imminent risk of harm to self or others She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement and to think through how to continue to create a life for herself moving forward,0,0,0,0,0
3,1039963589,"October 7, 01 [report_end]",October 7 01 [report_end],0,0,0,0,0
4,1048901075,"July, 4, 01 Primary Care Doctor:",July 4 01 Primary Care Doctor,0,0,0,0,0


### 2. Fixing month typos

In [409]:
from fuzzywuzzy import fuzz

months = ["january", "february", "march", "april", "june", "july", "august", "september", "october", "november", "december"]

for index, line in df['Data'].items():
    for word in line.split():
        for month in months:
            ratio = fuzz.ratio(word.lower(), month)
            if ratio > 90  and word.lower() != month:
                print("Match between: {0} - {1}, {2}%".format(word.lower(), month, ratio))
                df['Data'] = df['Data'].replace(line, line.replace(word, month))


df.head(20)

Match between: noember - november, 93%
Match between: septeber - september, 94%
Match between: sepember - september, 94%
Match between: sptember - september, 94%
Match between: janury - january, 92%
Match between: augst - august, 91%
Match between: deember - december, 93%
Match between: sptember - september, 94%
Match between: septmber - september, 94%
Match between: noember - november, 93%
Match between: decemer - december, 93%
Match between: septemer - september, 94%
Match between: augst - august, 91%
Match between: sepember - september, 94%
Match between: deember - december, 93%
Match between: jauary - january, 92%
Match between: auust - august, 91%
Match between: setember - september, 94%
Match between: otober - october, 92%
Match between: sptember - september, 94%
Match between: agust - august, 91%
Match between: febuary - february, 93%
Match between: septmber - september, 94%
Match between: decmber - december, 93%
Match between: febrary - february, 93%
Match between: janary - jan

Unnamed: 0,Code,Raw,Data,Iter,Match,Day,Month,Year
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",12 november 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor Mcalester,0,0,0,0,0
1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",April 5 97 made a phone call to Mom and Mom commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue s/p partial glossectomy and neck dissection in 8/1974 referred to psycho-oncology for assistance with adjustment issues following recovery The patient does not meet criteria for a major mood or anxiety disorder She is not at imminent risk of harm to self or others She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement and to think through how to continue to create a life for herself moving forward,0,0,0,0,0
3,1039963589,"October 7, 01 [report_end]",October 7 01 [report_end],0,0,0,0,0
4,1048901075,"July, 4, 01 Primary Care Doctor:",July 4 01 Primary Care Doctor,0,0,0,0,0
5,1054311047,)and 8mo in 2009,and 8mo in 2009,0,0,0,0,0
6,1054668034,")HTN, hypercholesterolemia, DM, sleep apnea,, nephrolithiasis. chronic renal impairment, DVT since July 1977 on enoxaparin.",HTN hypercholesterolemia DM sleep apnea nephrolithiasis chronic renal impairment DVT since July 1977 on enoxaparin,0,0,0,0,0
7,1082469285,"Septeber, 10, 70 CPT Code: 90792: With medical services",september 10 70 CPT Code 90792 With medical services,0,0,0,0,0
8,1125769793,"Since 10/2014: Fatigued, more forgetful, impaired dexterity on her left hand. MRI reveals an approximately 4.2cm x 3.3cm x 2.5cm right parietal enhancing mass with surrounding edema",Since 10/2014 Fatigued more forgetful impaired dexterity on her left hand MRI reveals an approximately 42cm x 33cm x 25cm right parietal enhancing mass with surrounding edema,0,0,0,0,0
9,1148116416,24 yo right handed woman with history of large right frontal mass s/p resection 11/3/1985 who had recent urgent R cranial wound revision and placement of L EVD for declining vision and increased drainage from craniotomy incision site and possible infection. She has a hx of secondary mania related to psychosis and manipulation of her right frontal lobe.,24 yo right handed woman with history of large right frontal mass s/p resection 11/3/1985 who had recent urgent R cranial wound revision and placement of L EVD for declining vision and increased drainage from craniotomy incision site and possible infection She has a hx of secondary mania related to psychosis and manipulation of her right frontal lobe,0,0,0,0,0


### 2. Adding Leading Zeroes

In [415]:
for index, line in df['Data'].items():
    for word in line.split():
        
        # if single digit exists
        if word.isnumeric() and len(word) == 1:
            formatted = line.replace(word, "0" + word)
            df['Data'] = df['Data'].replace(line, formatted)
        
        # if single digit exists within forward slash
        if len(word.split('/')) > 1:
            for i in word.split('/'):
                if i.isnumeric() and len(i) == 1:
                    formatted = '/'.join([item.zfill(2) for item in (word.split('/'))])
                    df['Data'] = df['Data'].replace(line, formatted)

Unnamed: 0,Code,Raw,Data,Iter,Match,Day,Month,Year
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",12 november 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor Mcalester,0,0,0,0,0
1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",April 05 97 made a phone call to Mom and Mom commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",08/1974,0,0,0,0,0
3,1039963589,"October 7, 01 [report_end]",October 07 01 [report_end],0,0,0,0,0
4,1048901075,"July, 4, 01 Primary Care Doctor:",July 04 01 Primary Care Doctor,0,0,0,0,0


### 3. Convert Month Abbreviations

In [411]:
months = {
    "jan": "january", 
    "feb": "february", 
    "mar": "march", 
    "apr": "april", 
    "jun": "june", 
    "jul": "july", 
    "aug": "august", 
    "sep": "september",
    "oct": "october",
    "nov": "november", 
    "dec": "december"
}

for index, line in df['Data'].items():
    for word in line.split():
          for key, val in months.items(): 
            if word.lower() == key:
                print(line.replace(word, val))
                df['Data'] = df['Data'].replace(line, line.replace(word, val))
                
df.head()

## Helper Functions

In [412]:
def info(n=None, unmatched=True, head=5):
    "Helper function to display results of a match and some rows not matched so far."
    
    n = df.Match.max() if n is None else n
    for nn in ([n,0] if n!=0 and unmatched else [n]):
        display(Markdown("**Numer of rows with iter=%s: %s**" % (nn,len(df[df.Iter==nn]))))
        display(df.loc[df.Iter==nn,["Code","Data","Iter","Match", "Day","Month","Year"]].head(head))
info()

**Numer of rows with iter=0: 715**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
0,1012720972,12 november 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor Mcalester,0,0,0,0,0
1,1039370009,April 05 97 made a phone call to Mom and Mom commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,1039574613,08/1974,0,0,0,0,0
3,1039963589,October 07 01 [report_end],0,0,0,0,0
4,1048901075,July 04 01 Primary Care Doctor,0,0,0,0,0


In [413]:
def verify_dates(df):
    # you TODO (or skip if you don't need it)
    return df

----
## Sequence of Regex's 

Feel free to change what I am doing here. The structure is simple, but adaptable:
 * Construct the regex
     * Pick a row and develop/test against it
 * Apply regex 
     * Set counter __iter__
     * apply to unmatched rows in __df__ to create a second dataframe, __df2__, with columns __Day__,__Month__,__Year__
     * Drop unmatch rows --- identified by NaN.
 * Test matched rows (I'm not giving you this, and you might not need it anyway, depending on your regex)
     * It is possible that the regex matched something that was not a valid date -- you want to remove these rows from __df2__. 
 * Update matched rows in __df__.
     * Save appropriate entry in __Day__, __Month__, __Year__, and __iter__.
     
---
---

### 1 - mm/dd/yyyy

In [367]:
tmp = df.loc[df.Code==2033924723,"Data"]
display(tmp)

# for month, first number can either start with 0 with range from 1-9 OR start with 1 with range from 1-2
regex = r"(?P<Month>0[1-9]|1[1-2]{1,2})/(?P<Day>[\d]{1,2})/(?P<Year>[\d]{2,4})"
tmp.str.extract(regex)

81    11/14/83 Audit C Score Current
Name: Data, dtype: object

Unnamed: 0,Month,Day,Year
81,11,14,83


In [368]:
iter = 1

df2 = df[df.Iter==0].Data.str.extract(regex)
df2.dropna(inplace=True)
display(Markdown(" * Number of row matched = %s" % df2.shape[0]))
display(df2.head())

 * Number of row matched = 103

Unnamed: 0,Month,Day,Year
9,11,3,1985
10,4,19,91
14,7,29,1994
16,6,10,72
18,6,18,85


In [369]:
# save result
df.loc[(df.Iter==0) & (df2.Day.notnull()), "Day"] = df2["Day"]
df.loc[(df.Iter==0) & (df2.Month.notnull()), "Month"] = df2["Month"]
df.loc[(df.Iter==0) & (df2.Year.notnull()), "Year"] = df2["Year"]
df.loc[(df.Iter==0) & (df2.Year.notnull()),"Iter"] = iter

info(iter)

**Numer of rows with iter=1: 103**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
9,1148116416,11/03/1985,1,0,3,11,1985
10,1157934136,04/19/91,1,0,19,4,91
14,1191233809,07/29/1994 CPT code 99203,1,0,29,7,1994
16,1218956332,06/10/72,1,0,10,6,72
18,1220889324,06/18/85,1,0,18,6,85


**Numer of rows with iter=0: 612**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
0,1012720972,12 november 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor Mcalester,0,0,0,0,0
1,1039370009,April 05 97 made a phone call to Mom and Mom commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,1039574613,08/1974,0,0,0,0,0
3,1039963589,October 07 01 [report_end],0,0,0,0,0
4,1048901075,July 04 01 Primary Care Doctor,0,0,0,0,0


### 2 - mm/yyyy

In [370]:
tmp = df.loc[df.Code==1657472256,"Data"]
display(tmp)

# for month, first number can either start with 0 with range from 1-9 OR start with 1 with range from 1-2
regex = r"(?P<Month>0[1-9]|1[1-2]{1,2})/(?P<Year>[\d]{2,4})"
tmp.str.extract(regex)

53    02/1973
Name: Data, dtype: object

Unnamed: 0,Month,Year
53,2,1973


In [371]:
iter = 2

df2 = df[df.Iter==0].Data.str.extract(regex)
df2.dropna(inplace=True)
display(Markdown(" * Number of row matched = %s" % df2.shape[0]))
display(df2.head())

 * Number of row matched = 96

Unnamed: 0,Month,Year
2,8,1974
11,3,2003
13,3,1990
35,2,96
44,5,2006


In [372]:
# save result
df.loc[(df.Iter==0) & (df2.Month.notnull()), "Month"] = df2["Month"]
df.loc[(df.Iter==0) & (df2.Year.notnull()), "Year"] = df2["Year"]
df.loc[(df.Iter==0) & (df2.Year.notnull()),"Iter"] = iter

info(iter)

**Numer of rows with iter=2: 96**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
2,1039574613,08/1974,2,0,0,8,1974
11,1162722894,03/2003,2,0,0,3,2003
13,1174525826,kNotice that in 03/1990 sustained a bizarre injury He was in Colorado City at the time He was driving his car and he says he had recently ran out of Saphris which is an antipsychotic he was taking He says he does not recall all the events but believes he stepped out of his vehicle and then walked off of a bridge sustaining a seven-story fall He was found unconscious He was taken to and treated at Norfolk Health Center in Colorado City where he underwent open reduction internal fixation of the right humerus as well as the left femur was in ICU for a week multiple fx He subsequently recovered from his injuries in the state of South Carolina,2,0,0,3,1990
35,1445984744,10/02/96 Age,2,0,0,2,96
44,1583575271,05/2006,2,0,0,5,2006


**Numer of rows with iter=0: 516**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
0,1012720972,12 november 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor Mcalester,0,0,0,0,0
1,1039370009,April 05 97 made a phone call to Mom and Mom commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
3,1039963589,October 07 01 [report_end],0,0,0,0,0
4,1048901075,July 04 01 Primary Care Doctor,0,0,0,0,0
5,1054311047,and 8mo in 2009,0,0,0,0,0


## Exporting

In [373]:
df.to_csv("results.csv", index=False)
from zipfile import ZipFile
archive = "date_assignment.zip"
print(f"Creating archive: {archive}")
with ZipFile(archive,"w") as zip:
    for f in ["01-Model.ipynb", "results.csv"]:
        if os.path.isfile(f):
            print(f"\t{f} - OK")
            zip.write(f) 
        else:
            print(f"\t{f} - Missing. Check this!")

Creating archive: date_assignment.zip
	01-Model.ipynb - OK
	results.csv - OK
