# Date Mining Assignment

In [13]:
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, Latex
pd.set_option('max_colwidth', 1000)

import re, os
for d in ["src","data"]: os.makedirs(d, exist_ok=True)

## Load Dataset

In [14]:
URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/08-Text_Mining/20-Mining_Dates/files/"

for filename, dest in [("public.csv", "src")]:
    
    source = f"{URL}/{filename}"
    target = f"{dest}/{filename}"

    if not os.path.isfile(target):
        print (f"Downloading remote file {filename} to folder {dest}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}")

Using local copy of public.csv in folder src


In [15]:
df = pd.read_csv("src/public.csv")
df.head()

Unnamed: 0,Code,Raw
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester."
1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it."
2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward."
3,1039963589,"October 7, 01 [report_end]"
4,1048901075,"July, 4, 01 Primary Care Doctor:"


## Outline of Solution Pipeline

 * First perform a cleaning step. This greatly reduces the number of regular expressions needed.
     - you to figure out specifics here
   
 * Define two helper functions to simply application of regular expression and to display which parts of dataset are matched / not matched.
 * Build a sequence of regular expressions (starting from most restrictive to avoid false positives) to apply to dataset.
 * At each match/iteration use examples of unmatched rows to determine next possible regular expression.

---

## Cleaning Step

As a result of this step, the dataframe will have columns

 * __Code__ unchanged
 * __Raw__ unchanged
 * __Data__ cleaned version of __Raw__
 * __Iter__ number of regex that matched this row (for development purposes)
 * __Match__ regex object result for this row (for development purposes, you might not use)
 * __Day__ day (or zero if not set)
 * __Month__ month (or zero if not set)
 * __Year__ year (or zero if not set)

In [22]:
df['Data'] = df["Raw"].replace({'\(':'', '\)':'', '\.':'', '\,':'', '\:':'', '\+':'', '\s+':' ', '-':' '}, regex=True)
df["Iter"] = 0        
df["Match"] = 0     
df["Day"] = 0
df["Month"] = 0
df["Year"] = 0

df.head()

Unnamed: 0,Code,Raw,Data,Iter,Match,Day,Month,Year
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",12 Noember 16 bad reaction to SpiceK2 synthetic MJ admitted to Crete Manor Mcalester,0,0,0,0,0
1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",April 5 97 made a phone call to Mom and Mom commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue s/p partial glossectomy and neck dissection in 8/1974 referred to psycho oncology for assistance with adjustment issues following recovery The patient does not meet criteria for a major mood or anxiety disorder She is not at imminent risk of harm to self or others She would benefit from psychotherapy to help her integrate her experience of cancer and the break up of her engagement and to think through how to continue to create a life for herself moving forward,0,0,0,0,0
3,1039963589,"October 7, 01 [report_end]",October 7 01 [report_end],0,0,0,0,0
4,1048901075,"July, 4, 01 Primary Care Doctor:",July 4 01 Primary Care Doctor,0,0,0,0,0


In [28]:
from fuzzywuzzy import fuzz

months = ["january", "february", "march", "april", "june", "july", "august", "sepetember", "october", "november", "december"]

for line in df['Data'].head(2):
    for word in line.split():
        for month in months:
            ratio = fuzz.ratio(word.lower(), month)
            if ratio > 90:
                print("\nMatch between: {0} - {1}, {2}%".format(word, month, ratio))

COMPARING 12 with january
COMPARING 12 with february
COMPARING 12 with march
COMPARING 12 with april
COMPARING 12 with june
COMPARING 12 with july
COMPARING 12 with august
COMPARING 12 with sepetember
COMPARING 12 with october
COMPARING 12 with november
COMPARING 12 with december
COMPARING Noember with january
COMPARING Noember with february
COMPARING Noember with march
COMPARING Noember with april
COMPARING Noember with june
COMPARING Noember with july
COMPARING Noember with august
COMPARING Noember with sepetember
COMPARING Noember with october
COMPARING Noember with november
RATIO Noember november
COMPARING Noember with december
COMPARING 16 with january
COMPARING 16 with february
COMPARING 16 with march
COMPARING 16 with april
COMPARING 16 with june
COMPARING 16 with july
COMPARING 16 with august
COMPARING 16 with sepetember
COMPARING 16 with october
COMPARING 16 with november
COMPARING 16 with december
COMPARING bad with january
COMPARING bad with february
COMPARING bad with march

## Helper Functions

In [7]:
def info(n=None, unmatched=True, head=5):
    "Helper function to display results of a match and some rows not matched so far."
    
    n = df.Match.max() if n is None else n
    for nn in ([n,0] if n!=0 and unmatched else [n]):
        display(Markdown("**Numer of rows with iter=%s: %s**" % (nn,len(df[df.Iter==nn]))))
        display(df.loc[df.Iter==nn,["Code","Data","Iter","Match", "Day","Month","Year"]].head(head))
info()

**Numer of rows with iter=0: 715**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
0,1012720972,12 Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor Mcalester,0,0,0,0,0
1,1039370009,April 5 97 made a phone call to Mom and Mom commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,1039574613,A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue s/p partial glossectomy and neck dissection in 8/1974 referred to psycho-oncology for assistance with adjustment issues following recovery The patient does not meet criteria for a major mood or anxiety disorder She is not at imminent risk of harm to self or others She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement and to think through how to continue to create a life for herself moving forward,0,0,0,0,0
3,1039963589,October 7 01 [report_end],0,0,0,0,0
4,1048901075,July 4 01 Primary Care Doctor,0,0,0,0,0


In [None]:
def verify_dates(df):
    # you TODO (or skip if you don't need it)
    return df

----
## Sequence of Regex's 

Feel free to change what I am doing here. The structure is simple, but adaptable:
 * Construct the regex
     * Pick a row and develop/test against it
 * Apply regex 
     * Set counter __iter__
     * apply to unmatched rows in __df__ to create a second dataframe, __df2__, with columns __Day__,__Month__,__Year__
     * Drop unmatch rows --- identified by NaN.
 * Test matched rows (I'm not giving you this, and you might not need it anyway, depending on your regex)
     * It is possible that the regex matched something that was not a valid date -- you want to remove these rows from __df2__. 
 * Update matched rows in __df__.
     * Save appropriate entry in __Day__, __Month__, __Year__, and __iter__.
     
---
---

### 1 - mm/dd/yyyy 

In [None]:
# Construct the regex
tmp = df.loc[df.Code==10430271,"Data"]
display(tmp)

regex = r"([\d]{1,2})/([\d]{1,2})/(\d{2,4})"
#tmp.str.extract(regex)

In [None]:
# apply regex

iter = 1

df2 = df[df.Iter==0].Data.str.extract(regex)
df2.dropna(inplace=True)
display(Markdown(" * Number of row matched = %s" % df2.shape[0]))
display(df2.head())

In [None]:
# test result
df2 = verify_dates(df2)

In [None]:
# save result

df.loc[(df.Iter==0) & (df2.Year), "Day"] = df2["Day"]
df.loc[(df.Iter==0) & (df2.Year), "Month"] = df2["Month"]
df.loc[(df.Iter==0) & (df2.Year), "Year"] = df2["Year"]

df.loc[(df.Iter==0) & (df2.Year),"Iter"] = iter
info(iter)