# Data Mining Assignment 2

### 1.1 Load Dataset

In [1]:
# Standard library
import os
import re

# Third-party
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, Latex

# Allow up to 1000 characters per column so the long free-text notes are not truncated on display.
pd.set_option('max_colwidth', 1000)

# Working folders used by the notebook; existing folders are left as they are.
for folder in ("src", "data", "output"):
    os.makedirs(folder, exist_ok=True)

In [2]:
import urllib.request

URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/08-Text_Mining/20-Mining_Dates/files/"

# Fetch each (filename, destination folder) pair unless a local copy already exists.
for filename, dest in [("public.csv", "src")]:

    # URL already ends with "/"; strip it so the joined address has a single separator.
    source = f"{URL.rstrip('/')}/{filename}"
    target = f"{dest}/{filename}"

    if os.path.isfile(target):
        print(f"Using local copy of {filename} in folder {dest}")
        continue

    print(f"Downloading remote file {filename} to folder {dest}")
    # Download to a side file and rename only on success, so an interrupted
    # transfer is never mistaken for a complete local copy on the next run.
    partial = f"{target}.part"
    urllib.request.urlretrieve(source, partial)
    os.replace(partial, target)

Using local copy of public.csv in folder src


In [3]:
# Read the CSV from the src folder into the working frame `df`, then preview its first rows.
df = pd.read_csv("src/public.csv")
df.head()

Unnamed: 0,Code,Raw
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester."
1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it."
2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward."
3,1039963589,"October 7, 01 [report_end]"
4,1048901075,"July, 4, 01 Primary Care Doctor:"


### 1.2 Cleaning Step

In [4]:
# Build the cleaned text column: drop ( ) . , : + and collapse runs of whitespace to one space.
# Raw strings keep the regex escapes (\(, \., \s, ...) away from Python's own escape processing,
# which otherwise reports them as invalid escape sequences.
df["Data"] = df["Raw"].replace(
    {r'\(': '', r'\)': '', r'\.': '', r'\,': '', r'\:': '', r'\+': '', r'\s+': ' '},
    regex=True,
)

# Bookkeeping columns, all initialised to 0. Iter == 0 marks a row that no regex pass has
# matched yet; Day / Month / Year are filled in by the "Save Result" cells.
for column in ("Iter", "Match", "Day", "Month", "Year"):
    df[column] = 0
df.head()

Unnamed: 0,Code,Raw,Data,Iter,Match,Day,Month,Year
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",12 Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor Mcalester,0,0,0,0,0
1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",April 5 97 made a phone call to Mom and Mom commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue s/p partial glossectomy and neck dissection in 8/1974 referred to psycho-oncology for assistance with adjustment issues following recovery The patient does not meet criteria for a major mood or anxiety disorder She is not at imminent risk of harm to self or others She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement and to think through how to continue to create a life for herself moving forward,0,0,0,0,0
3,1039963589,"October 7, 01 [report_end]",October 7 01 [report_end],0,0,0,0,0
4,1048901075,"July, 4, 01 Primary Care Doctor:",July 4 01 Primary Care Doctor,0,0,0,0,0


### 1.3 Helper Functions

In [5]:
def info(n=None, unmatched=True, head=5):
    """Display the rows assigned to regex pass ``n`` and, optionally, the rows still unmatched.

    Reads the notebook-level frame ``df``. For each pass shown, the selected rows are also
    written to ``output/current.csv``.

    Parameters
    ----------
    n : int or None
        Value of ``df.Iter`` to report. When None, the largest value in ``df.Iter`` is used.
    unmatched : bool
        When True and ``n`` is not 0, the rows with ``Iter == 0`` are shown after pass ``n``.
    head : int
        Number of rows displayed per pass.
    """
    print("Helper function to display results of a match and some rows not matched so far.")

    columns = ["Code", "Data", "Iter", "Match", "Day", "Month", "Year"]

    # Rows are selected by Iter below, so the default pass must come from Iter as well.
    n = df.Iter.max() if n is None else n

    passes = [n, 0] if n != 0 and unmatched else [n]
    for nn in passes:
        rows = df.loc[df.Iter == nn, columns]
        display(Markdown("**Number of rows with iter=%s: %s**" % (nn, len(rows))))
        display(rows.head(head))
        # The same file is rewritten for every pass shown, so it ends up holding the last one.
        rows.to_csv(r"output/current.csv", index=False)
        


In [6]:
def verify_dates(df):
    """Placeholder for checking extracted date parts; currently returns ``df`` unchanged.

    Parameters
    ----------
    df :
        Object to check. The notebook passes the frame of extracted regex groups.

    Returns
    -------
    The same object that was passed in, unmodified.
    """
    # TODO
    return df

### 1.4 Sequence of Regexes

#### 1.4.1 mm/dd/yyyy

In [7]:
# Construct Regex
# Try the pattern on one known record before applying it to the whole frame.
sample = df.loc[df.Code == 1148116416, "Data"]
display(sample)

# mm/dd/yyyy: 1-2 digit month, 1-2 digit day and 2-4 digit year separated by "/".
regex = r"(?P<Month>[\d]{1,2})/(?P<Day>[\d]{1,2})/(?P<Year>[\d]{2,4})"
sample.str.extract(regex)

9    24 yo right handed woman with history of large right frontal mass s/p resection 11/3/1985 who had recent urgent R cranial wound revision and placement of L EVD for declining vision and increased drainage from craniotomy incision site and possible infection She has a hx of secondary mania related to psychosis and manipulation of her right frontal lobe
Name: Data, dtype: object

Unnamed: 0,Month,Day,Year
9,11,3,1985


In [8]:
# Apply Regex
iter = 1  # pass number written to df.Iter for the rows matched here

# Search only rows no earlier pass has claimed (Iter == 0) and keep rows where every group matched.
df2 = (
    df.loc[df.Iter == 0, "Data"]
    .str.extract(regex)
    .dropna()
)
display(Markdown(" * Number of row matched = %s" % len(df2)))
display(df2.head())

 * Number of row matched = 120

Unnamed: 0,Month,Day,Year
9,11,3,1985
10,4,19,91
14,7,29,1994
16,6,10,72
18,6,18,85


0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: Year, dtype: int64

In [506]:
# Test Result
# verify_dates() is still a stub (its body is a TODO), so df2 comes back unchanged.
df2 = verify_dates(df2)

In [507]:
# Save Result
# df2 holds only the rows where the regex matched, so its index names the rows to update.
# Selecting by index avoids AND-ing a boolean mask with a column of strings.
# Restricting to rows still unmatched (Iter == 0) keeps earlier passes from being overwritten.
matched = df2.index.intersection(df.index[df.Iter == 0])

df.loc[matched, "Day"] = df2.loc[matched, "Day"]
df.loc[matched, "Month"] = df2.loc[matched, "Month"]
df.loc[matched, "Year"] = df2.loc[matched, "Year"]

# Tag the rows with this pass number so later passes skip them.
df.loc[matched, "Iter"] = iter
info(iter)

Helper function to display results of a match and some rows not matched so far.


**Number of rows with iter=1: 120**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
4,1148116416,24 yo right handed woman with history of large right frontal mass s/p resection 11/3/1985 who had recent urgent R cranial wound revision and placement of L EVD for declining vision and increased drainage from craniotomy incision site and possible infection She has a hx of secondary mania related to psychosis and manipulation of her right frontal lobe,1,0,3,11,1985
5,1157934136,4/19/91 Communication with referring physician? Not Done,1,0,19,4,91
8,1191233809,07/29/1994 CPT code 99203,1,0,29,7,1994
10,1218956332,6/10/72 SOS-10 Total Score,1,0,10,6,72
11,1220889324,6/18/85 Primary Care Doctor,1,0,18,6,85


**Number of rows with iter=0: 347**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
0,1039574613,A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue s/p partial glossectomy and neck dissection in 8/1974 referred to psycho-oncology for assistance with adjustment issues following recovery The patient does not meet criteria for a major mood or anxiety disorder She is not at imminent risk of harm to self or others She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement and to think through how to continue to create a life for herself moving forward,0,0,0,0,0
1,1054311047,and 8mo in 2009,0,0,0,0,0
2,1054668034,HTN hypercholesterolemia DM sleep apnea nephrolithiasis chronic renal impairment DVT since July 1977 on enoxaparin,0,0,0,0,0
3,1125769793,Since 10/2014 Fatigued more forgetful impaired dexterity on her left hand MRI reveals an approximately 42cm x 33cm x 25cm right parietal enhancing mass with surrounding edema,0,0,0,0,0
6,1162722894,s6 past psychiatric hospitalizations starting at age 16 Last 3/2003 for SIB/SI WWL x 2 Getwell Hospital Lincoln Hospital cOX nORTH Lotus ClinicHx of Outpatient Treatment Yes,0,0,0,0,0


#### 1.4.2 mm/yyyy

In [508]:
# Construct Regex
# Try the pattern on one known record before applying it to the whole frame.
sample = df.loc[df.Code == 1125769793, "Data"]
display(sample)

# mm/yyyy: 1-2 digit month and 2-4 digit year separated by "/"; there is no day group.
regex = r"(?P<Month>[\d]{1,2})/(?P<Year>[\d]{2,4})"
sample.str.extract(regex)

3    Since 10/2014 Fatigued more forgetful impaired dexterity on her left hand MRI reveals an approximately 42cm x 33cm x 25cm right parietal enhancing mass with surrounding edema
Name: Data, dtype: object

Unnamed: 0,Month,Year
3,10,2014


In [509]:
# Apply Regex
iter = 2  # pass number written to df.Iter for the rows matched here

# Search only rows no earlier pass has claimed (Iter == 0) and keep rows where every group matched.
df2 = (
    df.loc[df.Iter == 0, "Data"]
    .str.extract(regex)
    .dropna()
)
display(Markdown(" * Number of row matched = %s" % len(df2)))
display(df2.head())

 * Number of row matched = 106

Unnamed: 0,Month,Year
0,8,1974
3,10,2014
6,3,2003
7,3,1990
28,5,2006


In [510]:
# Save Result
# NOTE(review): unlike section 1.4.1, verify_dates() is not called before saving here.
# df2 holds only the rows where the regex matched, so its index names the rows to update.
# Selecting by index avoids AND-ing a boolean mask with a column of strings.
# Restricting to rows still unmatched (Iter == 0) keeps earlier passes from being overwritten.
matched = df2.index.intersection(df.index[df.Iter == 0])

# This format carries no day, so the first of the month is recorded.
df.loc[matched, "Day"] = 1
df.loc[matched, "Month"] = df2.loc[matched, "Month"]
df.loc[matched, "Year"] = df2.loc[matched, "Year"]

# Tag the rows with this pass number so later passes skip them.
df.loc[matched, "Iter"] = iter
info(iter)

Helper function to display results of a match and some rows not matched so far.


**Number of rows with iter=2: 106**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
0,1039574613,A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue s/p partial glossectomy and neck dissection in 8/1974 referred to psycho-oncology for assistance with adjustment issues following recovery The patient does not meet criteria for a major mood or anxiety disorder She is not at imminent risk of harm to self or others She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement and to think through how to continue to create a life for herself moving forward,2,0,1,8,1974
3,1125769793,Since 10/2014 Fatigued more forgetful impaired dexterity on her left hand MRI reveals an approximately 42cm x 33cm x 25cm right parietal enhancing mass with surrounding edema,2,0,1,10,2014
6,1162722894,s6 past psychiatric hospitalizations starting at age 16 Last 3/2003 for SIB/SI WWL x 2 Getwell Hospital Lincoln Hospital cOX nORTH Lotus ClinicHx of Outpatient Treatment Yes,2,0,1,3,2003
7,1174525826,kNotice that in 03/1990 sustained a bizarre injury He was in Colorado City at the time He was driving his car and he says he had recently ran out of Saphris which is an antipsychotic he was taking He says he does not recall all the events but believes he stepped out of his vehicle and then walked off of a bridge sustaining a seven-story fall He was found unconscious He was taken to and treated at Norfolk Health Center in Colorado City where he underwent open reduction internal fixation of the right humerus as well as the left femur was in ICU for a week multiple fx He subsequently recovered from his injuries in the state of South Carolina,2,0,1,3,1990
28,1583575271,a Endometriosis dx on laparoscopy 5/2006,2,0,1,5,2006


**Number of rows with iter=0: 241**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
1,1054311047,and 8mo in 2009,0,0,0,0,0
2,1054668034,HTN hypercholesterolemia DM sleep apnea nephrolithiasis chronic renal impairment DVT since July 1977 on enoxaparin,0,0,0,0,0
9,1215203598,14 Jan 1981 SOS-10 Total Score,0,0,0,0,0
14,1271563547,n Abilify added to Lexapro Wellbutrin in Jan 2007,0,0,0,0,0
15,1280100040,11 Nov 2004 Total time of visit in minutes,0,0,0,0,0


#### 1.4.3 dd Month yyyy

In [511]:
# Construct Regex
# Try the pattern on one known record before applying it to the whole frame.
sample = df.loc[df.Code == 1215203598, "Data"]
display(sample)

# dd Month yyyy: 1-2 digit day, a run of letters captured as the month, and a 2-4 digit year,
# separated by single spaces. The month group accepts any letters, not only month names.
regex = r"(?P<Day>[\d]{1,2}) (?P<Month>[a-zA-Z]+) (?P<Year>[\d]{2,4})"
sample.str.extract(regex)

9    14 Jan 1981 SOS-10 Total Score
Name: Data, dtype: object

Unnamed: 0,Day,Month,Year
9,14,Jan,1981


In [512]:
# Apply Regex
iter = 3  # pass number written to df.Iter for the rows matched here

# Search only rows no earlier pass has claimed (Iter == 0) and keep rows where every group matched.
df2 = (
    df.loc[df.Iter == 0, "Data"]
    .str.extract(regex)
    .dropna()
)
display(Markdown(" * Number of row matched = %s" % len(df2)))
display(df2.head())

 * Number of row matched = 72

Unnamed: 0,Day,Month,Year
9,14,Jan,1981
15,11,Nov,2004
17,14,Oct,1996
19,22,Jan,1996
27,30,Nov,1972


In [513]:
# Save Result
# NOTE(review): unlike section 1.4.1, verify_dates() is not called before saving here.
# df2 holds only the rows where the regex matched, so its index names the rows to update.
# Selecting by index avoids AND-ing a boolean mask with a column of strings.
# Restricting to rows still unmatched (Iter == 0) keeps earlier passes from being overwritten.
matched = df2.index.intersection(df.index[df.Iter == 0])

df.loc[matched, "Day"] = df2.loc[matched, "Day"]
df.loc[matched, "Month"] = df2.loc[matched, "Month"]
df.loc[matched, "Year"] = df2.loc[matched, "Year"]

# Tag the rows with this pass number so later passes skip them.
df.loc[matched, "Iter"] = iter
info(iter)

Helper function to display results of a match and some rows not matched so far.


**Number of rows with iter=3: 72**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
9,1215203598,14 Jan 1981 SOS-10 Total Score,3,0,14,Jan,1981
15,1280100040,11 Nov 2004 Total time of visit in minutes,3,0,11,Nov,2004
17,1343905547,14 Oct 1996 SOS-10 Total Score,3,0,14,Oct,1996
19,1359228152,22 Jan 1996 @ 11 AMCommunication with referring physician? Done,3,0,22,Jan,1996
27,1575273975,30 Nov 1972 SOS-10 Total Score,3,0,30,Nov,1972


**Number of rows with iter=0: 169**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
1,1054311047,and 8mo in 2009,0,0,0,0,0
2,1054668034,HTN hypercholesterolemia DM sleep apnea nephrolithiasis chronic renal impairment DVT since July 1977 on enoxaparin,0,0,0,0,0
14,1271563547,n Abilify added to Lexapro Wellbutrin in Jan 2007,0,0,0,0,0
16,1304079161,Pt joined Army reserves in 2001 and has 3 years left in this commitment-Mental Status Exam Was the exam performed? If not indicate reason Yes,0,0,0,0,0
18,1348968961,sgoiter--diagnosed in September 1981 Pt feels thyroid problems are related to h/o lithium use,0,0,0,0,0


#### 1.4.4 Month yyyy

In [514]:
# Construct Regex
# Try the pattern on one known record before applying it to the whole frame.
sample = df.loc[df.Code == 1271563547, "Data"]
display(sample)

# Month yyyy: a three-letter month abbreviation (captured), any trailing lowercase letters
# (not captured), one space, then a 4-digit year. There is no day group.
regex = r"(?P<Month>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?P<Year>[\d]{4})"
sample.str.extract(regex)

14    n Abilify added to Lexapro Wellbutrin in Jan 2007
Name: Data, dtype: object

Unnamed: 0,Month,Year
14,Jan,2007


In [515]:
# Apply Regex
iter = 4  # pass number written to df.Iter for the rows matched here

# Search only rows no earlier pass has claimed (Iter == 0) and keep rows where every group matched.
df2 = (
    df.loc[df.Iter == 0, "Data"]
    .str.extract(regex)
    .dropna()
)
display(Markdown(" * Number of row matched = %s" % len(df2)))
display(df2.head())

 * Number of row matched = 97

Unnamed: 0,Month,Year
2,Jul,1977
14,Jan,2007
18,Sep,1981
24,Sep,1983
26,Sep,1985


In [516]:
# Save Result
# NOTE(review): unlike section 1.4.1, verify_dates() is not called before saving here.
# df2 holds only the rows where the regex matched, so its index names the rows to update.
# Selecting by index avoids AND-ing a boolean mask with a column of strings.
# Restricting to rows still unmatched (Iter == 0) keeps earlier passes from being overwritten.
matched = df2.index.intersection(df.index[df.Iter == 0])

# This format carries no day, so the first of the month is recorded.
df.loc[matched, "Day"] = 1
df.loc[matched, "Month"] = df2.loc[matched, "Month"]
df.loc[matched, "Year"] = df2.loc[matched, "Year"]

# Tag the rows with this pass number so later passes skip them.
df.loc[matched, "Iter"] = iter
info(iter)

Helper function to display results of a match and some rows not matched so far.


**Number of rows with iter=4: 97**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
2,1054668034,HTN hypercholesterolemia DM sleep apnea nephrolithiasis chronic renal impairment DVT since July 1977 on enoxaparin,4,0,1,Jul,1977
14,1271563547,n Abilify added to Lexapro Wellbutrin in Jan 2007,4,0,1,Jan,2007
18,1348968961,sgoiter--diagnosed in September 1981 Pt feels thyroid problems are related to h/o lithium use,4,0,1,Sep,1981
24,1451016026,sSep 1983 GSW to face L-TMJ region ? gang related with L-CN VII injury and ? TBI requiring plastic surgical reconstructionActivities of Daily Living ADL Bathing Independent,4,0,1,Sep,1983
26,1524444733,s 20 yo M carries dx of BPAD presents for psychopharm consult Moved to Independence area for school as of September 1985,4,0,1,Sep,1985


**Number of rows with iter=0: 72**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
1,1054311047,and 8mo in 2009,0,0,0,0,0
16,1304079161,Pt joined Army reserves in 2001 and has 3 years left in this commitment-Mental Status Exam Was the exam performed? If not indicate reason Yes,0,0,0,0,0
20,1375836275,1 Ex-smoker quit 2012,0,0,0,0,0
22,1388842679,shx of TBI 1975 ISO MVAMedical History,0,0,0,0,0
25,1499650595,September 15 2011 Total time of visit in minutes,0,0,0,0,0


#### 1.4.5 Month dd yyyy

In [524]:
# Construct Regex
# Try the pattern on one known record before applying it to the whole frame.
sample = df.loc[df.Code == 7783575950, "Data"]
display(sample)

# Month dd yyyy: a three-letter month abbreviation (captured), any trailing lowercase letters
# (not captured), then a 1-2 digit day and a 4-digit year, separated by single spaces.
regex = r"(?P<Month>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?P<Day>[\d]{1,2}) (?P<Year>[\d]{4})"
sample.str.extract(regex)

354    Nov 11 1988 Total time of visit in minutes
Name: Data, dtype: object

Unnamed: 0,Month,Day,Year
354,Nov,11,1988


In [525]:
# Apply Regex
iter = 5  # pass number written to df.Iter for the rows matched here

# Search only rows no earlier pass has claimed (Iter == 0) and keep rows where every group matched.
df2 = (
    df.loc[df.Iter == 0, "Data"]
    .str.extract(regex)
    .dropna()
)
display(Markdown(" * Number of row matched = %s" % len(df2)))
display(df2.head())

 * Number of row matched = 30

Unnamed: 0,Month,Day,Year
25,Sep,15,2011
54,Jan,27,1983
75,May,15,1989
76,Feb,18,1981
105,Oct,11,2013


In [526]:
# Save Result
# NOTE(review): unlike section 1.4.1, verify_dates() is not called before saving here.
# df2 holds only the rows where the regex matched, so its index names the rows to update.
# Selecting by index avoids AND-ing a boolean mask with a column of strings.
# Restricting to rows still unmatched (Iter == 0) keeps earlier passes from being overwritten.
matched = df2.index.intersection(df.index[df.Iter == 0])

df.loc[matched, "Day"] = df2.loc[matched, "Day"]
df.loc[matched, "Month"] = df2.loc[matched, "Month"]
df.loc[matched, "Year"] = df2.loc[matched, "Year"]

# Tag the rows with this pass number so later passes skip them.
df.loc[matched, "Iter"] = iter
info(iter)

Helper function to display results of a match and some rows not matched so far.


**Number of rows with iter=5: 30**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
25,1499650595,September 15 2011 Total time of visit in minutes,5,0,15,Sep,2011
54,2027332027,Got back to US Jan 27 1983,5,0,27,Jan,1983
75,2371167874,May 15 1989 SOS-10 Total Score,5,0,15,May,1989
76,2414892617,Brother died February 18 1981 Parental/Caregiver obligations,5,0,18,Feb,1981
105,3039453427,none; but currently has appt with new HJH PCP Rachel Salas MD on October 11 2013 Other Agency Involvement No,5,0,11,Oct,2013


**Number of rows with iter=0: 42**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
1,1054311047,and 8mo in 2009,0,0,0,0,0
16,1304079161,Pt joined Army reserves in 2001 and has 3 years left in this commitment-Mental Status Exam Was the exam performed? If not indicate reason Yes,0,0,0,0,0
20,1375836275,1 Ex-smoker quit 2012,0,0,0,0,0
22,1388842679,shx of TBI 1975 ISO MVAMedical History,0,0,0,0,0
46,1858445338,1 Esophageal cancer dx 2013 on FOLFOX with oxaliplatin desensitization,0,0,0,0,0


#### 1.4.6 mm-dd-yyyy

In [532]:
# Construct Regex
# Try the pattern on one known record before applying it to the whole frame.
sample = df.loc[df.Code == 3173136043, "Data"]
display(sample)

# mm-dd-yyyy: 1-2 digit month, 1-2 digit day and 2-4 digit year separated by "-".
regex = r"(?P<Month>[\d]{1,2})-(?P<Day>[\d]{1,2})-(?P<Year>[\d]{2,4})"
sample.str.extract(regex)

115    7-29-75 CPT Code 90801 - Psychiatric Diagnosis Interview
Name: Data, dtype: object

Unnamed: 0,Month,Day,Year
115,7,29,75


In [533]:
# Apply Regex
iter = 6  # pass number written to df.Iter for the rows matched here

# Search only rows no earlier pass has claimed (Iter == 0) and keep rows where every group matched.
df2 = (
    df.loc[df.Iter == 0, "Data"]
    .str.extract(regex)
    .dropna()
)
display(Markdown(" * Number of row matched = %s" % len(df2)))
display(df2.head())

 * Number of row matched = 4

Unnamed: 0,Month,Day,Year
115,7,29,75
202,4,13,82
235,1,14,81
454,4,13,89


In [534]:
# Save Result
# NOTE(review): unlike section 1.4.1, verify_dates() is not called before saving here.
# df2 holds only the rows where the regex matched, so its index names the rows to update.
# Selecting by index avoids AND-ing a boolean mask with a column of strings.
# Restricting to rows still unmatched (Iter == 0) keeps earlier passes from being overwritten.
matched = df2.index.intersection(df.index[df.Iter == 0])

df.loc[matched, "Day"] = df2.loc[matched, "Day"]
df.loc[matched, "Month"] = df2.loc[matched, "Month"]
df.loc[matched, "Year"] = df2.loc[matched, "Year"]

# Tag the rows with this pass number so later passes skip them.
df.loc[matched, "Iter"] = iter
info(iter)

Helper function to display results of a match and some rows not matched so far.


**Number of rows with iter=6: 4**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
115,3173136043,7-29-75 CPT Code 90801 - Psychiatric Diagnosis Interview,6,0,29,7,75
202,4895905688,4-13-82 Other Child Mental Health Outcomes Scales Used,6,0,13,4,82
235,5561299129,1-14-81 Communication with referring physician? Done,6,0,14,1,81
454,9752673858,4-13-89 Communication with referring physician? Not Done,6,0,13,4,89


**Number of rows with iter=0: 38**

Unnamed: 0,Code,Data,Iter,Match,Day,Month,Year
1,1054311047,and 8mo in 2009,0,0,0,0,0
16,1304079161,Pt joined Army reserves in 2001 and has 3 years left in this commitment-Mental Status Exam Was the exam performed? If not indicate reason Yes,0,0,0,0,0
20,1375836275,1 Ex-smoker quit 2012,0,0,0,0,0
22,1388842679,shx of TBI 1975 ISO MVAMedical History,0,0,0,0,0
46,1858445338,1 Esophageal cancer dx 2013 on FOLFOX with oxaliplatin desensitization,0,0,0,0,0


#### 1.4.7 yyyy