# Criminal and Civil Enforcement Data cleaning (Stage 1 - Reformatting)

Objectives:
- Move geographical information into the correct column (`geographical_subdivision`)
- Add a new column recording the source year of each scrape, so it can be checked against the date of the entry on the website
- Reformat the date where necessary

- For the early years: extract the geographical location from the other fields, as this is relatively straightforward


Next Stage includes:
- Identifying location and amount of money involved using `geograpy` and `money-parser`

In [1]:
import pandas
import numpy as np

In [2]:
# Initialize the (still empty) dataframe that will collect the output.
# The notebook imports the library as `pandas` (there is no `pd` alias),
# so the constructor has to be reached through that name.
df_output = pandas.DataFrame()

# Data cleaning for 2019 onwards (can be executed directly)

In [3]:
# Work out the calendar year according to the local clock.
import datetime
current_year = datetime.date.today().year

In [4]:
# Clean every yearly scrape from 2019 up to and including the current year.
# NOTE(review): unlike the cells for earlier years, nothing in this loop adds a
# 'Source' column or exports the result -- confirm whether that is intended.
for year in range(2019, current_year + 1):
    filename = 'OIG_HHS_Scrape_' + str(year)
    df = pandas.read_csv('Raw/' + filename + '_raw.csv')

    # Rename columns for further command processing: trimmed, lower case,
    # underscores instead of spaces, no parentheses. regex=False makes the
    # literal replacement explicit, independent of the installed pandas version.
    df.columns = (
        df.columns.str.strip().str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

    # Flag records with a missing geographical subdivision whose authority is
    # not the U.S. Department of Justice.
    df['Flag'] = np.where(
        df['geographical_subdivision'].isnull()
        & (df['authority'] != 'U.S. Department of Justice'),
        1, 0)

    # Keep an independent snapshot for later reference. A plain `df2 = df`
    # would only alias the frame, so the deletions below would hit it too.
    df2 = df.copy()

    # Correct the two malformed 2019 dates, then parse the column.
    if year == 2019:
        df['date'] = df['date'].str.replace('June 11 2019', 'June 11, 2019', regex=False)
        df['date'] = df['date'].str.replace('July 5,2019', 'July 5, 2019', regex=False)
        df['date'] = pandas.to_datetime(df['date'])

    # Drop the temporary Flag and the leftover index column from the raw CSV.
    del df['Flag']
    del df['unnamed:_0']
    # The library is imported as `pandas`; `pd` is not defined in this notebook.
    df_info = pandas.DataFrame()

##  2018 Data cleaning

In [5]:
# Load the raw 2018 scrape. The directory is spelled 'Raw/' (capital R) as in
# most other cells of this notebook; 'raw/' only resolves on case-insensitive
# file systems.
filename = 'OIG_HHS_Scrape_2018'
df = pandas.read_csv('Raw/' + filename + '.csv')

In [6]:
#2018 Data cleaning

# Rename columns for further command processing (trimmed, lower case,
# underscores, no parentheses). regex=False states explicitly that the
# patterns are literal text, whatever the pandas version's default is.
df.columns = (
    df.columns.str.strip().str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Flag records with a missing geographical subdivision whose authority is not
# the U.S. Department of Justice. The flag is computed BEFORE the fixes below,
# so the display at the end of the cell shows the rows that were repaired.
df['Flag'] = np.where(
    df['geographical_subdivision'].isnull()
    & (df['authority'] != 'U.S. Department of Justice'),
    1, 0)

# Replace each record individually to keep the format consistent.
# (1) Authorities that are really subdivisions: copy them across.
list_copied = ['Western District of Virginia', 'District of Puerto Rico']
df['geographical_subdivision'] = np.where(
    df['geographical_subdivision'].isnull() & df['authority'].isin(list_copied),
    df['authority'], df['geographical_subdivision'])

# (2) State attorney general: use the state name.
df['geographical_subdivision'] = np.where(
    df['geographical_subdivision'].isnull()
    & (df['authority'] == 'Louisiana Attorney General'),
    'Louisiana', df['geographical_subdivision'])

# (3) Authority and subdivision fused into one string: split them.
df['geographical_subdivision'] = np.where(
    df['geographical_subdivision'].isnull()
    & (df['authority'] == 'U.S. Attorney Northern District of New York'),
    'Northern District of New York', df['geographical_subdivision'])
df['authority'] = np.where(
    df['authority'] == 'U.S. Attorney Northern District of New York',
    'U.S. Attorney', df['authority'])

df[df['Flag']==1]
    

Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,medicare_fraud_strike_force_case,heading,description,hyperlink,Flag
52,52,Criminal and Civil Enforcement,9-Oct-18,Western District of Virginia,Western District of Virginia,0,Psychiatrist Pleads Guilty to Healthcare Fraud...,"Abingdon, VIRGINIA - A former psychiatrist who...",https://www.justice.gov/usao-wdva/pr/psychiatr...,1
184,184,Criminal and Civil Enforcement,18-May-18,U.S. Attorney,Northern District of New York,0,Kinderhook Podiatrist Sentenced for Health Car...,"ALBANY, NEW YORK - Perrin D. Edwards, D.P.M., ...",https://www.justice.gov/usao-ndny/pr/kinderhoo...,1
278,278,Criminal and Civil Enforcement,22-Feb-18,District of Puerto Rico,District of Puerto Rico,0,Mi Salud Program Technician Sentenced To Four ...,"SAN JUAN, P.R. - On February 22, 2018, defenda...",https://www.justice.gov/usao-pr/pr/mi-salud-pr...,1
280,280,Criminal and Civil Enforcement,21-Feb-18,Louisiana Attorney General,Louisiana,0,Two Louisiana Women Arrested for Medicaid Welf...,"BATON ROUGE, LA - Louisiana Attorney General J...",http://www.ag.state.la.us/Article/6537/5,1


In [7]:
# Record which yearly collection these rows were scraped from.
df = df.assign(Source='2017/2018')

In [8]:
# Export to CSV
# Remove the temporary Flag and the leftover 'unnamed:_0' column first.
for helper_col in ('Flag', 'unnamed:_0'):
    df.pop(helper_col)
output_path = 'Cleaned/Intermediate/' + filename + '_cleaned.csv'
df.to_csv(output_path)


# 2017 Data cleaning

In [9]:
# Load the raw 2017 scrape. 'Raw/' (capital R) matches the spelling used by
# most other cells; 'raw/' only resolves on case-insensitive file systems.
filename = 'OIG_HHS_Scrape_2017'
df = pandas.read_csv('Raw/' + filename + '.csv')


# Rename columns for further command processing; regex=False makes the literal
# replacement explicit regardless of the pandas version's default.
df.columns = (
    df.columns.str.strip().str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)


# Flag records with a missing geographical subdivision whose authority is not
# the U.S. Department of Justice.
df['Flag'] = np.where(
    df['geographical_subdivision'].isnull()
    & (df['authority'] != 'U.S. Department of Justice'),
    1, 0)

# Authorities that are really subdivisions: copy them into the subdivision
# column (only where it is empty), then blank the authority for every row
# carrying one of these values.
list_copied = ['Middle District of Florida', 'District of Idaho']
df['geographical_subdivision'] = np.where(
    df['geographical_subdivision'].isnull() & df['authority'].isin(list_copied),
    df['authority'], df['geographical_subdivision'])
df['authority'] = np.where(df['authority'].isin(list_copied), '', df['authority'])

# State attorney general: use the state name as subdivision.
df['geographical_subdivision'] = np.where(
    df['geographical_subdivision'].isnull()
    & (df['authority'] == 'New York Attorney General'),
    'New York', df['geographical_subdivision'])

In [10]:
# Record the source year so it can be checked against each entry's date.
df = df.assign(Source=2017)

In [11]:
# Export the cleaned frame to CSV using the notebook's self-written helper.
# NOTE(review): `exportcsv_cleaned` is not defined in the part of the notebook
# visible here -- confirm the cell that defines it runs before this one.
# The recorded output of this cell reports Cleaned/OIG_HHS_Scrape_2017_cleaned.csv.
exportcsv_cleaned(df, filename)

Exported file: Cleaned/OIG_HHS_Scrape_2017_cleaned.csv


# 2016 Data cleaning

In [12]:
# Load the raw 2016 scrape.
filename = 'OIG_HHS_Scrape_2016'
df = pandas.read_csv('Raw/' + filename + '.csv')


# Rename columns for further command processing; regex=False makes the literal
# replacement explicit regardless of the pandas version's default.
df.columns = (
    df.columns.str.strip().str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)


# Flag records with a missing geographical subdivision whose authority is not
# the U.S. Department of Justice.
df['Flag'] = np.where(
    df['geographical_subdivision'].isnull()
    & (df['authority'] != 'U.S. Department of Justice'),
    1, 0)

# For flagged rows the authority value is moved into the subdivision column
# and the authority is blanked. The mask is taken once, before either column
# is modified.
flagged = df['Flag'] == 1
df['geographical_subdivision'] = np.where(flagged, df['authority'], df['geographical_subdivision'])
df['authority'] = np.where(flagged, '', df['authority'])


In [13]:
# Record the source year so it can be checked against each entry's date.
df = df.assign(Source=2016)

In [14]:
# Export the cleaned frame to CSV using the notebook's self-written helper.
# NOTE(review): `exportcsv_cleaned` is not defined in the part of the notebook
# visible here -- confirm the cell that defines it runs before this one.
# The recorded output of this cell reports Cleaned/OIG_HHS_Scrape_2016_cleaned.csv.
exportcsv_cleaned(df, filename)


Exported file: Cleaned/OIG_HHS_Scrape_2016_cleaned.csv


# 2015 Data cleaning

In [15]:
# Load the raw 2015 scrape.
filename = 'OIG_HHS_Scrape_2015'
df = pandas.read_csv('Raw/' + filename + '.csv')


# Rename columns for further command processing; regex=False makes the literal
# replacement explicit regardless of the pandas version's default.
df.columns = (
    df.columns.str.strip().str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)


## Repair the authority of an entry in which a space is missing
###########################################
df['authority'] = np.where(df['authority'] == 'U.S. Departmentof Justice',
                           'U.S. Department of Justice', df['authority'])


## Fill in geographical subdivision from information in the authority field
#####################################################################

# Authorities that do not have a geographical location involved
authority_exclusion = ['Department of Justice', 'U.S. Department of Justice']

# Flag records with a missing subdivision whose authority is not excluded.
df['Flag'] = np.where(
    df['geographical_subdivision'].isnull()
    & ~df['authority'].isin(authority_exclusion),
    1, 0)

# Authority of the form "U.S. Attorney <subdivision>": everything after the
# 13-character prefix plus one separator goes to the subdivision column.
df['geographical_subdivision'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == True) & (df['Flag'] == 1),
    df['authority'].str[14:], df['geographical_subdivision'])
df['authority'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == True) & (df['Flag'] == 1),
    'U.S. Attorney', df['authority'])

# Any other flagged authority is itself the subdivision: move it across and
# blank the authority. `== False` (rather than `~`) keeps rows with a missing
# authority out, because startswith() yields NaN for them.
df['geographical_subdivision'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == False) & (df['Flag'] == 1),
    df['authority'], df['geographical_subdivision'])
df['authority'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == False) & (df['Flag'] == 1),
    '', df['authority'])


## Fill in authority in case it could not be separated from the date
############################################
# Split at the ';' itself instead of at fixed character offsets, which were
# only right for dates that are exactly 16 characters long.
has_separator = df['date'].str.find(';') > 0
date_parts = df['date'].str.split(';', n=1)
df['authority'] = np.where(has_separator, date_parts.str[1].str.strip(), df['authority'])
df['date'] = np.where(has_separator, date_parts.str[0].str.strip(), df['date'])


In [16]:
# Record the source year so it can be checked against each entry's date.
df = df.assign(Source=2015)

In [17]:
# Export the cleaned frame to CSV using the notebook's self-written helper.
# NOTE(review): `exportcsv_cleaned` is not defined in the part of the notebook
# visible here -- confirm the cell that defines it runs before this one.
# The recorded output of this cell reports Cleaned/OIG_HHS_Scrape_2015_cleaned.csv.

exportcsv_cleaned(df, filename)

Exported file: Cleaned/OIG_HHS_Scrape_2015_cleaned.csv


# 2014 Data cleaning

In [18]:
# Load the raw 2014 scrape. 'Raw/' (capital R) matches the spelling used by
# most other cells; 'raw/' only resolves on case-insensitive file systems.
filename = 'OIG_HHS_Scrape_2014'
df = pandas.read_csv('Raw/' + filename + '.csv')


# Rename columns for further command processing; regex=False makes the literal
# replacement explicit regardless of the pandas version's default.
df.columns = (
    df.columns.str.strip().str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)


# Authorities that do not have a geographical location involved
authority_exclusion = ['Department of Justice', 'U.S. Department of Justice']

# Flag records with a missing subdivision whose authority is not excluded.
df['Flag'] = np.where(
    df['geographical_subdivision'].isnull()
    & ~df['authority'].isin(authority_exclusion),
    1, 0)


# Authority of the form "U.S. Attorney <subdivision>": everything after the
# 13-character prefix plus one separator goes to the subdivision column.
df['geographical_subdivision'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == True) & (df['Flag'] == 1),
    df['authority'].str[14:], df['geographical_subdivision'])
df['authority'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == True) & (df['Flag'] == 1),
    'U.S. Attorney', df['authority'])

# Any other flagged authority is itself the subdivision: move it across and
# blank the authority. `== False` (rather than `~`) keeps rows with a missing
# authority out, because startswith() yields NaN for them.
df['geographical_subdivision'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == False) & (df['Flag'] == 1),
    df['authority'], df['geographical_subdivision'])
df['authority'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == False) & (df['Flag'] == 1),
    '', df['authority'])


## Fill in authority in case it could not be separated from the date
############################################
# Split at the ';' itself instead of at fixed character offsets, which were
# only right for dates that are exactly 16 characters long.
has_separator = df['date'].str.find(';') > 0
date_parts = df['date'].str.split(';', n=1)
df['authority'] = np.where(has_separator, date_parts.str[1].str.strip(), df['authority'])
df['date'] = np.where(has_separator, date_parts.str[0].str.strip(), df['date'])


# Show the rows that were flagged above.
df[df['Flag']==1]






Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,medicare_fraud_strike_force_case,heading,description,hyperlink,Flag
16,16,Criminal and Civil Enforcement,"December 12, 2014",,Middle District of Florida,0,United States Settles False Claims Act Allegat...,"Jacksonville, FL B The United States has settl...",http://www.justice.gov/usao/flm/press/2014/Dec...,1
65,65,Criminal and Civil Enforcement,"November 5, 2014",U.S. Department of Justice,,1,Detroit-Area Man Arrested in Connection with H...,A Detroit-area resident was arrested today for...,http://www.justice.gov/opa/pr/detroit-area-man...,1
340,340,Criminal and Civil Enforcement,"May 2, 2014",U.S. Attorney,Northern District of Indiana,0,Guilty Plea in Conspiracy to Receive Kickbacks...,"Linda Rosenberg, 60, of Chicago, Illinois, ple...",http://www.justice.gov/usao/inn/press_release/...,1
363,363,Criminal and Civil Enforcement,"April 17, 2014",,District of New Mexico,0,Pawan Kumar Jain Arrested on Charges of Unlawf...,ALBUQUERQUE - A federal grand jury has returne...,http://www.justice.gov/usao/nm/press-releases/...,1
464,464,Criminal and Civil Enforcement,"February 13, 2014",,Eastern District of Washington,0,Walla Walla Man Sentenced To Probation And Ord...,"Richland, Washington - Michael C. Ormsby, Unit...",http://www.justice.gov/usao/wae/news/2014/2014...,1
528,528,Criminal and Civil Enforcement,"January 7, 2014",U.S. Attorney,Eastern District of Washington,0,United States Department Of Justice And The St...,"Spokane - Michael C. Ormsby, United States Att...",http://www.justice.gov/usao/wae/news/2014/2014...,1


In [19]:
# Standardize format of date

#(1) Drop the day-of-week prefix ("Monday, " etc.) carried by some entries.
weekdays_replace = [
    day_name + ', '
    for day_name in ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
                     'Thursday', 'Friday', 'Saturday')
]
for days in weekdays_replace:
    df['date'] = df['date'].str.replace(days, '')


In [20]:
#(2) Add back the year if it is missing.
# A date that already ends in ", YYYY" has a comma six characters from the end.
# Missing dates are left untouched: without the notna() guard they would be
# turned into the literal string 'nan, 2014'.
year_missing = df['date'].notna() & (df['date'].str[-6:-5] != ',')
df['date'] = np.where(year_missing, df['date'].astype(str) + ', 2014', df['date'])

In [21]:
# Record the source year so it can be checked against each entry's date.
df = df.assign(Source=2014)

In [22]:

# Export the cleaned frame to CSV using the notebook's self-written helper.
# NOTE(review): `exportcsv_cleaned` is not defined in the part of the notebook
# visible here -- confirm the cell that defines it runs before this one.
# The recorded output of this cell reports Cleaned/OIG_HHS_Scrape_2014_cleaned.csv.
exportcsv_cleaned(df, filename)

Exported file: Cleaned/OIG_HHS_Scrape_2014_cleaned.csv


## 2013 Data cleaning

In [23]:
# Load the raw 2013 scrape.
filename = 'OIG_HHS_Scrape_2013'
df = pandas.read_csv('Raw/' + filename + '.csv')


# Rename columns for further command processing; regex=False makes the literal
# replacement explicit regardless of the pandas version's default.
df.columns = (
    df.columns.str.strip().str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)


# Authorities that do not have a geographical location involved. Defined here
# as well so the cell does not depend on an earlier year's cell having run.
authority_exclusion = ['Department of Justice', 'U.S. Department of Justice']

# Flag records with a missing subdivision whose authority is not excluded.
df['Flag'] = np.where(
    df['geographical_subdivision'].isnull()
    & ~df['authority'].isin(authority_exclusion),
    1, 0)

# Authority of the form "U.S. Attorney <subdivision>": everything after the
# 13-character prefix plus one separator goes to the subdivision column.
df['geographical_subdivision'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == True) & (df['Flag'] == 1),
    df['authority'].str[14:], df['geographical_subdivision'])
df['authority'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == True) & (df['Flag'] == 1),
    'U.S. Attorney', df['authority'])

# Minor edit to the subdivision format: drop a leading "for the ". The pattern
# is anchored so the same words later in the value are left alone.
df['geographical_subdivision'] = np.where(
    (df['geographical_subdivision'].str.startswith('for the') == True) & (df['Flag'] == 1),
    df['geographical_subdivision'].str.replace(r'^for the ', '', regex=True),
    df['geographical_subdivision'])




In [24]:
# Record the source year so it can be checked against each entry's date.
df = df.assign(Source=2013)

In [25]:
# Export the cleaned frame to CSV using the notebook's self-written helper.
# NOTE(review): `exportcsv_cleaned` is not defined in the part of the notebook
# visible here -- confirm the cell that defines it runs before this one.
# The recorded output of this cell reports Cleaned/OIG_HHS_Scrape_2013_cleaned.csv.
exportcsv_cleaned(df, filename)

Exported file: Cleaned/OIG_HHS_Scrape_2013_cleaned.csv


## 2012 Data cleaning

In [26]:
year = 2012

# Load the raw scrape for this year.
filename = 'OIG_HHS_Scrape_' + str(year)
df = pandas.read_csv('Raw/' + filename + '.csv')


# Rename columns for further command processing; regex=False makes the literal
# replacement explicit regardless of the pandas version's default.
df.columns = (
    df.columns.str.strip().str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)


# Authorities that do not have a geographical location involved. Defined here
# as well so the cell does not depend on an earlier year's cell having run.
authority_exclusion = ['Department of Justice', 'U.S. Department of Justice']

# Flag records with a missing subdivision whose authority is not excluded.
df['Flag'] = np.where(
    df['geographical_subdivision'].isnull()
    & ~df['authority'].isin(authority_exclusion),
    1, 0)

# Authority of the form "U.S. Attorney <subdivision>": everything after the
# 13-character prefix plus one separator goes to the subdivision column.
df['geographical_subdivision'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == True) & (df['Flag'] == 1),
    df['authority'].str[14:], df['geographical_subdivision'])
df['authority'] = np.where(
    (df['authority'].str.startswith('U.S. Attorney') == True) & (df['Flag'] == 1),
    'U.S. Attorney', df['authority'])

# Minor edit to the subdivision format: drop a leading "for the ". The pattern
# is anchored so the same words later in the value are left alone.
df['geographical_subdivision'] = np.where(
    (df['geographical_subdivision'].str.startswith('for the') == True) & (df['Flag'] == 1),
    df['geographical_subdivision'].str.replace(r'^for the ', '', regex=True),
    df['geographical_subdivision'])




In [27]:
# Record the source year so it can be checked against each entry's date.
df = df.assign(Source=year)

In [28]:
# Some rows carry authority / subdivision text appended to the date after a
# colon. Flag them so they can be inspected at the end of the cell.
df['Flag'] = (df['date'].str.find(':') > 0).astype(int)

# Case 1: the date ends in "U.S. Attorney". The value sitting in `authority`
# is copied to the subdivision column, the authority becomes "U.S. Attorney",
# and the suffix is removed from the date. The mask is taken once, before the
# date is rewritten.
ends_with_attorney = df['date'].str[-13:] == 'U.S. Attorney'
df['geographical_subdivision'] = np.where(
    ends_with_attorney, df['authority'], df['geographical_subdivision'])
df['authority'] = np.where(ends_with_attorney, 'U.S. Attorney', df['authority'])
df['date'] = np.where(
    ends_with_attorney, df['date'].str.replace(': U.S. Attorney', ''), df['date'])

# Case 2: the date ends in "Eastern District of Tennessee". That text becomes
# the subdivision and is removed from the date.
ends_with_tennessee = (
    df['date'].str[-len('Eastern District of Tennessee'):] == 'Eastern District of Tennessee'
)
df['geographical_subdivision'] = np.where(
    ends_with_tennessee, 'Eastern District of Tennessee', df['geographical_subdivision'])
df['date'] = np.where(
    ends_with_tennessee,
    df['date'].str.replace(': Eastern District of Tennessee', ''),
    df['date'])

df[df['Flag']==1]


Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,medicare_fraud_strike_force_case,heading,description,hyperlink,Flag,Source
39,39,Criminal and Civil Enforcement,"December 3, 2012",,Eastern District of Tennessee,0,The United States has filed a Complaint-in-Int...,,/fraud/enforcement/criminal/2012/Life_Care_Com...,1,2012
307,307,Criminal and Civil Enforcement,"May 11, 2012",U.S. Attorney,Eastern District of Michigan,0,Detroit-area Physician Convicted in $6.7 Milli...,"A federal jury sitting in Detroit, Michigan, c...",http://www.justice.gov/usao/mie/news/2012/2012...,1,2012


In [29]:
# Export the cleaned frame to CSV using the notebook's self-written helper.
# NOTE(review): `exportcsv_cleaned` is not defined in the part of the notebook
# visible here -- confirm the cell that defines it runs before this one.
# The recorded output of this cell reports Cleaned/OIG_HHS_Scrape_2012_cleaned.csv.
exportcsv_cleaned(df, filename)


Exported file: Cleaned/OIG_HHS_Scrape_2012_cleaned.csv


## 2011 Data cleaning

In [30]:
year = 2011

# Load the raw scrape for this year.
filename = 'OIG_HHS_Scrape_' + str(year)
df = pandas.read_csv('Raw/' + filename + '.csv')

# Rename columns for further command processing; regex=False makes the literal
# replacement explicit regardless of the pandas version's default.
df.columns = (
    df.columns.str.strip().str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)



In [31]:
## Split date and authority
# Rows with no authority and no subdivision whose date does not end in the
# year carry the authority inside the date field, as
# "<month day>, <year>, <authority>".
year_text = str(year)
date_separator = ', ' + year_text + ', '   # built from `year`, not a hard-coded 2011

# Missing dates count as "ends with the year", so they are never flagged.
df['Flag'] = np.where(
    df['authority'].isnull()
    & df['geographical_subdivision'].isnull()
    & ~df['date'].str.endswith(year_text, na=True),
    1, 0)

# Only rewrite rows that really contain the separator. Otherwise the split has
# no second part, the authority would become NaN, and ', <year>' would be
# appended to an unsplit date.
splittable = (df['Flag'] == 1) & df['date'].str.contains(date_separator, regex=False, na=False)
date_parts = df['date'].str.split(date_separator)

df['authority'] = np.where(splittable, date_parts.str[1], df['authority'])
df['date'] = np.where(splittable, date_parts.str[0].astype(str) + ', ' + year_text, df['date'])


df[df['Flag']==1]

Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,heading,description,hyperlink,Flag
271,271,Criminal and Civil Enforcement,"March 29, 2011","Federal Bureau of Investigation, Washington Fi...",,FDA Chemist and Son Charged with Trading on In...,,http://washingtondc.fbi.gov/dojpressrel/pressr...,1
275,275,Criminal and Civil Enforcement,"March 28, 2011","U.S. Attorney, Northern District of Georgia",,Physician Allegedly Used Purported Charitable ...,,http://www.justice.gov/usao/gan/press/2011/03-...,1
278,278,Criminal and Civil Enforcement,"March 24, 2011","U.S. Attorney, Southern District of Indiana",,Connersville Woman Sentenced for Medicaid Fraud,,http://www.justice.gov/usao/ins/press_releases...,1
279,279,Criminal and Civil Enforcement,"March 23, 2011","U.S. Attorney, Eastern District of Michigan",,Monroe Doctor Arrested on Drug and Health Care...,,http://www.justice.gov/usao/mie/news/2011/2011...,1
280,280,Criminal and Civil Enforcement,"March 16, 2011","Federal Bureau of Investigation, Jackson, Miss...",,Two Men Indicted in Health Care Fraud Scheme,,http://jackson.fbi.gov/dojpressrel/pressrel11/...,1
...,...,...,...,...,...,...,...,...,...
373,373,Criminal and Civil Enforcement,"January 10, 2011","U.S. Attorney, District of New Jersey",,CVS Pays Nearly $1 Million to Resolve Allegati...,,http://www.justice.gov/usao/nj/press/press/fil...,1
374,374,Criminal and Civil Enforcement,"January 6, 2011","U.S. Attorney, District of Maryland",,Greater Metropolitan Orthopaedics Institute Ag...,,http://www.justice.gov/usao/md/Public-Affairs/...,1
375,375,Criminal and Civil Enforcement,"January 6, 2011",U.S. Department of Justice),,New Orleans Doctor and Owner of Medical Equipm...,,http://www.justice.gov/opa/pr/2011/January/11-...,1
376,376,Criminal and Civil Enforcement,"January 4, 2011",U.S. Department of Justice,,Seven Hospitals in Six States to Pay U.S. more...,,http://www.justice.gov/opa/pr/2011/January/11-...,1


In [32]:
# Shift geographical info. in the "authority" field to the correct position.
# Applies to rows without a subdivision whose date still reads
# "<month day>, <year> - <authority>": the current authority value is really
# the subdivision, and the real authority follows the dash in the date.
year_text = str(year)
dash_separator = ', ' + year_text + ' - '   # built from `year`, not a hard-coded 2011

# Missing dates count as "ends with the year", so they are never flagged.
df['Flag'] = np.where(
    df['geographical_subdivision'].isnull()
    & ~df['date'].str.endswith(year_text, na=True),
    1, 0)

# Only rewrite rows that really contain the separator, so that other flagged
# rows are not given a NaN authority and a corrupted date.
movable = (df['Flag'] == 1) & df['date'].str.contains(dash_separator, regex=False, na=False)
date_parts = df['date'].str.split(dash_separator)

df['geographical_subdivision'] = np.where(movable, df['authority'], df['geographical_subdivision'])
df['authority'] = np.where(movable, date_parts.str[1], df['authority'])
df['date'] = np.where(movable, date_parts.str[0].astype(str) + ', ' + year_text, df['date'])


df[df['Flag']==1]

Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,heading,description,hyperlink,Flag
107,107,Criminal and Civil Enforcement,"September 30, 2011",U.S. Attorney,Southern District of Georgia,Miami Man Sentenced to More Than 11 Years In P...,"SAVANNAH, GA - Alfredo Rasco, 52, from Miami, ...",http://www.justice.gov/usao/gas/pr/2011/52Rasc...,1


In [33]:

# Find records with a missing geographical subdivision.
# Authorities that do not have a geographical location involved
authority_exclusion = ['Department of Justice', 'U.S. Department of Justice']
df['Flag'] = np.where(
    df['geographical_subdivision'].isnull()
    & ~df['authority'].isin(authority_exclusion),
    1, 0)
flagged = df['Flag'] == 1

office_prefix = "U.S. Attorney's Office"
attorney_prefix = 'U.S. Attorney'

# Both masks are taken up front. Rows rewritten by the first step still start
# with the "Office" prefix afterwards, so they stay out of the second mask.
# na=False keeps rows with a missing authority out of both.
office_rows = df['authority'].str.startswith(office_prefix + ', ', na=False) & flagged
attorney_rows = (
    df['authority'].str.startswith(attorney_prefix, na=False)
    & ~df['authority'].str.startswith(office_prefix, na=False)
    & flagged
)

# "U.S. Attorney's Office, <subdivision>": the text after the comma is the
# subdivision; strip the blank that follows the comma.
df['geographical_subdivision'] = np.where(
    office_rows,
    df['authority'].str[len(office_prefix) + 1:].str.strip(),
    df['geographical_subdivision'])
df['authority'] = np.where(office_rows, office_prefix, df['authority'])

# "U.S. Attorney<sep><subdivision>": cut the prefix off by length.
# str.lstrip('U.S. Attorney ') must NOT be used here: it strips any leading
# run of those *characters*, so "U.S. Attorney Southern District" would lose
# the "So" of "Southern" as well.
df['geographical_subdivision'] = np.where(
    attorney_rows,
    df['authority'].str[len(attorney_prefix):].str.lstrip(' '),
    df['geographical_subdivision'])
df['authority'] = np.where(attorney_rows, attorney_prefix, df['authority'])

# Minor edits to the subdivision format: drop a leading "for the " or a
# leading ", ". Both patterns are anchored, so commas inside the value
# (e.g. "Jackson, Mississippi") are preserved.
df['geographical_subdivision'] = np.where(
    df['geographical_subdivision'].str.startswith('for the', na=False) & flagged,
    df['geographical_subdivision'].str.replace(r'^for the ', '', regex=True),
    df['geographical_subdivision'])
df['geographical_subdivision'] = np.where(
    df['geographical_subdivision'].str.startswith(', ', na=False) & flagged,
    df['geographical_subdivision'].str.replace(r'^, ', '', regex=True),
    df['geographical_subdivision'])

df[df['Flag']==1]

Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,heading,description,hyperlink,Flag
13,13,Criminal and Civil Enforcement,"December 12, 2011",,,"Co-owners of Pocatello Physical Therapy, P.A. ...",POCATELLO - The co-owners of Pocatello Physica...,http://www.justice.gov/usao/id/news/2011/dec/p...,1
97,97,Criminal and Civil Enforcement,"October 11, 2011",U.S. Attorney,District of Minnesota,Home Health Care Agency Owner Pleads Guilty To...,"MINNEAPOLIS-Recently in federal court, the ope...",http://www.justice.gov/usao/mn/press/oct010.pdf,1
112,112,Criminal and Civil Enforcement,"September 28, 2011",U.S. Attorney,Southern District of Florida,Doral Woman Sentenced To 43 Months For Medicar...,"Wifredo A. Ferrer, United States Attorney for ...",http://www.justice.gov/usao/fls/PressReleases/...,1
137,137,Criminal and Civil Enforcement,"September 7, 2011",U.S. Department of Health and Human Services,,Medicare Fraud Strike Force Charges 91 Individ...,WASHINGTON - Attorney General Eric Holder and ...,http://www.hhs.gov/news/press/2011pres/09/2011...,1
198,198,Criminal and Civil Enforcement,"July 13, 2011",U.S. Attorney,Middle District of Tennessee,Tenncare Fraud Settlement Announced,,http://www.justice.gov/usao/tnm/pressReleases/...,1
...,...,...,...,...,...,...,...,...,...
371,371,Criminal and Civil Enforcement,"January 18, 2011",U.S. Attorney,Southern District of New York,Manhattan U.S. Attorney Announces $18 Million ...,,http://www.justice.gov/usao/nys/pressreleases/...,1
372,372,Criminal and Civil Enforcement,"January 10, 2011",U.S. Attorney,District of Connecticut,Former Haven Health Care Bookkeeper Pleads Gui...,,http://www.justice.gov/usao/ct/Press2011/20110...,1
373,373,Criminal and Civil Enforcement,"January 10, 2011",U.S. Attorney,District of New Jersey,CVS Pays Nearly $1 Million to Resolve Allegati...,,http://www.justice.gov/usao/nj/press/press/fil...,1
374,374,Criminal and Civil Enforcement,"January 6, 2011",U.S. Attorney,District of Maryland,Greater Metropolitan Orthopaedics Institute Ag...,,http://www.justice.gov/usao/md/Public-Affairs/...,1


In [34]:
# Separating authority and geographical subdivision (more general):
# any authority that still contains a comma is split at the first ", " into
# "<authority>, <subdivision>".

df['Flag'] = np.where(df['authority'].str.find(',') >= 0, 1, 0)

# `n` must be passed by keyword: it is keyword-only in pandas >= 2.0, where
# str.split(', ', 1) raises a TypeError.
authority_parts = df['authority'].str.split(', ', n=1)

df['geographical_subdivision'] = np.where(
    df['Flag'] == 1, authority_parts.str[-1], df['geographical_subdivision'])
df['authority'] = np.where(df['Flag'] == 1, authority_parts.str[0], df['authority'])

df[df['Flag']==1]



Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,heading,description,hyperlink,Flag
205,205,Criminal and Civil Enforcement,"July 7, 2011",Federal Bureau of Investigation,Atlanta Division,Atlanta Radiologist Guilty of Fraudulently Pas...,,http://www.fbi.gov/atlanta/press-releases/2011...,1
248,248,Criminal and Civil Enforcement,"May 26, 2011",US Attorney,District of North Dakota,Keplin Sentenced for Illegal Distribution of I...,,http://www.justice.gov/usao/nd/pressreleases/2...,1
271,271,Criminal and Civil Enforcement,"March 29, 2011",Federal Bureau of Investigation,Washington Field Office,FDA Chemist and Son Charged with Trading on In...,,http://washingtondc.fbi.gov/dojpressrel/pressr...,1
280,280,Criminal and Civil Enforcement,"March 16, 2011",Federal Bureau of Investigation,"Jackson, Mississippi",Two Men Indicted in Health Care Fraud Scheme,,http://jackson.fbi.gov/dojpressrel/pressrel11/...,1
287,287,Criminal and Civil Enforcement,"March 9, 2011",Federal Bureau of Investigation,Houston,Jury Convicts Durable Medical Equipment Busine...,,http://houston.fbi.gov/dojpressrel/pressrel11/...,1
293,293,Criminal and Civil Enforcement,"March 4, 2011",Federal Bureau of Investigation,Indianapolis,Terre Haute Man Charged with Health Care Fraud...,,http://indianapolis.fbi.gov/dojpressrel/pressr...,1
294,294,Criminal and Civil Enforcement,"March 2, 2011",Federal Bureau of Investigation,Tampa,Five Former Executives Indicted on Health Care...,,http://tampa.fbi.gov/dojpressrel/pressrel11/ta...,1


In [35]:
# Remove stray '(', ')' and '>' symbols (of unknown origin) from the text columns.

# Raw strings: '\(' in a normal string literal is an invalid escape sequence,
# which newer Python versions warn about.
symbol_to_remove = [r'\(', r'\)', r'\>']
# A flat list of column names, so every `df[col]` below is a Series.
str_cols = ['date', 'authority', 'geographical_subdivision']    # specify columns you want to replace

for col in str_cols:
    for symbol in symbol_to_remove:
        df[col] = df[col].replace(symbol, '', regex=True)

In [36]:
# Record the source year so it can be checked against each entry's date.
df = df.assign(Source=year)



In [37]:
# Export the cleaned frame to CSV with the notebook's own `exportcsv_cleaned`
# helper (defined outside this part of the notebook). The recorded output below
# shows it reporting the path `Cleaned/<filename>_cleaned.csv`.
############################################
exportcsv_cleaned(df, filename)


Exported file: Cleaned/OIG_HHS_Scrape_2011_cleaned.csv


## 2009 - 2010 Data cleaning

In [38]:
# Clean the 2009 and 2010 scrapes: pull the authority out of over-long `date`
# cells, split "authority, location" values into their two columns, strip
# stray symbols, tag the source year and export.
for year in range(2009,2011):
    filename = 'OIG_HHS_Scrape_' + str(year)
    df = pandas.read_csv('Raw/'+filename + '.csv')

    # Rename columns for further command processing: trimmed, lower-case,
    # spaces -> underscores, parentheses dropped. `regex=False` makes the
    # literal replacement explicit ('(' alone is not a valid regex).
    df.columns = (df.columns.str.strip().str.lower()
                  .str.replace(' ', '_', regex=False)
                  .str.replace('(', '', regex=False)
                  .str.replace(')', '', regex=False))

    ## Split date and authority

    # Rows where both authority and subdivision are empty are the ones whose
    # `date` cell may also carry the authority text.
    both_missing = df['authority'].isnull() & df['geographical_subdivision'].isnull()
    df['Flag'] = np.where(both_missing, 1, 0)

    # If the date cell does not end with the year, everything after "<year>,"
    # is taken as the authority.
    df['authority'] = np.where(both_missing & (df['date'].str.endswith(str(year)) == False),
                               df['date'].str.split(str(year)+',').str[1].str.lstrip(' '),
                               df['authority'])

    # Rebuild the date from its first two comma-separated pieces.
    # The piece count is checked per row (`.str.len()`); the former `.size`
    # was the scalar number of rows, so the check never acted per row and a
    # date without a comma would have become '<date>, nan'. Such rows (and
    # missing dates) now keep their original value.
    date_pieces = df['date'].str.split(',')
    rebuilt_date = date_pieces.str[0].astype(str) + ', ' + date_pieces.str[1].astype(str)
    df['date'] = np.where((df['Flag']==1) & (date_pieces.str.len() > 1),
                          rebuilt_date,
                          df['date'])

    # Separating authority and geographical subdivision (More general)

    # Any authority containing a comma is treated as "authority, location".
    df['Flag'] = np.where(df['authority'].str.find(',') >=0, 1, 0 )

    # `n` must be passed by keyword: positional use is rejected by pandas 2.x.
    authority_pieces = df['authority'].str.split(', ', n=1)
    df['geographical_subdivision'] = np.where(df['Flag']==1, authority_pieces.str[-1], df['geographical_subdivision'] )
    df['authority'] = np.where(df['Flag']==1, authority_pieces.str[0].str.lstrip(' '), df['authority'] )

    # Removing unnecessary '(', ')' and '>' symbols from unknown sources.
    # Raw-string character class: same effect as removing the three symbols
    # one by one, without invalid escape sequences in the literal.
    str_cols = ['date', 'authority', 'geographical_subdivision']    # columns to clean
    df[str_cols] = df[str_cols].replace(r'[()>]', '', regex=True)

    # Add source of year (verify if year match with the date)
    df['Source'] = year

    #Export to csv using self-written function
    ############################################
    exportcsv_cleaned(df, filename)


Exported file: Cleaned/OIG_HHS_Scrape_2009_cleaned.csv
Exported file: Cleaned/OIG_HHS_Scrape_2010_cleaned.csv


## 2008 Data cleaning

In [39]:
# Load the raw 2008 scrape; `year` and `filename` are reused by the cells below.
year = 2008
filename = 'OIG_HHS_Scrape_' + str(year)
df = pandas.read_csv('Raw/' + filename + '.csv')

# Normalise the header names for the later cells: trim whitespace, lower-case,
# turn spaces into underscores and drop parentheses.
clean_cols = df.columns.str.strip().str.lower()
for old, new in ((' ', '_'), ('(', ''), (')', '')):
    clean_cols = clean_cols.str.replace(old, new)
df.columns = clean_cols

In [40]:
## Split date and authority

# Some `date` cells have the authority fused onto the end ("<date> <year> ...
# Attorney"), with the location sitting in the `authority` column. Flag those
# rows and shift each piece into its proper column.
# `na=False`: a missing date must not be flagged — without it the NaN returned
# by `str.endswith` is treated as truthy by `np.where`.
date_has_authority = df['date'].str.endswith('Attorney', na=False)
df['Flag'] = np.where(date_has_authority, 1, 0)

# Split once on "<year> ": piece 0 is the date up to the year, piece 1 the authority.
date_pieces = df['date'].str.split(str(year)+' ')

# Order matters: the location is copied out of `authority` before that column is overwritten.
df['geographical_subdivision'] = np.where(df['Flag']==1, df['authority'], df['geographical_subdivision'])
df['authority'] = np.where(df['Flag']==1, date_pieces.str[1], df['authority'])
# Re-append the year that the split consumed (taken from `year` rather than hard-coded).
df['date'] = np.where(df['Flag']==1, date_pieces.str[0] + ' ' + str(year), df['date'])


df[df['Flag']!=0]

Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,heading,category,description,hyperlink,Flag
29,29,Criminal and Civil Enforcement,"August 18, 2008",U.S. Attorney,Southern District of Indiana,Indianapolis Woman Charged in $1.8 Million Hea...,,,http://www.usdoj.gov/usao/ins/press_releases/P...,1


In [41]:
# Tag every row with the year of the scrape file it came from (`year` is set to
# 2008 where this section loads its data), so it can later be compared with the
# year that appears in the row's own `date` field.
df['Source'] = year

In [42]:
# US Attorney rows: the authority cell sometimes continues past the word
# "Attorney" with the district. Move that tail into geographical_subdivision.

authority_text = df['authority']

# Flag cells that contain "Attorney" (not at position 0) and do not end with it.
has_trailing_district = (authority_text.str.find('Attorney') > 0) & (authority_text.str.endswith('Attorney') == False)
df['Flag'] = np.where(has_trailing_district, 1, 0)

# Split once on "Attorney ": piece 0 is the title stem, piece 1 the district text.
attorney_pieces = authority_text.str.split('Attorney ')
is_flagged = df['Flag'] == 1

df['geographical_subdivision'] = np.where(is_flagged, attorney_pieces.str[1], df['geographical_subdivision'])
df['authority'] = np.where(is_flagged, attorney_pieces.str[0] + 'Attorney', df['authority'])

# Drop the "for the " lead-in from subdivisions that start with it.
subdivision = df['geographical_subdivision']
starts_with_for_the = subdivision.str.startswith('for the') == True
df['geographical_subdivision'] = np.where(starts_with_for_the,
                                          subdivision.str.replace('for the ', ''),
                                          subdivision)


df[df['Flag']!=0]


Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,heading,category,description,hyperlink,Flag,Source
17,17,Criminal and Civil Enforcement,"June 30, 2008",US Attorney,District of NJ,Another Cardiologist Settles with Government o...,,,http://www.usdoj.gov/usao/nj/press/press/files...,1,2008
19,19,Criminal and Civil Enforcement,"June 27, 2008",U.S. Attorney,District of KS,Trial Set for Wichita Daycare Provider Charged...,,,http://www.usdoj.gov/usao/ks/press/June2008/Ju...,1,2008
22,22,Criminal and Civil Enforcement,"June 10, 2008",U.S. Attorney,Eastern District of PA,U.S. Attorney Announces First Settlement Invol...,,,http://www.usdoj.gov/usao/pae/News/Pr/2008/jun...,1,2008
23,23,Criminal and Civil Enforcement,"July 2, 2008",U.S. Attorney,Southern District of FL,DME Defendants Plead Guilty to $148 Million Me...,,,http://www.usdoj.gov/usao/fls/PressReleases/08...,1,2008
66,66,Criminal and Civil Enforcement,"November 26, 2008",U.S. Attorney,District of Columbia,The United States and the District of Columbia...,,,http://www.usdoj.gov/usao/dc/Press_Releases/20...,1,2008


In [43]:
# Manually move geographical subdivision info that ended up in `authority`
# back into its correct column, for a hand-picked list of known values.
loc_list = ['Middle District of PA', 
            'City of New York Department of Investigation']

df['Flag'] = np.where(df['authority'].isin(loc_list),1,0)

# Copy the misplaced text into geographical_subdivision first.
df['geographical_subdivision'] = np.where(df['Flag']==1, df['authority'], df['geographical_subdivision'])

# 'Middle District of PA' is only a location, so its authority is blanked.
df['authority'] = df['authority'].apply(lambda x: '' if x==loc_list[0] else x)

# For the New York entry keep only the part before 'Department'. The split
# leaves a trailing space ('City of New York '), so strip it.
df['geographical_subdivision'] = df['geographical_subdivision'].apply(
    lambda x: loc_list[1].split('Department')[0].strip() if x==loc_list[1] else x)


df[df['Flag']==1]


Unnamed: 0,unnamed:_0,section,date,authority,geographical_subdivision,heading,category,description,hyperlink,Flag,Source
20,20,Criminal and Civil Enforcement,"June 19, 2008",,Middle District of PA,U.S. Marshal-Led Efforts Result in 181 Arrests...,,,http://www.usmarshals.gov/falcon08/news_releas...,1,2008
21,21,Criminal and Civil Enforcement,"June 18, 2008",City of New York Department of Investigation,City of New York,Joint Investigation by DOI and Federal Authori...,,,http://www.nyc.gov/html/doi/pdf/49habib.leroy....,1,2008


In [44]:
# Export the cleaned 2008 frame with the notebook's own `exportcsv_cleaned`
# helper (defined outside this part of the notebook); the recorded output below
# shows the path it reports.
exportcsv_cleaned(df, filename)


Exported file: Cleaned/OIG_HHS_Scrape_2008_cleaned.csv


## 2004 - 2007 Data cleaning

In [45]:
def _extract_news_location(description):
    """Return the place named at the start of a description, or '' if none.

    A description is considered to name a place when it starts with 'In ' or
    with 'n ' (the form this notebook also accepts, apparently an 'In' that
    lost its first letter). The location is the text up to the first comma
    with only that leading prefix removed, so a name such as 'San Diego'
    is not damaged by removing every 'n ' inside it.

    Non-string values (e.g. NaN for a missing description) yield ''.
    """
    if not isinstance(description, str):
        return ''
    for prefix in ('In ', 'n '):
        if description.startswith(prefix):
            return description.split(',')[0][len(prefix):]
    return ''


# Clean the 2004-2007 scrapes: derive a `news_location` column from the
# description text, reorder the columns, tag the source year and export.
for year in range(2004,2008):
    filename = 'OIG_HHS_Scrape_' + str(year)
    df = pandas.read_csv('Raw/'+filename + '.csv')

    # Rename columns for further command processing: trimmed, lower-case,
    # spaces -> underscores, parentheses dropped (literal replacement).
    df.columns = (df.columns.str.strip().str.lower()
                  .str.replace(' ', '_', regex=False)
                  .str.replace('(', '', regex=False)
                  .str.replace(')', '', regex=False))

    ## Locate the news (Not necessary the same as geographical_subdivision)
    # Flag is True where the description contains a comma at all.
    df['Flag'] = (df['description'].str.find(',')>=0)
    df['news_location'] = df['description'].apply(_extract_news_location)

    # Re-order the columns; `.copy()` gives an independent frame so the column
    # assignment below does not raise SettingWithCopyWarning.
    cols_index = ['unnamed:_0', 'section', 'date', 'authority', 'geographical_subdivision', 'heading', 'category', 'news_location',
                'description', 'hyperlink', 'Flag']
    df = df[cols_index].copy()

    # Add source of year (verify if year match with the date)
    df['Source'] = year

    # Export to CSV using self-written function
    exportcsv_cleaned(df, filename)



Exported file: Cleaned/OIG_HHS_Scrape_2004_cleaned.csv
Exported file: Cleaned/OIG_HHS_Scrape_2005_cleaned.csv
Exported file: Cleaned/OIG_HHS_Scrape_2006_cleaned.csv
Exported file: Cleaned/OIG_HHS_Scrape_2007_cleaned.csv
