# Medical insurance fraud case scraping 
## Criminal and Civil Enforcement (Archived, 2009-2019)

The objective of this notebook is to scrape the archived medical fraud court cases listed on the U.S. Department of Health and Human Services Office of Inspector General (OIG) website (for example, the 2019 archive: https://oig.hhs.gov/reports-and-publications/archives/enforcement/criminal/criminal_archive_2019.asp). This notebook is meant to be run only once.

Updates to the currently reported cases are handled by a separate notebook, "OIG at HHS Data scraping - Criminal and Civil Enforcement - Regular maintainence.ipynb", which refreshes the database on a regular basis.

In [1]:
import os
import re
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup, NavigableString, Tag

In [2]:
def get_text_with_br(tag, result=''):
    """Flatten the contents of ``tag`` to text while keeping ``<br>`` markers.

    Walks ``tag.contents`` in document order. Plain string nodes are copied
    verbatim, ``br`` tags are emitted as their string form (so the caller can
    later split on the line break), and every other tag is flattened
    recursively.

    Parameters
    ----------
    tag : bs4 Tag
        Element whose children should be flattened.
    result : str, optional
        Text to prefix to the flattened output (defaults to the empty string).

    Returns
    -------
    str
        ``result`` followed by the flattened text of ``tag``.
    """
    fragments = [result]
    for child in tag.contents:
        if not isinstance(child, Tag):
            # String node: keep the text exactly as it appears.
            fragments.append(child)
        elif child.name == 'br':
            # Preserve the line break as markup instead of dropping it.
            fragments.append(str(child))
        else:
            # Any other element: descend and collect its flattened text.
            fragments.append(get_text_with_br(child))
    return ''.join(fragments)

## For years from 2009 - 2019 (Past years, archived)

In [3]:
current_year = 2020  # exclusive upper bound: the loop scrapes 2009 .. current_year - 1

section = 'Criminal and Civil Enforcement'

# --- Scrape configuration ----------------------------------------------------
ARCHIVE_URL_TEMPLATE = (
    'https://oig.hhs.gov/reports-and-publications/archives/enforcement/'
    'criminal/criminal_archive_{year}.asp'
)
RAW_DIR = 'raw'                      # one CSV per year is written here
FIRST_STRIKE_FORCE_YEAR = 2012       # the strike-force icon is only checked from this year on
STRIKE_FORCE_NOT_CHECKED = -999      # sentinel stored for years before FIRST_STRIKE_FORCE_YEAR
STRIKE_FORCE_IMG_ALT = 'Medicare Fraud Strike Force Case'
RELATED_FUGITIVE_MARKER = 'Related: OIG Captured Fugitive'
BR_MARKER = '<br/>'                  # string form of the <br> tags kept by get_text_with_br
CONTROL_CHARS = ('\r', '\n', '\t')


def _strike_force_flag(dt_tag, year):
    """Return 1/0 for the strike-force icon in ``dt_tag``, or the sentinel before 2012."""
    if year < FIRST_STRIKE_FORCE_YEAR:
        return STRIKE_FORCE_NOT_CHECKED
    has_icon = dt_tag.find('img', {'alt': STRIKE_FORCE_IMG_ALT}) is not None
    return 1 if has_icon else 0


def _split_header(dt_tag):
    """Split a ``<dt>`` text on '; ' into (date, authority, subdivision), padding with ''."""
    parts = dt_tag.get_text().strip().split("; ")
    # Pad so that entries with fewer than three fields still unpack; extra fields are ignored.
    return tuple((parts + ['', '', ''])[:3])


def _collect_hyperlinks(dd_tag):
    """Join the href of every ``<a>`` in ``dd_tag`` with ', '.

    Anchors without an href (``link.get('href')`` is None) or with an empty
    href are skipped instead of breaking the string concatenation.
    """
    hrefs = (link.get('href') for link in dd_tag.find_all('a'))
    return ', '.join(href for href in hrefs if href)


def _strip_control_chars(text):
    """Remove carriage returns, newlines and tabs from ``text``."""
    for char in CONTROL_CHARS:
        text = text.replace(char, '')
    return text


# to_csv does not create missing directories, so make sure the target exists
# before spending time on any download.
os.makedirs(RAW_DIR, exist_ok=True)

for year in range(2009, current_year):
    OIG_df_section = []
    OIG_df_date = []
    OIG_df_dept = []
    OIG_df_geog_subdivision = []
    OIG_df_MFStrikeForce_tag = []
    OIG_df_heading = []
    OIG_df_description = []
    OIG_df_hyperlink = []

    year_str = str(year)
    url = ARCHIVE_URL_TEMPLATE.format(year=year_str)

    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url) as current_DB:
        res = BeautifulSoup(current_DB, "html.parser")

    info = res.find_all('dl', {'class': 'criminal_report'})

    ######################################

    for report in info:
        # <dt> entries carry the case header: "date; authority; subdivision".
        for header_tag in report.find_all('dt'):
            OIG_df_MFStrikeForce_tag.append(_strike_force_flag(header_tag, year))

            case_date, authority, subdivision = _split_header(header_tag)
            OIG_df_section.append(section)
            OIG_df_date.append(case_date)
            OIG_df_dept.append(authority)
            OIG_df_geog_subdivision.append(subdivision)

        # <dd> entries carry the heading, the description and the source links.
        for body_tag in report.find_all('dd'):
            if RELATED_FUGITIVE_MARKER in body_tag.get_text():
                ## Related: OIG Captured Fugitive - extended piece under the same case,
                ## but separated into two cases in the html structure
                continue

            OIG_df_hyperlink.append(_collect_hyperlinks(body_tag))

            # Keep the <br/> tag in the scraped text: everything before the first
            # one is the heading, everything after it is the description.
            body_text = get_text_with_br(body_tag)
            Heading, _, Description = body_text.partition(BR_MARKER)

            OIG_df_heading.append(_strip_control_chars(Heading))
            OIG_df_description.append(_strip_control_chars(Description))

    ## Export to data frame
    # Columns are assigned one by one, in this order; pandas raises ValueError
    # if the header (<dt>) and body (<dd>) lists end up with different lengths.
    df_info = pd.DataFrame()
    df_info['Section'] = OIG_df_section
    df_info['Date'] = OIG_df_date
    df_info['Authority'] = OIG_df_dept
    df_info['Geographical subdivision'] = OIG_df_geog_subdivision
    df_info['Medicare Fraud Strike Force case'] = OIG_df_MFStrikeForce_tag
    df_info['Heading'] = OIG_df_heading
    df_info['Description'] = OIG_df_description
    df_info['Hyperlink'] = OIG_df_hyperlink

    ## Export DF to CSV
    filename = 'OIG_HHS_Scrape_'+year_str+'_raw.csv'
    df_info.to_csv(os.path.join(RAW_DIR, filename))

    print(year, ' completed.')

2009  completed.
2010  completed.
2011  completed.
2012  completed.
2013  completed.
2014  completed.
2015  completed.
2016  completed.
2017  completed.
2018  completed.
2019  completed.


In [4]:
## Showing one year of data as an example.
## df_info is rebuilt on every pass of the scraping loop above, so at this
## point it holds only the last year processed (current_year - 1), not all years.
df_info.head(4)

Unnamed: 0,Section,Date,Authority,Geographical subdivision,Medicare Fraud Strike Force case,Heading,Description,Hyperlink
0,Criminal and Civil Enforcement,"December 30, 2019",U.S. Attorney,Middle District of Tennessee,0,Largest Independent Provider Of Intraoperative...,,https://www.justice.gov/usao-mdtn/pr/largest-i...
1,Criminal and Civil Enforcement,"December 20, 2019",U.S. Attorney,Southern District of Florida,0,Husband and Wife Sentenced to Prison for Roles...,,https://www.justice.gov/usao-sdfl/pr/husband-a...
2,Criminal and Civil Enforcement,"December 20, 2019",U.S. Attorney,Western District of Texas,0,Federal Authorities Arrest Owner and Chief Ope...,"In San Antonio, a federal grand jury indicted ...",https://www.justice.gov/usao-wdtx/pr/federal-a...
3,Criminal and Civil Enforcement,"December 20, 2019",U.S. Attorney,Eastern District of Wisconsin,0,Physician Group and Related Company Agree to R...,United States Attorney Matthew D. Krueger anno...,https://www.justice.gov/usao-edwi/pr/physician...
