# Medical insurance fraud case scraping 
## Criminal and Civil Enforcement (Current update)

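The objective of this notebook is to scrape the current medical fraud court cases reported on the enforcement index page of the U.S. Department of Health and Human Services Office of Inspector General (https://oig.hhs.gov/fraud/enforcement/criminal/index.asp). This notebook is meant to be run repeatedly; each run keeps only the cases dated after the most recent date recorded by the previous run.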

## Initialization

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import re
import os

from datetime import datetime
from datetime import date

In [2]:
# Extract the description part including <br>, as sometimes the description has more than one paragraph

def get_text_with_br(tag, result=''):
    """Return the text found under ``tag``, keeping ``<br>`` tags as markup.

    Walks ``tag.contents`` in document order. String nodes are copied as-is,
    ``<br>`` tags are copied in their serialised form (``str(child)``), and any
    other tag is flattened recursively. ``result`` is a prefix that the
    collected text is appended to.
    """
    pieces = [result]
    for child in tag.contents:
        if not isinstance(child, Tag):
            # Plain string node: keep it verbatim.
            pieces.append(child)
        elif child.name == 'br':
            # Keep the line break as markup so callers can split on it.
            pieces.append(str(child))
        else:
            # Any other tag contributes only its (recursively flattened) content.
            pieces.append(get_text_with_br(child))
    return ''.join(pieces)

In [3]:
# Notebook-wide settings
raw_data_directory = 'Raw/'                   # folder holding the raw CSVs and the "most recent date" marker file
section = 'Criminal and Civil Enforcement'    # label stored with every scraped record and used in the marker file name

## Extract the current enforcement action news and store it in a pandas DataFrame for further manipulation

In [4]:
# Set up the per-column accumulators (independent lists, one per output column)

section = 'Criminal and Civil Enforcement'
count = 0

# Each name is bound to its own fresh, empty list; the parsing cell appends
# one value per scraped record to every list.
(
    OIG_df_section,
    OIG_df_date,
    OIG_df_dept,
    OIG_df_geog_subdivision,
    OIG_df_MFStrikeForce_tag,
    OIG_df_heading,
    OIG_df_description,
    OIG_df_hyperlink,
) = ([] for _unused in range(8))

In [None]:
# Scrape the page for the most recent enforcement news

url = "https://oig.hhs.gov/fraud/enforcement/criminal/index.asp"    #Page for most current news

# Parse inside a `with` block so the HTTP response is closed as soon as the page
# has been read, and use a timeout so an unresponsive server cannot hang the
# notebook indefinitely.
with urlopen(url, timeout=60) as current_DB:
    res = BeautifulSoup(current_DB, "html.parser")

In [6]:
# Process the data. Classify into respective column arrays

DAY_NAMES = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')
LAYOUT_CHARS = ('\r', '\n', '\t')
BR_MARKUP_LENGTH = len('<br/>')   # number of characters skipped after the "<br" match below


def _parse_report_heading(raw_text, previous_date):
    """Split the text of a <dt> entry into (date, authority, geographical subdivision).

    The text is split on "; " into up to three fields. The first field is the
    date: weekday names are removed, an empty date falls back to
    ``previous_date`` (a combined heading shared by two reports), and a date
    without a four-digit year gets the current year appended. When the date
    field contains two commas, its third comma-separated part is used as the
    authority.

    Raises ValueError when the date is empty and ``previous_date`` is None.
    """
    fields = raw_text.strip().split("; ")
    fields += [''] * (3 - len(fields))   # pad so that all three fields exist

    date_text = fields[0]
    for day_name in DAY_NAMES:
        date_text = date_text.replace(day_name + ', ', '')

    # Missing date due to a combined heading: reuse the previous record's date.
    if date_text == '':
        if previous_date is None:
            raise ValueError('The first report has no date and there is no previous record to copy it from')
        date_text = previous_date

    # Handle the missing year problem.
    if re.search(', [0-9][0-9][0-9][0-9]', date_text) is None:
        date_text = date_text + ', ' + str(datetime.now().year)

    date_parts = date_text.split(',')
    if len(date_parts) == 3:
        # "<month day>, <year>, <authority>": the authority was separated from the
        # date by a comma. Strip the blank that follows that comma.
        # NOTE(review): fields[1] is ignored in this case, as it was before - confirm.
        return date_parts[0] + ',' + date_parts[1], date_parts[2].strip(), fields[2]
    return date_text, fields[1], fields[2]


def _strip_layout_chars(text):
    """Remove carriage returns, newlines and tabs from ``text``."""
    for char in LAYOUT_CHARS:
        text = text.replace(char, '')
    return text


# Start from empty columns so that re-running this cell does not append the same
# records a second time.
for column in (OIG_df_section, OIG_df_date, OIG_df_dept, OIG_df_geog_subdivision,
               OIG_df_MFStrikeForce_tag, OIG_df_heading, OIG_df_description, OIG_df_hyperlink):
    column.clear()

info = res.find_all('dl', {'class': 'criminal_report'})

for report_list in info:
    # <dt> entries: date / authority / subdivision and the Strike Force marker
    for heading_tag in report_list.find_all('dt'):
        strike_force_icon = heading_tag.find('img', {'alt': 'Medicare Fraud Strike Force Case'})
        previous_date = OIG_df_date[-1] if OIG_df_date else None
        report_date, authority, subdivision = _parse_report_heading(heading_tag.get_text(), previous_date)

        # Append only after parsing succeeded so that the columns stay aligned.
        OIG_df_MFStrikeForce_tag.append(1 if strike_force_icon is not None else 0)
        OIG_df_section.append(section)
        OIG_df_date.append(report_date)
        OIG_df_dept.append(authority)
        OIG_df_geog_subdivision.append(subdivision)

    # <dd> entries: hyperlinks, heading and description
    for body_tag in report_list.find_all('dd'):
        # Anchors without an href (or with an empty one) are skipped instead of
        # raising a TypeError during concatenation.
        hrefs = (link.get('href') for link in body_tag.find_all('a'))
        OIG_df_hyperlink.append(', '.join(href for href in hrefs if href))

        body_text = get_text_with_br(body_tag)  # Keep the <br/> tag in the scraped html code
        br_position = body_text.find("<br")
        if br_position >= 0:
            # Everything before the first line break is the heading, the rest is the description.
            heading = body_text[:br_position]
            description = body_text[br_position + BR_MARKUP_LENGTH:]
        else:
            heading = body_text
            description = ''

        # Clean both parts in every case, including headings without a line break.
        OIG_df_heading.append(_strip_layout_chars(heading))
        OIG_df_description.append(_strip_layout_chars(description))

In [7]:
## Assemble the scraped columns into a DataFrame for further manipulation
# (output column name, accumulator list) pairs, in the order the columns should appear
column_sources = [
    ('Section', OIG_df_section),
    ('Date', OIG_df_date),
    ('Authority', OIG_df_dept),
    ('Geographical subdivision', OIG_df_geog_subdivision),
    ('Medicare Fraud Strike Force case', OIG_df_MFStrikeForce_tag),
    ('Heading', OIG_df_heading),
    ('Description', OIG_df_description),
    ('Hyperlink', OIG_df_hyperlink),
]

df_info = pd.DataFrame()
for column_name, column_values in column_sources:
    df_info[column_name] = column_values

In [8]:
## Sanity check before any further formatting: look at the first records of the frame
## to spot extraction problems (for example, a change in the page format).
df_info.iloc[:30]


Unnamed: 0,Section,Date,Authority,Geographical subdivision,Medicare Fraud Strike Force case,Heading,Description,Hyperlink
0,Criminal and Civil Enforcement,"August 27, 2020","U.S. Attorney's Office, District of New Jersey",,0,New Jersey Electronic Health Records Company t...,"NEWARK, N.J. - An electronic health records co...",https://go.usa.gov/xGCxj
1,Criminal and Civil Enforcement,"August 26, 2020","U.S. Attorney's Office, District of New Jersey",,0,Camden County Man Admits Role in Government Be...,"CAMDEN, N.J. - A Camden, New Jersey, man today...",https://go.usa.gov/xGxun
2,Criminal and Civil Enforcement,"August 26, 2020","U.S. Attorney's Office, District of New Jersey",,0,Fourth Person Admits Trafficking High-Dosage O...,"CAMDEN, N.J. - A Camden County, New Jersey, ma...",https://go.usa.gov/xGxJa
3,Criminal and Civil Enforcement,"August 25, 2020","U.S. Attorney's Office, Eastern District of Ke...",,0,Floyd County Dentist Pleads Guilty to Health C...,"FRANKFORT, Ky.- A McDowell, Ky., man, Denver D...",https://go.usa.gov/xGcfP
4,Criminal and Civil Enforcement,"August 25, 2020","U.S. Attorney's Office, Southern District of T...",,0,Pain doctor pays to settle allegations of dece...,HOUSTON - A 52-year-old pain management physic...,https://go.usa.gov/xGcft
5,Criminal and Civil Enforcement,"August 24, 2020",Department of Justice,,0,DUSA Pharmaceuticals To Pay U.S. $20.75 Millio...,"Massachusetts-based DUSA Pharmaceuticals, Inc....",https://go.usa.gov/xG3UQ
6,Criminal and Civil Enforcement,"August 21, 2020","U.S. Attorney's Office, Southern District of G...",,0,Government obtains more than $5 million in jud...,"BRUNSWICK, GA: A Brunswick, Ga. chiropractor ...",https://go.usa.gov/xGqUN
7,Criminal and Civil Enforcement,"August 21, 2020","U.S. Attorney's Office, Eastern District of Ne...",,0,Former Queens Cardiologist Settles Civil Fraud...,"Ghanshyam Bhambhani, a former Queens cardiolog...",https://go.usa.gov/xGqUs
8,Criminal and Civil Enforcement,"August 20, 2020",U.S. Attorney's Office,Northern District of Texas,0,NextHealth Marketer Charged in $60 Million Kic...,A pharmacy marketer who allegedly collected mo...,https://go.usa.gov/xfSaX
9,Criminal and Civil Enforcement,"August 19, 2020","U.S. Attorney's Office, Northern District of W...",,0,West Virginia physician indicted for illegally...,"CLARKSBURG, WEST VIRGINIA - Dr. Felix Brizuela...",https://go.usa.gov/xfuS9


In [9]:
## Date formatting: convert the 'Date' column with pd.to_datetime
df_info = df_info.assign(Date=pd.to_datetime(df_info['Date']))

## Drop the irrelevant enforcement news (e.g. items that were already collected in an earlier run, or that fall outside the relevant time range), and update the text file that records the most recent date in the existing datasets.

In [10]:
# Name of the text file that stores the most recent date already collected for this section
most_recent_date_file = '{} - most recent.txt'.format(section)

In [11]:
## Read what is the most recent date
# Only the first line of the file is used, so read just that line and drop any
# trailing newline or blanks (e.g. added by a text editor).
with open(raw_data_directory + most_recent_date_file) as file_most_recent:
    most_recent_date = file_most_recent.readline().strip()

if not most_recent_date:
    raise ValueError('No date found on the first line of ' + raw_data_directory + most_recent_date_file)
print(most_recent_date)

## Keep only the news dated strictly after the most recent date in the existing dataset
## (records dated on or before it are dropped). The stored text is parsed explicitly
## so that the comparison is made between dates.
df_info = df_info[df_info['Date'] > pd.to_datetime(most_recent_date)]

2020-01-01


In [12]:
## Update the most recent date in the recording text file

# Newest date among the records kept by the filter; a missing value (NaT/NaN)
# when no record is left.
latest_record_date = df_info['Date'].max()

# The result is stored under its own name: binding it to `date` would shadow the
# `date` class imported from datetime and break this cell when it is run again.
if pd.isna(latest_record_date):
    # Nothing new was collected: record today's date, as this cell did before.
    most_recent_date_text = datetime.now().strftime('%Y-%m-%d')
else:
    most_recent_date_text = latest_record_date.strftime('%Y-%m-%d')

with open(raw_data_directory + most_recent_date_file, "w") as file_most_recent:
    file_most_recent.write(most_recent_date_text)    # by replacing the original date

print(most_recent_date_text)

2020-08-27


In [13]:
# The new records may span more than one year (e.g. when updating in early January
# with some records from late December of the previous year), so each year's records
# go to that year's own file.
year_unique_record = df_info['Date'].dt.year.unique() #Unique values of year in the dataset  (e.g.: Maybe a mix of 2018 or 2019)
print(year_unique_record)

for year_value in year_unique_record:
    year_current = str(year_value)
    csv_name = 'OIG_HHS_Scrape_' + year_current + '_raw.csv'
    csv_path = raw_data_directory + csv_name

    # Write only the rows dated in this year; writing the whole frame would copy
    # every record into every year's file.
    df_year = df_info[df_info['Date'].dt.year == year_value]

    if os.path.isfile(csv_path):
        df_year.to_csv(csv_path, header=False, mode='a')   # Append if already exists
        print(csv_name + ' updated')
    else:
        df_year.to_csv(csv_path)  # Create a new file
        print(csv_name + ' created')

[2020]
OIG_HHS_Scrape_2020_raw.csv created
