In [1]:
import geograpy
import pandas
import numpy as np
import re

## Check if hyperlink exists

In [2]:
import requests
import urllib

def test_hyperlink(link):
    link = link.encode('ascii', 'ignore').decode('ascii')

    if link[0:4] == "http":
        try:
            request=urllib.request.urlopen(link)
            if request.code == 200:    #Website exists
                return 1
            else:                             #Website does not work
                return 0
        except urllib.error.HTTPError as e:
            return 0
        except urllib.error.URLError as e:
            return 0

## Helper functions for extracting geographic location and monetary amounts from news descriptions

In [3]:
## Try several versions of location extraction

def simple_location_extract(desc):
    """Return the text that precedes the first hyphen in `desc`.

    The prefix is returned only when the first hyphen sits at index 1
    through 17; otherwise (no hyphen, hyphen at index 0, or hyphen at
    index 18 or later) an empty string is returned.
    """
    dash_at = desc.find('-')
    if 0 < dash_at < 18:
        return desc[:dash_at]
    return ''

In [4]:
# State names are checked in this order and the first one found wins.
# "West Virginia" is listed ahead of "Virginia" because the shorter name is a
# substring of the longer one and would otherwise always shadow it.
_STATE_NAMES = (
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
    "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
    "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
    "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
    "New Hampshire", "New Jersey", "New Mexico", "New York",
    "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
    "Tennessee", "Texas", "Utah", "Vermont", "West Virginia", "Virginia",
    "Washington", "Wisconsin", "Wyoming", "Puerto Rico",
)


def state_extract(desc):
    """Return the first listed state name that occurs in `desc`.

    Matching is a case-sensitive substring search. Names are tried in the
    order of `_STATE_NAMES`, not in the order they appear in the text.

    Args:
        desc: Text to search.

    Returns:
        The matching state name, or '' when none of the names occurs.
    """
    return next((state for state in _STATE_NAMES if state in desc), '')

In [5]:
## Try location scraper

def geo_cities_extract(desc):
    """Return the `cities` attribute of the place context geograpy builds from `desc`."""
    place_context = geograpy.get_place_context(text=desc)
    return place_context.cities

In [6]:
def geo_regions_extract(desc):
    """Return the `regions` attribute of the place context geograpy builds from `desc`."""
    place_context = geograpy.get_place_context(text=desc)
    return place_context.regions

In [7]:
## Try place scraper

def geo_places_extract(desc):
    """Return the `places` attribute of the place context geograpy builds from `desc`."""
    place_context = geograpy.get_place_context(text=desc)
    return place_context.places

In [8]:
# Extract money involved or punishment amount

# One monetary amount:
#   * a currency symbol,
#   * digits with optional thousands separators and an optional decimal
#     fraction (the decimal point is literal, so the character after the
#     amount is no longer swallowed),
#   * optionally a magnitude unit that either touches the amount or is
#     separated from it by a single whitespace character or hyphen.
# The abbreviation "M" only counts when it is not the first letter of a
# longer word, so "$5 Medicare" is not read as 5M.
_MONEY_PATTERN = re.compile(
    r"[£$€]"
    r"(\d[\d,]*(?:\.\d+)?)"
    r"(?:([\s\-]?)"
    r"(thousand|million|billion|Thousand|Million|Billion|M(?![A-Za-z])))?"
)


def money_involved(desc):
    """Extract the monetary amounts mentioned in `desc`.

    Each amount is returned as text with the currency symbol and thousands
    separators removed. A recognised magnitude unit ('thousand', 'million',
    'billion', their capitalised forms, or 'M') is appended: joined with a
    space when the text had a separator between number and unit
    ("$1.5 million" -> "1.5 million") and directly otherwise
    ("$10M" -> "10M").

    Args:
        desc: Text to search.

    Returns:
        None when no amount is found, a single string when exactly one
        amount is found, and a list of strings (in order of appearance) when
        several are found.
    """
    amounts = []
    for match in _MONEY_PATTERN.finditer(desc):
        number, separator, unit = match.groups()
        amount = number.replace(',', '')
        if unit:
            amount += (' ' if separator else '') + unit
        amounts.append(amount)

    if not amounts:
        return None
    if len(amounts) == 1:    # a single hit is stored directly, not as a list
        return amounts[0]
    return amounts

# Apply the extraction helpers to the scraped dataset

In [9]:
# Load the intermediate scrape from the Raw/ folder.
filename = 'OIG_HHS_Scrape_allyears_intermediate.csv'
df = pandas.read_csv('Raw/' + filename)


def _is_nonempty_text(value):
    """Return True only for plain `str` values holding at least one character."""
    return type(value) is str and len(value) > 0


# (column suffix, extractor) pairs applied to every text source below.
field_extractors = [
    ('Location', simple_location_extract),
    ('State', state_extract),
    ('Money_involved', money_involved),
]

# (source column, new-column prefix, progress message)
text_sources = [
    ('description', 'Description', 'Desc done'),
    ('heading', 'Heading', 'Title done'),
]

for source_col, prefix, done_message in text_sources:
    for suffix, extractor in field_extractors:
        # Cells that are not non-empty strings (e.g. NaN) pass through unchanged.
        df[prefix + '_' + suffix] = df[source_col].apply(
            lambda cell, fn=extractor: fn(cell) if _is_nonempty_text(cell) else cell
        )
    print(done_message)

# Test each hyperlink; -999 marks rows with no usable link text.
df['hyperlink_valid'] = df['hyperlink'].apply(
    lambda url: test_hyperlink(url) if _is_nonempty_text(url) else -999
)

df.to_excel('Cleaned/OIG_HHS_Scrape_allyears.xlsx', sheet_name='Sheet1', index=False)

Desc done
Title done


  force_unicode(url))
  force_unicode(url))


In [12]:
# Write the completed dataset to CSV (no index column).
cleaned_csv_path = 'Cleaned/OIG_HHS_Scrape_allyears.csv'
df.to_csv(cleaned_csv_path, index=False)