In [1]:
import geograpy
import pandas
import numpy as np
import re

## Check if hyperlink exists

In [2]:
import http.client
import urllib
import urllib.error
import urllib.request

import requests

def test_hyperlink(link):
    if link[0:4] == "http":
        try:
            request=urllib.request.urlopen(link)
            if request.code == 200:    #Website exists
                return 1
            else:                             #Website does not work
                return 0
        except urllib.error.HTTPError as e:
            return 0
        except urllib.error.URLError as e:
            return 0

## Self-defined functions for extracting geographical location and money involved from news descriptions

In [3]:
## Try several versions of location extraction


def simple_location_extract(desc):
    """Return the text that precedes the first hyphen, treated as a dateline.

    The prefix is returned (untrimmed) only when the first ``'-'`` sits at
    an index from 1 to 17 inclusive. If there is no hyphen, or it is the very
    first character, or it appears at index 18 or later, an empty string is
    returned.
    """
    dash_at = desc.find('-')
    if 0 < dash_at < 18:
        return desc[:dash_at]
    return ''

In [4]:
## Try location scraper

def geo_cities_extract(desc):
    """Return the ``cities`` attribute of the place context geograpy builds from ``desc``."""
    place_context = geograpy.get_place_context(text=desc)
    return place_context.cities

In [5]:
def geo_regions_extract(desc):
    """Return the ``regions`` attribute of the place context geograpy builds from ``desc``."""
    place_context = geograpy.get_place_context(text=desc)
    return place_context.regions

In [6]:
## Try place scraper

def geo_places_extract(desc):
    """Return the ``places`` attribute of the place context geograpy builds from ``desc``."""
    place_context = geograpy.get_place_context(text=desc)
    return place_context.places

In [7]:
# Extract money involved or punishment amount

def money_involved(desc):
    """Extract the monetary amounts mentioned in a piece of text.

    An amount is a pound, dollar or euro sign followed by digits, with
    optional comma-separated groups and an optional decimal part. The
    currency symbol and the commas are dropped from the result. When a
    magnitude unit (``thousand``/``million``/``billion`` in lower or title
    case, or ``M``) follows the number, either directly or after a single
    whitespace or hyphen, the unit is appended: with a space if a separator
    was present, otherwise without one.

    Parameters
    ----------
    desc : str
        Text to scan.

    Returns
    -------
    None, str or list of str
        ``None`` when no amount is found, the amount itself when exactly one
        is found, and a list of amounts (in order of appearance) otherwise.
    """
    # The decimal point is matched literally and must be followed by digits,
    # so a sentence-ending "." or the space before a unit is never swallowed.
    amount_pattern = r"[£$€]\d+(?:,\d+)*(?:\.\d+)?"
    list_of_units = ['thousand', 'million', 'billion', 'Thousand', 'Million', 'Billion', 'M']
    # The trailing \b stops a unit from matching the start of a longer word
    # (e.g. the "M" of "Medicaid" or "Medicare").
    unit_pattern = (
        r"([\s\-]?)("
        + "|".join(re.escape(unit) for unit in list_of_units)
        + r")\b"
    )

    money_array = []
    for amount in re.finditer(amount_pattern, desc):
        # Drop the leading currency symbol and the thousands separators.
        money = amount.group()[1:].replace(',', '')

        # Add back a quantifying unit such as 'thousand' / 'million' if one
        # immediately follows the number.
        unit = re.match(unit_pattern, desc[amount.end():])
        if unit is not None:
            separator = ' ' if unit.group(1) else ''
            money = money + separator + unit.group(2)

        money_array.append(money)

    if not money_array:            # None exist
        return None
    if len(money_array) == 1:      # Only one element - to be stored directly
        return money_array[0]
    return money_array

# Putting it into practice: apply the extractors to the cleaned scrape files

In [9]:
import datetime

current_year = datetime.datetime.now().year


def _has_text(value):
    """Tell whether a cell holds a genuine, non-empty ``str``."""
    return type(value) is str and len(value) > 0


# Only the current year is processed. Widen the range to reprocess earlier
# years, e.g. range(2019, current_year + 1).
for year in range(current_year, current_year + 1):
    filename = 'OIG_HHS_Scrape_' + str(year) + '_cleaned'
    df = pandas.read_csv('Cleaned/Intermediate/' + filename + '.csv')

    # (source column, new column, extractor) - applied in this order.
    # The geograpy-based extractors are currently disabled; to enable one, add
    # e.g. ('description', 'Description_Places', geo_places_extract),
    #      ('description', 'Description_Regions', geo_regions_extract),
    #      ('description', 'Description_Cities', geo_cities_extract),
    # and the matching 'Heading_*' entries for the 'heading' column.
    derived_columns = [
        # From description
        ('description', 'Description_Location', simple_location_extract),
        ('description', 'Description_Money_involved', money_involved),
        # From title
        ('heading', 'Heading_Location', simple_location_extract),
        ('heading', 'Heading_Money_involved', money_involved),
    ]
    for source, target, extractor in derived_columns:
        # Cells that are not non-empty strings (e.g. NaN) pass through as-is.
        df[target] = df[source].apply(
            lambda cell, fn=extractor: fn(cell) if _has_text(cell) else cell
        )

    # Test hyperlink; -999 marks rows that have no usable link text.
    df['hyperlink_valid'] = df['hyperlink'].apply(
        lambda cell: test_hyperlink(cell) if _has_text(cell) else -999
    )

    # Written without the DataFrame index.
    output_path = 'Cleaned/' + filename + '_v2.xlsx'
    df.to_excel(output_path, sheet_name='Sheet1', index=False)
    print(str(year) + ' completed')
    

2019 completed
2020 completed
