In [1]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from Wdam7 news portal
def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find(class_="text-align-left card-content").find_all('h1')]
    print(url)
    return text

# URLs of News in scope
urls = ['https://www.wdam.com/2019/10/25/skimming-device-found-pump-petal-gas-station/',
       'https://www.wdam.com/2019/10/23/spotlighting-gangs-south-mississippi/',
       'https://www.wdam.com/2019/10/22/illegal-deer-hunt-wayne-county-leads-attempted-murder-charges/',
       'https://www.wdam.com/2019/10/21/single-punch-leaves-alabama-teen-critical-condition/',
       'https://www.wdam.com/2019/10/21/laurel-police-investigating-weekend-drive-by-shooting/',
       'https://www.wdam.com/2019/10/21/jones-co-man-who-dealt-drugs-near-childrens-hospital-pleads-guilty-drug-conspiracy-charges/',
       'https://www.wdam.com/2019/10/18/relatives-say-mother-acted-suspiciously-before-abducting-children-copiah-county-home-gunpoint/',
       'https://www.wdam.com/2019/10/17/amber-alert-cancelled-after-children-kidnapped-copiah-co-home-found-safe/',
       'https://www.wdam.com/2019/10/14/pine-belt-men-arrested-child-pornography-charges/',
       'https://www.wdam.com/2019/10/10/arrested-after-meth-found-during-traffic-stop-perry-county/',
       'https://www.wdam.com/2019/10/09/lumberton-honors-victims-survivors-domestic-violence/',
       'https://www.wdam.com/2019/10/11/manhunt-underway-escaped-inmate-jones-county/',
       'https://www.wdam.com/2019/10/13/hattiesburg-police-looking-suspect-grand-larceny-case/',
       'https://www.wdam.com/2019/10/27/superhero-potter-fortnite-costumes-big-sellers-hattiesburg-costume-store/',
       'https://www.wdam.com/2019/10/26/hattiesburg-volunteers-participate-make-difference-day/']

# Unique Id of the news articles
newsIndex = ['M1','M2','M3','M4','M5','M6','M7','M8','M9','M10','M11','M12','M13','M14','M15']

In [2]:
# Actually Requesting transcripts 
transcripts = [url_to_transcript(u) for u in urls]

https://www.wdam.com/2019/10/25/skimming-device-found-pump-petal-gas-station/
https://www.wdam.com/2019/10/23/spotlighting-gangs-south-mississippi/
https://www.wdam.com/2019/10/22/illegal-deer-hunt-wayne-county-leads-attempted-murder-charges/
https://www.wdam.com/2019/10/21/single-punch-leaves-alabama-teen-critical-condition/
https://www.wdam.com/2019/10/21/laurel-police-investigating-weekend-drive-by-shooting/
https://www.wdam.com/2019/10/21/jones-co-man-who-dealt-drugs-near-childrens-hospital-pleads-guilty-drug-conspiracy-charges/
https://www.wdam.com/2019/10/18/relatives-say-mother-acted-suspiciously-before-abducting-children-copiah-county-home-gunpoint/
https://www.wdam.com/2019/10/17/amber-alert-cancelled-after-children-kidnapped-copiah-co-home-found-safe/
https://www.wdam.com/2019/10/14/pine-belt-men-arrested-child-pornography-charges/
https://www.wdam.com/2019/10/10/arrested-after-meth-found-during-traffic-stop-perry-county/
https://www.wdam.com/2019/10/09/lumberton-honors-victi

In [3]:
# # Pickle files for later use

# # Make a new directory to hold the text files
!mkdir transcripts

for i, c in enumerate(newsIndex):
    with open("transcripts/" + c + ".txt", "wb") as file:
         pickle.dump(transcripts[i], file)

mkdir: transcripts: File exists


In [4]:
# Load pickled files
data = {}
for i, c in enumerate(newsIndex):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [5]:
# Double check to make sure data has been loaded properly
data.keys()

dict_keys(['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'M10', 'M11', 'M12', 'M13', 'M14', 'M15'])

In [6]:
# We are going to change this to key: news Id, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [7]:
# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [8]:
# Putting it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',200)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
M1,Skimming device found on pump at Petal gas station
M10,2 arrested after meth found during traffic stop in Perry County
M11,"Lumberton honors victims, survivors of domestic violence"
M12,Manhunt underway for escaped inmate in Jones County
M13,Hattiesburg police looking for suspect in grand larceny case
M14,"Superhero, Potter, Fortnite costumes big sellers at Hattiesburg costume store"
M15,Hattiesburg volunteers participate in Make a Difference Day
M2,Spotlighting gangs in South Mississippi
M3,Illegal deer hunt in Wayne County leads to attempted murder charges
M4,Single punch leaves Alabama teen in critical condition


In [9]:
# Imports the Google Cloud client library
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/Users/coolnabinn/Desktop/Edx/projects/calHacks/CalHacks6-ed923f84ad31.json"

from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# Instantiates a client
client = language.LanguageServiceClient()

In [10]:
#extracting location from the news headline using GOOGLE CLOUD API
def locExt(i):
    document=language.types.Document(
        content=data_df.transcript.loc[i],
        type=language.enums.Document.Type.PLAIN_TEXT,
    )
    response=client.analyze_entities(
        document=document,
        encoding_type='UTF32',
    )
    for entity in response.entities: 
        #print('=' * 20)
        #print('         name: {0}'.format(entity.name))
        #print('         type: {0}'.format(entity.type))
        #print('     metadata: {0}'.format(entity.metadata))
        #print('     salience: {0}'.format(entity.salience))
        #print(entity)
        if entity.type==2: 
            print(entity.name)
            return entity.name

In [11]:
#printing the extracted location of incident
location={}
for i in data: 
    location[i]=locExt(i)

gas station
South Mississippi
Wayne County
Alabama
Laurel
hospital
home
Copiah Co.
Pine Belt
traffic stop
Lumberton
Jones County
Hattiesburg
costume store
Hattiesburg


In [12]:
# printing the dictionary with key and location
print(location)

{'M1': 'gas station', 'M2': 'South Mississippi', 'M3': 'Wayne County', 'M4': 'Alabama', 'M5': 'Laurel', 'M6': 'hospital', 'M7': 'home', 'M8': 'Copiah Co.', 'M9': 'Pine Belt', 'M10': 'traffic stop', 'M11': 'Lumberton', 'M12': 'Jones County', 'M13': 'Hattiesburg', 'M14': 'costume store', 'M15': 'Hattiesburg'}


In [13]:
#building a dictionary and encoding the severity of crime index
#severity scores are choosen in accordance with the criminal activities affecting the 
severDict = {'murder':0.9,'kill':0.9,'terror':0.9,'shoot':0.9,
             'rob':0.8,'rape':0.8,'collaps':0.75,'collide':0.75,'accident':0.75,
             'destroy':0.75,'robbery':0.75,'steal':0.65,'missing':0.3,'lost':'0.3','bully':0.35,
             'bully':0.35,'beat':0.65,'gangs':0.4,'hunt':0.4,'punch':0.2,'critical':0.6,'drugs':0.7,'conspiracy':0.65,
             'suspicious':0.55,'abduct':0.5,'kidnap':0.75,'pornography':0.8,
             'arrest':0.3,'meth':0.7,'violence':0.4,'manhunt':0.6,'racism':0.4,'excape':0.45,
             'suspect':0.4,'device':0.2
            }

In [14]:
# using NLP tools to extract the incident of crime and the severity
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

#extracting the word root to compare against the dictonary word
ps =PorterStemmer()

def severIndx(text):
    sum=0
    count=0
    newsTitle=data_df.transcript.loc[text]
    #print(newsTitle)
    for word in newsTitle.split():
        #print(word)
        for j in severDict:
            if word.lower() ==j or ps.stem(word.lower()) == j:
                count+=1
                sum+=severDict[j]
    if count!=0:
        return sum/count
    else:
        return 0

In [15]:
# building a new dictionary with the severity index for the key-categorized city
wordInContext={}
for i in data:
    wordInContext[i]=severIndx(i)
    #wordInContext[i]=severIndx(i)

In [16]:
#printing the dictionary with key and severity index
print(wordInContext)

{'M1': 0.2, 'M2': 0.4, 'M3': 0.65, 'M4': 0.4, 'M5': 0.9, 'M6': 0.675, 'M7': 0.5, 'M8': 0.75, 'M9': 0.55, 'M10': 0.5, 'M11': 0.4, 'M12': 0.6, 'M13': 0.4, 'M14': 0, 'M15': 0}


In [24]:
# merging the location and severity dectionaries with similar key
ds = [location, wordInContext]
d = {}
for k in location.keys():
  d[k] = tuple(d[k] for d in ds)

In [25]:
# printing the merged dictionary
print(d)

{'M1': ('gas station', 0.2), 'M2': ('South Mississippi', 0.4), 'M3': ('Wayne County', 0.65), 'M4': ('Alabama', 0.4), 'M5': ('Laurel', 0.9), 'M6': ('hospital', 0.675), 'M7': ('home', 0.5), 'M8': ('Copiah Co.', 0.75), 'M9': ('Pine Belt', 0.55), 'M10': ('traffic stop', 0.5), 'M11': ('Lumberton', 0.4), 'M12': ('Jones County', 0.6), 'M13': ('Hattiesburg', 0.4), 'M14': ('costume store', 0), 'M15': ('Hattiesburg', 0)}


In [26]:
#using GOOGLE API to encode the locations to corresponding lattitudes and longitudes
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent='myapplication')
latLong={}
for i in d: 
        location = geolocator.geocode(d[i][0])
        try:
            latLong[i]=(location.latitude, location.longitude,d[i][0],d[i][1])
        except GeocoderTimedOut:
            latLong[i]=(0,0, d[i][0],d[i][1])


In [27]:
# dumping the merged dictionary to jason file
import json
app_json = json.dumps(latLong)
print(app_json)
with open('app.json', 'w') as fp:
    json.dump(latLong, fp)

{"M1": [17.6138859, 121.69574807689551, "gas station", 0.2], "M2": [32.9715645, -89.7348497, "South Mississippi", 0.4], "M3": [38.4251958, -88.4197678, "Wayne County", 0.65], "M4": [33.2588817, -86.8295337, "Alabama", 0.4], "M5": [31.739966000000003, -89.13265868703502, "Laurel", 0.9], "M6": [42.7051669, -7.1013033, "hospital", 0.675], "M7": [51.8277408, 9.0207583, "home", 0.5], "M8": [31.8563532, -90.4798717, "Copiah Co.", 0.75], "M9": [35.8989818, 14.5136759, "Pine Belt", 0.55], "M10": [52.9191782, -1.4708665, "traffic stop", 0.5], "M11": [34.6183433, -79.0083993, "Lumberton", 0.4], "M12": [32.7135946, -99.8606106, "Jones County", 0.6], "M13": [31.3271189, -89.2903392, "Hattiesburg", 0.4], "M14": [51.5225546, -0.26001242770184324, "costume store", 0], "M15": [31.3271189, -89.2903392, "Hattiesburg", 0]}
