<a href="https://colab.research.google.com/github/phoid/WebScrapers/blob/main/Colorado_KRDO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Scraping Crime under KRDO news
The scrapper collectes data from KRDO crime news(https://krdo.com/news/crime/)

In [1]:
import requests                # to request the webpage
from bs4 import BeautifulSoup  # to make soup and pull data out of HTML
import urllib.robotparser      # to check the legitimacy to scrap the web
import json                    # to save the output as json file
import pandas as pd            # to  see saved data as dataframe
from datetime import datetime  # to get the current datetime
#import IPython                 # to display the webpage

In [2]:
#from newspaper import Article
#url= "https://krdo.com/news/2021/09/06/pueblo-police-man-arrested-in-connection-to-shooting/"
#article= Article(url)
#article.download()
#article.parse()

### Check permision to scrap the webpage

In [3]:
robotpars = urllib.robotparser.RobotFileParser()  #instantiate the RobotFileParser

#set the robots.txt url of cbs local news
robotpars.set_url("https://krdo.com/robots.txt")
robotpars.read() # Reads the robots.txt

# to check if useragent can fetch the url, true means fetching is possible.
print("Can we fetch the website?", \
      robotpars.can_fetch("*", "https://krdo.com/"))


Can we fetch the website? True


In [4]:
def getSoup(url):
    """make a soup for a webpage given its url"""

    # request the webpage and get the text
    pagetext= requests.get(url).text
    #make a soup and use html parser for the content of the web page
    soup= BeautifulSoup(pagetext, "html.parser")
    return soup

def getNewsLink(soup):
  """Returns a list containing the links of news article.
       soup: is the soup of a webpage from which the links will be extracted.
     """
  news_list=[] # place holder to collect all the links of the news article

  try:
    articles = soup.find_all("h3", attrs={"class": "story__title hdg hdg--4"})
    for artc in articles:
      link= artc.a["href"] # get the link for each news article
      news_list.append(link)
  except:
    pass
  return news_list

In [5]:
url= 'https://krdo.com/news/crime/'
soup= getSoup(url)
getNewsLink(soup)

['https://krdo.com/news/2024/03/06/owner-of-penrose-funeral-home-misses-sobriety-monitoring-cause-hearing-scheduled/',
 'https://krdo.com/news/2024/03/06/remains-of-missing-el-paso-county-woman-found-in-fremont-county-deputies-now-investigating/',
 'https://krdo.com/news/2024/03/06/craftsman-homes-owners-arrested-after-13-investigates-uncovers-millions-of-dollars-of-fraud/',
 'https://krdo.com/news/2024/03/03/pueblo-police-investigating-overnight-shooting-as-a-homicide-after-one-man-found-dead/',
 'https://krdo.com/news/2024/03/01/dv-case-against-cos-woman-convicted-of-dragging-tow-truck-driver-to-death-dismissed/',
 'https://krdo.com/news/2024/02/29/quiet-colorado-springs-neighborhood-shocked-after-being-struck-by-stray-bullets/',
 'https://krdo.com/news/2024/02/28/pueblo-county-couple-charged-with-mistreating-over-20-animals/',
 'https://krdo.com/news/2024/02/21/victim-identified-in-fatal-vehicle-pedestrian-crash-on-north-cascade-avenue/',
 'https://krdo.com/news/2024/02/27/new-alleg

### Collect the data

In [6]:
def getNewsInfo(news_link):
    ''' Returns a dictionary containing infomation about the news such as headline, content and published data.
        news_link: the link from which the information is collected'''

    Url = news_link
    Source = "Article_KRDO"  # the same for all links obtained from the main source

    news_soup= getSoup(news_link) # make a soup

    # Get Headline
    #==============================
    try:  # try if title can be found
        Headline= news_soup.find("h1",attrs={"class":"hdg hdg--3"}).get_text() # get the title .append(title)
    except:
        Headline="NA"

   #Get Published Date and Time
    #=================================
    try:
        PublishedDateTime= news_soup.find("span",attrs={"class":"meta__date-time-updated"}).string  # get the date
    except:
        PublishedDateTime= "NA"


    # Get Content
    #===================================

    text=[] #placeholder to collect contents from multiple paragraphs
    try:
        cont= news_soup.find("div",attrs={"class":"entry__content"})
        for c in cont.find_all("p"): # loop over each paragraph
            #get the text in each paragraph and append them
            text.append(c.get_text(separator=" ", strip=True).replace("\xa0", " "))

        Content=" ".join(text) # concatnate the paragraphs to make a single string

    except:
        Content="NA"

    # collect  all the data as dictionay
    data= {"Source": Source, 'Url': Url, 'PublishedDateTime': PublishedDateTime, 'Headline': Headline, 'Content': Content}

        # gather all companies info
    return data

In [7]:
url= "https://krdo.com/news/crime/" # the page we are going to scrap
soup=getSoup(url)
news_url= getNewsLink(soup) #get list of category links
all_data=[] # place holder to collect all the data


for link in news_url:
    all_data.append(getNewsInfo(link))

data= pd.DataFrame(all_data) # make a dataframe

In [8]:
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content
0,Article_KRDO,https://krdo.com/news/2024/03/06/owner-of-penr...,"March 6, 2024 3:24 PM",Owner of Penrose funeral home misses sobriety ...,"COLORADO SPRINGS, Colo. (KRDO) - One of the ow..."
1,Article_KRDO,https://krdo.com/news/2024/03/06/remains-of-mi...,"March 6, 2024 2:09 PM",Remains of missing El Paso County woman found ...,"FREMONT COUNTY, Colo. (KRDO) -- The El Paso Co..."
2,Article_KRDO,https://krdo.com/news/2024/03/06/craftsman-hom...,"March 6, 2024 10:00 AM",Craftsman Homes owners arrested after 13 Inves...,"EL PASO COUNTY, Colo. (KRDO) -- The El Paso Co..."
3,Article_KRDO,https://krdo.com/news/2024/03/03/pueblo-police...,,Pueblo Police investigating overnight shooting...,"PUEBLO, Colo. (KRDO) -- The Pueblo Police Depa..."
4,Article_KRDO,https://krdo.com/news/2024/03/01/dv-case-again...,"March 1, 2024 6:19 PM",DV case against COS woman convicted of draggin...,"COLORADO SPRINGS, Colo. (KRDO) -- The 2023 dom..."
5,Article_KRDO,https://krdo.com/news/2024/02/29/quiet-colorad...,,‘Quiet’ Colorado Springs neighborhood shocked ...,"COLORADO SPRINGS, Colo. (KRDO) - Families in e..."
6,Article_KRDO,https://krdo.com/news/2024/02/28/pueblo-county...,,Pueblo County couple charged with mistreating ...,"PUEBLO COUNTY, Colo. (KRDO) - A Pueblo County ..."
7,Article_KRDO,https://krdo.com/news/2024/02/21/victim-identi...,"February 28, 2024 12:16 PM",Victim identified in fatal vehicle-pedestrian ...,"COLORADO SPRINGS, Colo. (KRDO) -- The El Paso ..."
8,Article_KRDO,https://krdo.com/news/2024/02/27/new-allegatio...,,New allegations against Colorado Springs gym a...,"COLORADO SPRINGS, Colo. (KRDO) - A former Colo..."
9,Article_KRDO,https://krdo.com/news/2024/02/27/colorado-spri...,"February 27, 2024 11:58 AM",Colorado Springs police investigating after sh...,"COLORADO SPRINGS, Colo. (KRDO) - The Colorado ..."


### Add more features
(Code taken from Manoji and modified a littel bit)

In [9]:
# run this command for every first excecution of the notebook
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
import spacy
from spacy.lang.en import English
import glob
import os

!pip install googletrans #if you get the missing translate module take out the # before !pip and run it again
from googletrans import Translator



In [11]:
def language_detect(data):
    trans = Translator()
    #print(trans.translate(data).text)
    try:
      a = trans.detect(data).lang
      #b= trans.translate(data, dest='en').text
    except:
      a= 'Unknown'
      #b= 'Translation Failed'

    return a

def language_translate(data):
    trans = Translator()
    #print(trans.translate(data).text)
    try:
      #a = trans.detect(data).lang
      b= trans.translate(data, dest='en').text
    except:
      #a= 'Unknown'
      b= 'Translation Failed'

    return b

def key_search(data, key):
  #print(data)
    if data.lower().find(key.lower()) <0:
      return 0
    else:
      return 1

def keyword_scores(data, keys):
    res = {}
    try:
      for key in keys:
        l = keys[key]
        #print(l)
        res[key] = sum(list( map( lambda x: key_search(data, x), l)))
    except:
      res[key] = 0

    return res

In [12]:
import json, requests, urllib, io

#user='my_github_username'
#pao='my_pao'


github_session = requests.Session()
#github_session.auth = (user, pao)

# providing raw url to download csv from github
csv_url = 'https://raw.githubusercontent.com/AR-github-AWS/testrepo/main/Keywords%20for%20Data%20Science.csv?token=GHSAT0AAAAAABTJTADX3DUWZIYFQ3PFQF34YSZYMOA'

download = github_session.get(csv_url).content
downloaded_csv = pd.read_csv(io.StringIO(download.decode('utf-8')), error_bad_lines=False)
df_keywords = downloaded_csv
df_keywords = df_keywords.astype('str').replace('nan')
keywords = df_keywords.to_dict('LIST')



  downloaded_csv = pd.read_csv(io.StringIO(download.decode('utf-8')), error_bad_lines=False)


In [13]:
def GetFacetCols(dataframe,ColumnName):
    nlp = spacy.load("en_core_web_sm")
    OrgFacet=[]
    PersonFacet=[]
    LocationFacet=[]
    MoneyFacet = []
    LanguageFacet = []
    DateFacet = []
    #for i in range(len(dataframe)):
    for row in dataframe[ColumnName]:

        #data=dataframe[ColumnName][i]
        content = nlp(str(row))
        Org=[]
        Person=[]
        Location=[]
        Money = []
        Language = []
        Date = []

        try:

            for ent in content.ents:  # loop over the entities
                if(ent.label_ == "ORG"):
                    if ent.text not in Org:
                        Org.append(ent.text)
                if(ent.label_ == "PERSON"):
                    if ent.text not in Person:
                        Person.append(ent.text)
                if(ent.label_ == "GPE"):
                    if ent.text not in Location:
                        Location.append(ent.text)
                if(ent.label_ == "MONEY"):
                    if ent.text not in Money:
                        Money.append(ent.text)
                if(ent.label_ == "LANGUAGE"):
                    if ent.text not in Language:
                        Language.append(ent.text)
                if(ent.label_ == "DATE"):
                    if ent.text not in Date:
                        Date.append(ent.text)

        except:
            pass

        finally:
            OrgFacet.append(Org)
            PersonFacet.append(Person)
            LocationFacet.append(Location)
            MoneyFacet.append(Money)
            LanguageFacet.append(Language)
            DateFacet.append(Date)



    dataframe['keyword score'] = list(map( lambda x: sum(keyword_scores(x, keywords).values()), dataframe[ColumnName]))
    dataframe['LanguageFacet'] = list( map( language_detect, dataframe[ColumnName]))
    dataframe['Translated Content'] = list( map( language_translate, dataframe[ColumnName]))
    dataframe['GeoFacet']=LocationFacet
    dataframe['OrgFacet']=OrgFacet
    dataframe['PeopleFacet']=PersonFacet
    dataframe['MoneyFacet'] = MoneyFacet
    dataframe['DateFacet'] = DateFacet

    return dataframe

In [14]:
GetFacetCols(data, 'Content')
# adding empty columns to the data frame
data["Threat"] = " "
data["Useful"] = " "
data["Comment"] = " "

###Data is here

In [15]:
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content,keyword score,LanguageFacet,Translated Content,GeoFacet,OrgFacet,PeopleFacet,MoneyFacet,DateFacet,Threat,Useful,Comment
0,Article_KRDO,https://krdo.com/news/2024/03/06/owner-of-penr...,"March 6, 2024 3:24 PM",Owner of Penrose funeral home misses sobriety ...,"COLORADO SPRINGS, Colo. (KRDO) - One of the ow...",0,Unknown,Translation Failed,"[Colo., Penrose]","[KRDO, Office]","[Carie Hallford, Carie]",[],"[today, March 6, March 14]",,,
1,Article_KRDO,https://krdo.com/news/2024/03/06/remains-of-mi...,"March 6, 2024 2:09 PM",Remains of missing El Paso County woman found ...,"FREMONT COUNTY, Colo. (KRDO) -- The El Paso Co...",0,Unknown,Translation Failed,"[FREMONT COUNTY, Colo., El Paso County, Fremon...",[EPSO],"[Beth Aper, Aper, Sheriff Joseph Roybal, Orr R...",[],"[February 24, 2024, March 4, 2024, nearly a ye...",,,
2,Article_KRDO,https://krdo.com/news/2024/03/06/craftsman-hom...,"March 6, 2024 10:00 AM",Craftsman Homes owners arrested after 13 Inves...,"EL PASO COUNTY, Colo. (KRDO) -- The El Paso Co...",0,Unknown,Translation Failed,"[EL PASO COUNTY, Colo., Monument, the El Paso ...","[Craftsman Homes and Interiors, County Sheriff...","[Dwight, Joni Mulberry, Roger Ewer, Ewers, San...",[more than $2 million],"[Tuesday, last July, the Mulberrys hundreds of...",,,
3,Article_KRDO,https://krdo.com/news/2024/03/03/pueblo-police...,,Pueblo Police investigating overnight shooting...,"PUEBLO, Colo. (KRDO) -- The Pueblo Police Depa...",0,Unknown,Translation Failed,"[Colo., the City of Pueblo's]","[PUEBLO, KRDO, The Pueblo Police Department, C...",[],[],"[2024, Sunday]",,,
4,Article_KRDO,https://krdo.com/news/2024/03/01/dv-case-again...,"March 1, 2024 6:19 PM",DV case against COS woman convicted of draggin...,"COLORADO SPRINGS, Colo. (KRDO) -- The 2023 dom...",0,Unknown,Translation Failed,"[Colo., Colorado Springs, El Paso, El Paso Cou...","[Detra Farries, Colorado Department of Correct...","[Allen Rose, Dan May, Rose, Alan Rose, Jeremy ...",[],"[2023, 2011, Thursday, 20 years, 2021, May, No...",,,
5,Article_KRDO,https://krdo.com/news/2024/02/29/quiet-colorad...,,‘Quiet’ Colorado Springs neighborhood shocked ...,"COLORADO SPRINGS, Colo. (KRDO) - Families in e...",0,Unknown,Translation Failed,"[Colo., Colorado Springs, Cimarron Hills, El P...",[Cases],"[Mike Papes, County Sheriff's]",[],"[Wednesday, 2-year-old, yesterday]",,,
6,Article_KRDO,https://krdo.com/news/2024/02/28/pueblo-county...,,Pueblo County couple charged with mistreating ...,"PUEBLO COUNTY, Colo. (KRDO) - A Pueblo County ...",0,Unknown,Translation Failed,"[PUEBLO COUNTY, Colo., Pueblo County, KRDO13]",[Animal Law Enforcement],"[Elizabeth Barns-Mcdaniel, Justin Latka, Anima...",[],"[June of last year, earlier this month, next m...",,,
7,Article_KRDO,https://krdo.com/news/2024/02/21/victim-identi...,"February 28, 2024 12:16 PM",Victim identified in fatal vehicle-pedestrian ...,"COLORADO SPRINGS, Colo. (KRDO) -- The El Paso ...",0,Unknown,Translation Failed,"[Colo., El Paso, Colorado Springs]","[North Cascade Avenue, Colorado Springs, Cameron]","[County Coroner's, Josephine Cameron, Cameron]",[],"[78-year-old, February 1, 2024, February 6, 20...",,,
8,Article_KRDO,https://krdo.com/news/2024/02/27/new-allegatio...,,New allegations against Colorado Springs gym a...,"COLORADO SPRINGS, Colo. (KRDO) - A former Colo...",0,Unknown,Translation Failed,"[Colo., Colorado Springs, University Village, ...","[North Academy Fitness, the Blue Mountain Crea...","[Forsham Williams Jr., Williams, Michael Van S...","[more than $400,000, 65,000, 150,000, over $10...","[Tuesday, December 2022, that day, 2022, Later...",,,
9,Article_KRDO,https://krdo.com/news/2024/02/27/colorado-spri...,"February 27, 2024 11:58 AM",Colorado Springs police investigating after sh...,"COLORADO SPRINGS, Colo. (KRDO) - The Colorado ...",0,Unknown,Translation Failed,"[Colo., Palmer Park]","[The Colorado Springs Police Department, CSDP,...",[],[],[],,,


#### Save the data

In [16]:
# storing at "output" dir
# take out the # on the 3 lines below to save to drive folders
#date = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
#file_name = "Article_KRDO_" +date+ ".csv"
#data.to_csv(file_name, index = False)




# The below code is old but a good idea for how we would store the scrape with omniscient
# Storing a copy for analysts in the "analysts" dir
#data.to_csv( "/dbfs/mnt/analysts/" + file_name, index = False)

In [17]:
#!pip install boto3

In [18]:
#import boto3

In [19]:
#s3 = boto3.resource(
 #   service_name='s3',
  #  region_name='us-east-2',
   # aws_access_key_id='AKIAUOZ7CQ6Y2T76W3PT',
   # aws_secret_access_key='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'
#)

In [20]:
#for bucket in s3.buckets.all():
 # print(bucket.name)