# 1. Using BeautifulSoup

In [None]:
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd

## 1.1 Parsing an Article From Kharon

Parsing a simple paragraph from "https://brief.kharon.com/updates/nigerian-men-sanctioned-by-us-for-business-email-compromise-romance-fraud/".

In [24]:
# Address of the Kharon article to scrape; passed to get_soup() in a later cell.
url = "https://brief.kharon.com/updates/nigerian-men-sanctioned-by-us-for-business-email-compromise-romance-fraud/"

In [38]:
# read the page
def get_soup(url):
    """Download `url` and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch; handed unchanged to `urlopen`.

    Returns:
        The BeautifulSoup object built from the raw response bytes with the
        "html.parser" backend.

    Raises:
        Any exception raised by `urlopen` or by reading the response
        propagates to the caller unchanged.
    """
    # The context manager closes the HTTP response even when read() fails;
    # previously the response object was never closed.
    with uReq(url) as uClient:
        page_html = uClient.read()
    return soup(page_html, "html.parser")

# remove control characters from string
import unicodedata
def remove_control_characters(s):
    """Return `s` without any character whose Unicode category starts with "C".

    That covers the whole "Other" group reported by `unicodedata.category`
    (e.g. Cc control characters such as "\\n" and "\\t", and Cf format
    characters such as the zero-width space).
    """
    kept = []
    for char in s:
        if not unicodedata.category(char).startswith("C"):
            kept.append(char)
    return "".join(kept)

In [39]:
# Fetch the article and keep every <div> whose class list contains "copy".
page_soup = get_soup(url)
content = page_soup.find_all("div", attrs={"class": "copy"})

In [40]:
# Build one string from all matched elements, each preceded by a newline.
text = "".join("\n" + paragraph.text for paragraph in content)

In [47]:
# text processing
# Collapse every whitespace run to a single space *before* stripping control
# characters. "\n" is itself a control character (category Cc), so stripping
# first deleted the newline separators outright and glued the text on either
# side together. str.split() with no argument also treats "\xa0" as
# whitespace, so the explicit non-breaking-space replacement is covered too.
text = remove_control_characters(" ".join(text.split()))
print(text[:1000])

The U.S. Treasury Department sanctioned six Nigerian men for their roles in cybercriminal schemes to steal more than USD 6 million from businesses and individuals across the U.S.The sanctioned individuals pursued Americans through global threats known as business email compromise (BEC) and romance fraud, according to the Treasury. They manipulated victims to gain access to usernames, passwords and bank accounts to further the schemes, the Treasury said. The romance fraudsters used online tools to engage their targets, the Treasury said. “Cybercriminals prey on vulnerable Americans and small businesses to deceive and defraud them,” said Treasury Secretary Steven T. Mnuchin. “As technological advancement increasingly offers malicious actors tools that can be used for online attacks and schemes, the United States will continue to protect and defend at-risk Americans and businesses.”The six Nigerians were sanctioned under the Treasury’s authority to designate people or companies engaged in

## 1.2 Parsing Game Info From Steam to Form a Table

Parsing the item name, current price, original price, discount, and supported operating systems of the top sellers on Steam.

https://store.steampowered.com/search/?filter=topsellers&category1=998,996

In [185]:
# Steam top-sellers search page to scrape.
url = "https://store.steampowered.com/search/?filter=topsellers&category1=998,996"

# One list per output column; the parsing loop adds one entry per item to each.
name = []
price = []
original_price = []
discount = []
os = []

In [186]:
# Download the search page; each matching <div> is parsed as one listed item.
page_soup = get_soup(url)
item_sections = page_soup.find_all(
    "div", attrs={"class": "responsive_search_name_combined"}
)

In [187]:
# Class strings of the <div>s that hold a regular price, a discounted price,
# and the discount percentage respectively.
PRICE_CLASS = "col search_price responsive_secondrow"
DISCOUNTED_PRICE_CLASS = "col search_price discounted responsive_secondrow"
DISCOUNT_CLASS = "col search_discount responsive_secondrow"


def _parse_platforms(item):
    """Return the item's platform names joined by "/", e.g. "win/mac/linux".

    A platform is read from every <span> inside the item's first <p> that
    carries the "platform_img" class; the span's remaining class names are the
    platform names. Returns "" when there is no <p> or no such span.
    """
    platform_row = item.find("p")
    if platform_row is None:
        return ""
    platforms = []
    for span_os in platform_row.find_all("span"):
        # BeautifulSoup exposes the multi-valued "class" attribute as a list,
        # so the tag does not have to be re-parsed from its string form.
        classes = span_os.get("class") or []
        if "platform_img" in classes:
            platforms.extend(c for c in classes if c != "platform_img")
    return "/".join(platforms)


def _parse_prices(item):
    """Return `(price, original_price, discount)` for one search result.

    * Regular price block present: price and original price are the same
      stripped text and discount is None.
    * Otherwise the discounted block is used: the struck-through text is the
      original price, the rest of the block is the current price, and the
      discount label comes from the separate discount block.
    * If any piece of the discounted layout is missing, all three are None.
    """
    regular = item.find("div", {"class": PRICE_CLASS})
    if regular is not None:
        current = regular.text.strip()
        return current, current, None

    price_section = item.find("div", {"class": DISCOUNTED_PRICE_CLASS})
    strike = price_section.find("strike") if price_section is not None else None
    discount_section = item.find("div", {"class": DISCOUNT_CLASS})
    discount_label = (
        discount_section.find("span") if discount_section is not None else None
    )
    if strike is None or discount_label is None:
        return None, None, None

    struck_text = strike.text
    # The block's text holds both prices; dropping the struck-through one
    # leaves the current price.
    current = price_section.text.replace(struck_text, "").strip()
    return current, struck_text, discount_label.text


# Start from empty columns so that re-running this cell does not append the
# same rows a second time.
for column in (name, price, original_price, discount, os):
    column.clear()

for item in item_sections:
    item_name = item.find("span", {"class": "title"}).text
    item_os = _parse_platforms(item)
    item_price, item_original_price, item_discount = _parse_prices(item)
    # Append only after everything parsed so the five lists stay aligned.
    name.append(item_name)
    price.append(item_price)
    original_price.append(item_original_price)
    discount.append(item_discount)
    os.append(item_os)

In [190]:
# Assemble the scraped columns into a table and preview its first rows.
df_topsellers = pd.DataFrame(
    {
        "name": name,
        "price": price,
        "original_price": original_price,
        "discount": discount,
        "os": os,
    }
)
df_topsellers.head()

Unnamed: 0,name,price,original_price,discount,os
0,Hardspace: Shipbreaker,$19.99,$24.99,-20%,win
1,Destiny 2,Free To Play,Free To Play,,win
2,Dead by Daylight,$7.99,$19.99,-60%,win
3,Counter-Strike: Global Offensive,Free to Play,Free to Play,,win/mac/linux
4,Sea of Thieves,$39.99,$39.99,,win


# 2. Using Selenium

## 2.1 Parsing an Article From Kharon

Parsing a simple paragraph from "https://brief.kharon.com/updates/nigerian-men-sanctioned-by-us-for-business-email-compromise-romance-fraud/".

In [200]:
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [201]:
# Initial setup: create the Chrome WebDriver instance used by the following
# cells, and set the address of the article to load.
# NOTE(review): presumably needs a ChromeDriver matching the installed Chrome — confirm.
browser = webdriver.Chrome()
url = "https://brief.kharon.com/updates/nigerian-men-sanctioned-by-us-for-business-email-compromise-romance-fraud/"

In [203]:
# Load the article in the browser.
browser.get(url)
# Collect every element with the "copy" class. find_elements_by_class_name
# was deprecated in Selenium 4 and removed in 4.3; find_elements with a By
# locator is the supported equivalent.
content = browser.find_elements(By.CLASS_NAME, "copy")

In [210]:
# Build one string from all matched elements, each preceded by a space.
text = "".join(" " + paragraph.text for paragraph in content)

In [212]:
# text processing
# Collapse every whitespace run to a single space *before* stripping control
# characters. Line breaks are control characters (category Cc), so any "\n"
# inside an element's text would otherwise be deleted outright, gluing the
# words on either side together.
# NOTE(review): assumes the element text separates paragraphs with "\n" — confirm.
text = remove_control_characters(" ".join(text.split()))
print(text[:1000])

 The U.S. Treasury Department sanctioned six Nigerian men for their roles in cybercriminal schemes to steal more than USD 6 million from businesses and individuals across the U.S.The sanctioned individuals pursued Americans through global threats known as business email compromise (BEC) and romance fraud, according to the Treasury. They manipulated victims to gain access to usernames, passwords and bank accounts to further the schemes, the Treasury said. The romance fraudsters used online tools to engage their targets, the Treasury said. “Cybercriminals prey on vulnerable Americans and small businesses to deceive and defraud them,” said Treasury Secretary Steven T. Mnuchin. “As technological advancement increasingly offers malicious actors tools that can be used for online attacks and schemes, the United States will continue to protect and defend at-risk Americans and businesses.”The six Nigerians were sanctioned under the Treasury’s authority to designate people or companies engaged i

In [213]:
# Shut the browser down. quit() ends the whole WebDriver session (every window
# and the driver process), whereas close() only closes the current window and
# can leave the driver process running.
browser.quit()