<a href="https://colab.research.google.com/github/M-Jak/Finki/blob/main/Voved%20vo%20nauka%20na%20podatoci/aud%20scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import bs4

#Note: Imports for the modules that we need
import numpy as np
import pandas as pd

#Note: Requests is a simple, yet elegant, HTTP library.
#Resource: https://pypi.org/project/requests/
import requests

#Note: Beautiful Soup is a library that makes it easy to scrape information from web pages.
#Note: In the following resource you can find a way to select any part of a HTML content.
#Resource: https://pypi.org/project/beautifulsoup4/
from bs4 import BeautifulSoup

#Note: IPython is a command shell for interactive computing in Python.
#Note: IPython.display is a module for display tools in IPython.
#Resource: https://ipython.readthedocs.io/en/stable/api/generated/IPython.display.html
from IPython.display import HTML


#Note: We can disable warnings for our requests.
import warnings

requests.packages.urllib3.disable_warnings()
warnings.filterwarnings("ignore")

finance_url = 'https://www.cnbc.com/finance/'



response = requests.get(finance_url)
response




type(response)


requests.models.Response


raw_html = response.text
type(raw_html)


str


raw_html[:500]




#Note: To represent the HTML code and make it easy to use, we use Beautiful Soup.
html = BeautifulSoup(raw_html, "html.parser")


type(html)


bs4.BeautifulSoup


#Note: We can now use the HTML code as normally, using selection by tag, id etc.

#Note: Here we select the first element that is a title.
html.select_one("title")



from google.colab import drive

drive.mount('/content/drive')




#Note: Path needs to be changed in order to work with current user Google Drive.


from datetime import date
import random


#The function takes the 'url' of the page to scrape and the 'path' in which to create the file containg the resulting dataset.
def get_data(url, path):
  response = requests.get(url)
  html = BeautifulSoup(response.text, "html.parser")

  news_items = html.find_all('div', {'class' : 'Layout-layout'})
  print(news_items)
  news = []

  #Note: By taking the data from each item separately we make sure that all data we have on the item is its own.
  for i in range(0, len(news_items)):
    item = {}
    card_date = news_items[i].select_one('.Card-time').text

    #Note: We need to format the date in the desired format.
    if 'ago' in card_date:
      item['Date'] = date.today().strftime("%a, %b %dth %Y")
    else:
      item['Date'] = card_date

    item["Title"] = news_items[i].select_one('.Card-title').text

    category = news_items[i].select_one('.Card-eyebrow')

    #Note: We fill the category to be 'No category' in cases where a category is not present.
    if category is None:
      item['Category'] = 'No category'
    else:
       item["Category"] = news_items[i].select_one('.Card-eyebrow').select_one('div').text

    #Note: This data is not related to the web pages we scrape, and are here for demonstration only.
    item['Popularity'] = random.randint(1, len(news_items)) * 10
    item['User comments'] = random.randint(70, 100)

    news.append(item)

  #Note: The idea here is to make the function as automated as possible i.e. for us to save parameters in some place and then call the function X times with those parameters without us watching over it.
  df = pd.DataFrame(news)
  url_parts = url.split("/")
  file_name = path + "/results_" + url_parts[3] + ".csv"
  df.to_csv(file_name, index=False)

  return df


economy_url = "https://www.cnbc.com/economy/"
finance_url = "https://www.cnbc.com/finance/"
health_science_url = "https://www.cnbc.com/health-and-science/"
energy_url = "https://www.cnbc.com/energy/"
climate_url = "https://www.cnbc.com/climate/"

economy_data = get_data(economy_url, path)
finance_data = get_data(finance_url, path)
health_science_data = get_data(health_science_url, path)
energy_data = get_data(energy_url, path)
climate_data = get_data(climate_url, path)




type(economy_data)




economy_data.head(3)


finance_data.head()


health_science_data.head()


energy_data.head()

climate_data.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[<div class="Layout-layout" data-test="Layout"><div class="Column-imageDenseModRight" data-test="Column"><div class="Card-standardBreakerCard Card-featuredRectangleMediaImagedense Card-featuredRectangleMedia Card-card" data-test="Card"><div class="Card-mediaContainer"><a href="https://www.cnbc.com/2023/10/20/us-wraps-up-fiscal-year-with-a-budget-deficit-near-1point7-trillion.html" tabindex="-1" target=""><div class="Card-imageContainer"><div class="Card-rectangleMediaContainer Card-placeholder"><picture data-test="Picture"><source height="630" media="(min-width: 1340px)" srcset="https://image.cnbcfm.com/api/v1/image/107304533-16953079272023-09-21t144708z_668111486_rc21d3alzj7b_rtrmadp_0_un-assembly.jpeg?v=1697836710&amp;w=1260&amp;h=630&amp;ffmt=webp&amp;vtcrop=y" type="image/webp" width="1260"/><source height="410" media="(min-width: 1020px)" srcset="https:/

Unnamed: 0,Date,Title,Category,Popularity,User comments
0,"Sat, Oct 21st 2023",UN committee deadlocked on climate disaster re...,No category,50,97
1,"Thu, Oct 19th 2023",How El Niño will affect the weather this winter,No category,20,86
2,"Thu, Oct 19th 2023",Chinese EV stocks tank after Tesla earnings di...,Tech,30,97
3,"Tue, Oct 17th 2023",This startup CEO claims he can completely deca...,Clean Start,10,78
4,"Mon, Oct 16th 2023",A big climate change stress test is coming for...,Evolve,70,78
