In [1]:
#import libraries
import json

import requests
from bs4 import BeautifulSoup, Comment

In [2]:
# Read the URL to scrape from the input file.
with open("../scrapeContent/website_to_scrape.txt", "r") as file:
    # Strip surrounding whitespace: text files usually end with a newline,
    # which would otherwise become part of the requested URL and of the
    # "website" value written to the output JSON.
    website = file.read().strip()

In [3]:
# Fetch the page HTML.
# requests applies no timeout by default, so without one a server that stops
# responding would hang this cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30
result = requests.get(website, timeout=REQUEST_TIMEOUT_SECONDS)
# Response body decoded to text.
content = result.text

In [4]:
# Parse the HTML and extract its text, one text node per line.
status_code = result.status_code

if status_code == 200:
    soup = BeautifulSoup(result.content, 'html.parser')
    all_text = soup.get_text(separator='\n', strip=True)
else:
    print(f"Failed to retrieve the webpage '{website}'. Status code: {result.status_code}")
    # Fall back to an empty document so the following cells still run
    # (instead of failing with NameError on `soup` / `all_text`) and the
    # output JSON records the failing status code with empty content.
    soup = BeautifulSoup("", 'html.parser')
    all_text = ""


In [5]:
# Build a JSON-ready object with the page metadata and its text content,
# keeping only text that sits directly inside title, paragraph or heading tags.

# Meta description, if any ("" when the tag or its content attribute is missing).
description_tag = soup.find("meta", attrs={"name": "description"})
description = description_tag.get("content", "") if description_tag else ""

# Alt texts of images; images with a missing or empty alt are skipped.
alt_images_texts = [img.get("alt") for img in soup.find_all("img") if img.get("alt")]

jsonObj = {"website": website, "description": description, "alt_images_texts": alt_images_texts, "content": []}

# Only text whose direct parent is one of these elements is stored.
findTheseElements = ["title", "p", "h1", "h2", "h3", "h4", "h5", "h6"]

# Walk every text node exactly once and take the parent element from the node
# itself. Matching extracted lines back against the tree by substring would
# attribute a short line (e.g. a link label) to every paragraph or heading
# that merely contains it, and would emit duplicates for repeated lines.
for item in soup.find_all(string=True):
    # HTML comments are text nodes too, but they are not page content.
    if isinstance(item, Comment) or item.parent.name not in findTheseElements:
        continue

    # Language comes from the nearest ancestor with a lang attribute; only the
    # first two characters are kept (e.g. "en-US" -> "en").
    lang_holder = item.find_parent(attrs={"lang": True})
    lang = lang_holder["lang"][:2] if lang_holder and lang_holder["lang"] else "unknown"

    # A single text node may span several lines; store each line separately.
    for line in item.strip().split('\n'):
        text = line.strip().lower()

        # Skip empty / single-character fragments and purely numeric text.
        if len(text) > 1 and not text.isnumeric():
            jsonObj["content"].append({"text": text, "element": item.parent.name, "source_language": lang})


In [6]:
# Collect the distinct source languages found on the page.
content = jsonObj["content"]

# Sorted so the list has the same order on every run; iterating a set of
# strings gives an order that can change between interpreter sessions.
all_source_languages = sorted({entry["source_language"] for entry in content})


In [7]:
# Assemble the final payload and serialise it to an indented JSON string.
processed_json = dict(
    website=website,
    description=description,
    all_source_languages=all_source_languages,
    alt_images_texts=alt_images_texts,
    content=jsonObj["content"],
    status_code=status_code,
)

processed_jsonObj = json.dumps(processed_json, indent=4)

In [8]:
# Persist the serialised JSON string to the output file.
with open("../scrapeContent/scraped_content_output.json", "w") as file:
    # end="" keeps the written bytes identical to the string (no extra newline).
    print(processed_jsonObj, end="", file=file)