# Task Description:
#### You are tasked to perform web scraping on a provided HTML page that contains different types of elements. The goal is to extract specific data from the page and process it into structured formats such as CSV or JSON.
#### https://baraasalout.github.io/test.html 

In [1]:
import requests #to call infos from websites
from bs4 import BeautifulSoup
import csv
import json

In [2]:
#get the website content/source
url = 'https://baraasalout.github.io/test.html'
website = requests.get(url)
scr = website.content # get all the page data/informations as byte code
soup = BeautifulSoup(scr, 'lxml') #get all the page data/informations as readable content

### 1. Extract Text Data:
 - Extract all headings (`<h1>`, `<h2>`).
 - Extract all text content inside `<p>` and `<li>` tags.
 - Save this data into a `Extract_Text_Data.CSV` file.

In [3]:
#Extract all headings (<h1>, <h2>).
# 1st method to get all the required headers
def headers (soup):
    # get all h1&h2 tags
    header1= soup.find_all('h1') 
    header2= soup.find_all('h2') 
    #empty list to store the header h1&h2 text 
    header1_text = [] 
    header2_text = [] 
    #loop over the list of tags to get the text only
    for header in header1: 
        text = header.text.strip()
        header1_text.append(text)
        
    for header in header2: 
        text = header.text.strip()
        header2_text.append(text)
    return header1_text, header2_text

#Extract all text content inside <p> and <li> tags
# 2nd method to get all the required paragraphs and lists
def paragraph_and_list(soup):
    # get all p & li tags
    paragraph = soup.find_all('p')
    lists = soup.find_all('li')
    #empty list to store the header p & li text
    parg_text = []
    list_text = []
    #loop over the list of tags to get the text only
    for p in paragraph:
        text = p.text.strip()
        parg_text.append(text)

    for l in lists:
        text = l.text.strip()
        list_text.append(text)
    return parg_text ,list_text

#create the header of the table in the .csv file
tableHeader = ["Type" , "Content"]

#call the h1 & h2 tags method
header1_text, header2_text = headers(soup)

#call the p & li tags method
parg_text ,list_text = paragraph_and_list(soup)

#create a list of data rows of the .csv file
rowsData = []
for text in header1_text + header2_text:
    rowsData.append({"Type": "Heading", "Content": text})

for text in parg_text:
    rowsData.append({"Type": "Paragraph", "Content": text})

for text in list_text:
    rowsData.append({"Type": "List", "Content": text})

# Save this data into a Extract_Text_Data.CSV file.
with open(r"D:/2025/DEPI/git_repos/Web_Scraping/Extract_Text_Data.csv","w", newline="", encoding="utf-8-sig") as file:
    dictionary_writer = csv.DictWriter(file,tableHeader )
    dictionary_writer.writeheader()
    dictionary_writer.writerows(rowsData)
    print(" Extract_Text_Data.CSV file created successfully!")


 Extract_Text_Data.CSV file created successfully!


### 2. Extract Table Data:
- Extract data from the table, including:
  - Product Name.
  - Price.
  - Stock Status.
- Save this data into a `Extract_Table_Data.CSV` file.


In [4]:
#method to get the informations in the table
def table_data (soup):
    tables = soup.find_all('table')
    #empty list to store the table header and body
    table_head = []
    table_body = []
    row_data = []
    for t in tables:
        # find all the head tags
        head = t.find_all('th') 
        #loop over the list of tags to get the text only
        for h in head:
            # return the head text only
            text = h.text.strip() 
            # add the text to the list
            table_head.append(text)

    #loop over the list of tags to get the text only
    for t in tables:
        # find all the rows of the table
        body = t.find_all('tr') 
        for b in body:
            #find each cell of the row
            all_td = b.find_all('td') 
            if all_td:
                # return the cells text
                row_data = [value.text.strip() for value in all_td]
                # add the text to the list
                table_body.append(row_data)
       
    #  Save this data into a Extract_Table_Data.CSV file.       
    with open(r"D:/2025/DEPI/git_repos/Web_Scraping/Extract_Table_Data.CSV.csv","w", newline="", encoding="utf-8-sig") as file:
        writer = csv.writer(file)
        writer.writerow(table_head)
        writer.writerows(table_body)
        print(" Extract_Text_Data.CSV file created successfully!")
    

table_data(soup)


 Extract_Text_Data.CSV file created successfully!


### 3. Extract Product Information (Cards Section):
- Extract data from the book cards at the bottom of the page, including:
    - Book Title.
    - Price.
    - Stock Availability.
    - Button text (e.g., "Add to basket").
- Save the data into a  Product_Information.JSON file.


In [5]:
# method to get the required data and create an output jason file
def book_section (soup, output_file = 'Product_Information.JSON'):
    #empty list to store the book title, price, stock availability <p>
    parg_text = []
    #empty list to store the bbutton text <button>
    button_text = []

    #go to the distination to get data
    books = soup.find_all('div' , {'class' : 'book-products'})
    #loop over the list of tags to get the text only
    for p in books:
        #find paragraphs
        parg = p.find_all('p')
        for text in parg:
            #store the text value into the list
            parg_text.append(text.text.strip())
    #loop over the list of tags to get the text only
    for b in books:
        # find buttons
        button = b.find_all('button')
        for text in button:
            #store the text value into the list
            button_text.append(text.text.strip())

    # store the data into dictionary to be sent to the .jason file
    data = {"paragrapgh" : parg_text, "button" : button_text}

    #Save the data into a Product_Information.JSON file.
    with open(output_file, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)
        print("Product_Information.JSON file created successfully!")


book_section(soup)

Product_Information.JSON file created successfully!


### 4. Extract Form Details:
- Extract all input fields from the form, including:
    - Field name (e.g., username, password).
    - Input type (e.g., text, password, checkbox, etc.).
    - Default values, if any.
- Save the data into a  JSON file.


In [6]:
form_data = soup.find('form')
# method to get the required data and create an output jason file
def form_details (soup , output_file = 'Form.JSON'):
    # epmty list to store input field names <label>
    field_name = [] 
    #go to the labels
    label = form_data.find_all('label')
    #loop over the list of tags to get the text only
    for l in label:
        field_name.append(l.text.strip())

    # epmty list to store input field types
    input_type = []
    #go to inputs
    inp = form_data.find_all('input')
    #loop over the list of tags to get the text only
    for i in inp:
        input_type.append(i.get('type'))
        
    # epmty list to store default values
    def_values = []
    # go to the select section
    defualt = form_data.find_all('select')
    for d in defualt:
        #go to options as it has a default values
        option = d.find_all('option')
        #loop over the list of tags to get the text only
        for o in option:
            def_values.append(o.get('value'))

    # store the data into dictionary to be sent to the .jason file
    data = {"Field Name" : field_name,
           "Input Type" : input_type,
           "Default Values": def_values}
    
    # Save the data into a JSON file.
    with open(output_file, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)
        print("Form.JSON file created successfully!")

form_details(soup)
    

Form.JSON file created successfully!


### 5. Extract Links and Multimedia:
- Extract the video link from the `<iframe>` tag.
- Save the data into a  JSON file.


In [7]:
# method to get the required data and create an output jason file
def media (soup, output_file = 'iFrame.JSON'):
    # got to iframe
    video = soup.find('iframe')
    # get the source ling
    video_src = video.get('src')

    # Save the data into a JSON file.
    with open(output_file, "w", encoding="utf-8") as file:
        json.dump(video_src, file, indent=4, ensure_ascii=False)
        print("iFrame.JSON file created successfully!")

media (soup)   

iFrame.JSON file created successfully!


### 6. Scraping Challenge:
Students must write a script to extract data from the Featured Products section with the following requirements:
- Product Name: Located within `<span class="name>`.
- Hidden Price: Located within `<span class="price">` , which has style="display: none;".
- Available Colors: Located within `<span class="colors">`.
- Product ID: The value stored in the `data-id` attribute.
- Example Output:
[
    - {'id': '101', 'name': 'Wireless Headphones', 'price': '$49.99', 'colors': 'Black, White, Blue'},
     {'id': '102', 'name': 'Smart Speaker', 'price': '$89.99', 'colors': 'Grey, Black'},
      {'id': '103', 'name': 'Smart Watch', 'price': '$149.99', 'colors': 'Black, Silver, Gold'}
]



In [8]:
# method to get the required data 
def featured_products(soup):
    # go to the required div
    f_products = soup.find_all('div' , {'class' : 'products'})
    # empty list to store the output
    data = []

    # loop over the div to get list for each class
    for i in range(len(f_products)):
        card_id =  f_products[i].find_all('div' , {'class' : 'product-card'})
        nameTag = f_products[i].find_all('p' , {'class' : 'name'})
        priceTag = f_products[i].find_all('p' , {'class' : 'price'})
        colorTag = f_products[i].find_all('p' , {'class' : 'colors'})
        #loop over the list of tags to get the text only
        for card , n , p , c in zip(card_id, nameTag,priceTag,colorTag):
            data_id = card.get('data-id')
            name = n.text.strip()
            price = p.text.strip()
            color = c.text.strip().replace("Available colors:" , "") # delete this section to be as the required output

            # store the data into the list
            data.append ( {"ID" : data_id ,"Name" : name, "Price": price , "Color": color})
        
    print (data)

featured_products(soup)

[{'ID': '101', 'Name': 'Wireless Headphones', 'Price': '$49.99', 'Color': ' Black, White, Blue'}, {'ID': '102', 'Name': 'Smart Speaker', 'Price': '$89.99', 'Color': ' Grey, Black'}, {'ID': '103', 'Name': 'Smart Watch', 'Price': '$149.99', 'Color': ' Black, Silver, Gold'}]
