In [47]:
from lxml import html
import requests



class HTMLParser:
    def __init__(self, html_content):
        self.tree = html.fromstring(html_content)

    def get_text_by_tag(self, tag_name=None, attribute_name=None, attribute_value=None):
        if tag_name and attribute_name and attribute_value:
            xpath_query = f'//{tag_name}[@{attribute_name}="{attribute_value}"]'
        elif not tag_name and attribute_name and attribute_value:
        
            xpath_query = f'//*[@{attribute_name}="{attribute_value}"]'
        else:
            xpath_query = f'//{tag_name}'
    
        elements = self.tree.xpath(xpath_query)
    
        result_list = []
        for element in elements:
            result_list.append(element.text_content().strip())
    
        return result_list


    
    def parent(self, tag_name=None, attribute_name=None, attribute_value=None):
        result_list = []
    
        if tag_name and attribute_name and attribute_value:
            xpath_query = f'//{tag_name}[@{attribute_name}="{attribute_value}"]'
        elif not tag_name and attribute_name and attribute_value:
            xpath_query = f'//*[@{attribute_name}="{attribute_value}"]'
        else:
            xpath_query = f'//{tag_name}'
    
        elements = self.tree.xpath(xpath_query)
    
        for element in elements:
            parent = element.getparent()
            if parent is not None:
                result_dict = {'tag': parent.tag, 'attributes': {}}
                
                
                for key, value in parent.items():
                    result_dict['attributes'][key] = value
                
                result_list.append(result_dict)
    
        return result_list


    def children(self, tag_name=None, attribute_name=None, attribute_value=None):
        result_list = []
        if tag_name and attribute_name and attribute_value:
            xpath_query = f'//{tag_name}[@{attribute_name}="{attribute_value}"]'
        elif not tag_name and attribute_name and attribute_value:
            xpath_query = f'//*[@{attribute_name}="{attribute_value}"]'
        else:
            xpath_query = f'//{tag_name}'
        
        elements = self.tree.xpath(xpath_query)
        for element in elements:
            parent = element.getparent()
            if parent is not None:
                result_dict = {'tag': parent.tag, 'attributes': {}}
                    
                    
                for key, value in parent.items():
                    result_dict['attributes'][key] = value
                    
                result_list.append(result_dict)
        
        return result_list
    
    def find_all_next(self, tag_name=None, attribute_name=None, attribute_value=None):
        result_list = []

        if tag_name and attribute_name and attribute_value:
            xpath_query = f'//{tag_name}[@{attribute_name}="{attribute_value}"]/following-sibling::*'
        elif not tag_name and attribute_name and attribute_value:
            xpath_query = f'//*[@{attribute_name}="{attribute_value}"]/following-sibling::*'
        elif tag_name:
            xpath_query = f'//{tag_name}/following-sibling::*'
        else:
        # Handle the case where neither tag_name nor attribute_name and attribute_value are provided
            return result_list

        elements = self.tree.xpath(xpath_query)

        for element in elements:
            result_dict = {'tag': element.tag,  'attributes': {}}

            # Append attributes and their values to the result_dict
            for key, value in element.items():
                result_dict['attributes'][key] = value

            result_list.append(result_dict)

        return result_list


    def find_all_tags(self, tag_name):
        result_list = []

        elements = self.tree.xpath(f'//{tag_name}')

        for element in elements:
            result_dict = {'tag': element.tag, 'attributes': {}}

            # Append attributes and their values to the result_dict
            for key, value in element.items():
                result_dict['attributes'][key] = value

            result_list.append(result_dict)

        return result_list


    def find_all_by_attribute(self, attribute_name=None, attribute_value=None):
        result_list = []

        if attribute_name and attribute_value:
            xpath_query = f'//*[@{attribute_name}="{attribute_value}"]'
        elif attribute_name:
            xpath_query = f'//*[@{attribute_name}]'
        elif attribute_value:
            xpath_query = f'//*[@*="{attribute_value}"]'
        else:
            # Handle the case where neither attribute_name nor attribute_value is provided
            return result_list
    
        elements = self.tree.xpath(xpath_query)
    
        for element in elements:
            result_dict = {'tag': element.tag, 'attributes': {}}
    
            # Append attributes and their values to the result_dict
            for key, value in element.items():
                result_dict['attributes'][key] = value
    
            result_list.append(result_dict)
    
        return result_list





In [48]:
url="https://edition.cnn.com/middleeast/live-news/israel-hamas-war-gaza-news-02-04-24/index.html"
response = requests.get(url)
response.raise_for_status()  # Raise an HTTPError for bad responses

html_content= response.text
parser=HTMLParser(html_content)


print(parser.get_text_by_tag(attribute_name="data-type",attribute_value="byline-area"))
print(parser.parent(attribute_name="data-type",attribute_value="byline-area"))
print(parser.children("div","class","sc-dnqmqq sc-hSdWYo fKJYAI"))
print(parser.find_all_next("div"))
print()
print()






['By Andrew Raine and Heather Chen, CNN']
[{'tag': 'div', 'attributes': {'class': 'sc-dnqmqq sc-hSdWYo fKJYAI'}}]
[{'tag': 'div', 'attributes': {'class': 'headline'}}]
[{'tag': 'div', 'attributes': {'class': 'sc-dBAPYN fJPuOh'}}, {'tag': 'div', 'attributes': {'mode': 'dark', 'class': 'sc-dKEPtC eMwtza'}}, {'tag': 'div', 'attributes': {'class': 'sc-eIHaNI iYNlZu'}}, {'tag': 'div', 'attributes': {'class': 'sc-ugnQR cacWpd'}}, {'tag': 'div', 'attributes': {'class': 'sc-gzVnrw sc-gtfDJT sc-fOICqy fAKPnY'}}, {'tag': 'div', 'attributes': {'class': 'sc-bdVaJa sc-ipXKqB gaNIBv'}}, {'tag': 'div', 'attributes': {'class': 'sc-bdVaJa fAvHax'}}, {'tag': 'div', 'attributes': {'class': 'sc-gzVnrw sc-bEjcJn iIifdK'}}, {'tag': 'div', 'attributes': {'class': 'sc-gzVnrw sc-gtfDJT dQXmwZ'}}, {'tag': 'div', 'attributes': {'style': 'false:unset', 'class': 'sc-htoDjs sc-etwtAo sc-iGPElx esqSIM'}}, {'tag': 'div', 'attributes': {'class': 'sc-bdVaJa kRdOVD'}}, {'tag': 'div', 'attributes': {'class': 'sc-bdVaJa b

In [49]:
print(parser.find_all_by_attribute("class"))

[{'tag': 'div', 'attributes': {'class': 'sc-dBAPYN fJPuOh'}}, {'tag': 'div', 'attributes': {'mode': 'dark', 'class': 'sc-dKEPtC eMwtza'}}, {'tag': 'div', 'attributes': {'class': 'sc-eIHaNI iYNlZu'}}, {'tag': 'div', 'attributes': {'class': 'sc-ugnQR cacWpd'}}, {'tag': 'header', 'attributes': {'mode': 'dark', 'id': 'header-nav-container', 'data-zjs-campaign': 'header-wrapper', 'class': 'sc-bxivhb sc-fQejPQ dpgTwu'}}, {'tag': 'div', 'attributes': {'class': 'sc-dnqmqq sc-clNaTc rkUIQ'}}, {'tag': 'div', 'attributes': {'class': 'sc-htoDjs sc-etwtAo kysRUW'}}, {'tag': 'div', 'attributes': {'class': 'sc-bZQynM sc-jXQZqI eddiF'}}, {'tag': 'button', 'attributes': {'id': 'menuButton', 'tabindex': '-1', 'data-test': 'menuButton', 'data-zjs': 'click', 'data-zjs-campaign': 'header-expanded-nav-btn', 'class': 'sc-gmeYpB sc-iSDuPN bzpxnC'}}, {'tag': 'svg', 'attributes': {'class': 'menu-icon', 'style': 'outline:0', 'width': '24', 'height': '24', 'viewbox': '0 0 64 64', 'fill': 'currentColor', 'tabindex

In [50]:
print(parser.find_all_by_attribute("class","sc-dnqmqq sc-hSdWYo fKJYAI"))

[{'tag': 'div', 'attributes': {'class': 'sc-dnqmqq sc-hSdWYo fKJYAI'}}]


In [51]:
print(parser.find_all_by_attribute(attribute_value="sc-dnqmqq sc-hSdWYo fKJYAI"))

[{'tag': 'div', 'attributes': {'class': 'sc-dnqmqq sc-hSdWYo fKJYAI'}}]
