In [6]:
from lxml import html
import requests



class HTMLParser:
    """Convenience wrapper around an lxml HTML tree for simple look-ups.

    The document is parsed once in ``__init__``; every other method runs an
    XPath query against ``self.tree`` and returns plain Python lists.

    Attributes:
        tree: Root element produced by ``lxml.html.fromstring``.
        html_content: The raw markup the parser was built from.
        text_tags: Tag names that ``all_text`` treats as text-bearing.
    """

    def __init__(self, html_content):
        """Parse ``html_content`` and keep both the tree and the raw markup."""
        self.tree = html.fromstring(html_content)
        self.html_content = html_content
        self.text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'a', 'li', 'td', 'th', 'strong', 'em', 'b', 'i']

    # ------------------------------------------------------------------
    # Private helpers shared by the public query methods
    # ------------------------------------------------------------------

    @staticmethod
    def _xpath_literal(value):
        """Return ``value`` as a valid XPath 1.0 string literal.

        XPath 1.0 has no escape character, so a value containing a double
        quote must be single-quoted, and a value containing both kinds of
        quote must be assembled with ``concat()``.
        """
        value = str(value)
        if '"' not in value:
            return f'"{value}"'
        if "'" not in value:
            return f"'{value}'"
        pieces = ', \'"\', '.join(f'"{part}"' for part in value.split('"'))
        return f'concat({pieces})'

    def _build_xpath(self, tag_name=None, attribute_name=None, attribute_value=None):
        """Build the element-selecting XPath for the given criteria.

        Returns ``None`` when there is nothing usable to search for (no tag
        name and no complete attribute name/value pair).
        """
        if attribute_name and attribute_value:
            literal = self._xpath_literal(attribute_value)
            return f'//{tag_name or "*"}[@{attribute_name}={literal}]'
        if tag_name:
            # An incomplete attribute pair is ignored and only the tag is used.
            return f'//{tag_name}'
        return None

    def _select(self, tag_name=None, attribute_name=None, attribute_value=None, suffix=''):
        """Run the query for the given criteria; ``[]`` when there are none."""
        query = self._build_xpath(tag_name, attribute_name, attribute_value)
        if query is None:
            return []
        return self.tree.xpath(query + suffix)

    @staticmethod
    def _describe(element):
        """Summarise an element as ``{'tag': ..., 'attributes': {...}}``."""
        return {'tag': element.tag, 'attributes': dict(element.items())}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def all_text(self):
        """Return the text of all ``text_tags`` elements as one string.

        Only the outermost matching elements are visited. ``text_content()``
        already includes the text of every descendant, so visiting nested
        matches as well (a ``<p>`` inside a ``<div>``) would repeat the same
        text once per nesting level. Runs of whitespace, including newlines
        and tabs, are collapsed to single spaces.
        """
        if not self.text_tags:
            return ""
        is_text_tag = ' or '.join(f'self::{tag}' for tag in self.text_tags)
        query = f'//*[{is_text_tag}][not(ancestor::*[{is_text_tag}])]'
        chunks = (element.text_content() for element in self.tree.xpath(query))
        # str.split() with no argument splits on any whitespace run and drops
        # empty pieces, so one pass normalises spaces, tabs and newlines.
        return ' '.join(' '.join(chunks).split())

    def textbytag(self, tag_name=None, attribute_name=None, attribute_value=None):
        """Return the stripped text of every element matching the criteria.

        Args:
            tag_name: Tag to match; any tag when omitted and an attribute
                name/value pair is supplied.
            attribute_name: Attribute to test (used only together with
                ``attribute_value``).
            attribute_value: Exact value the attribute must have.

        Returns:
            A list of strings, one per matching element; empty when no
            usable criteria are given.
        """
        matches = self._select(tag_name, attribute_name, attribute_value)
        return [element.text_content().strip() for element in matches]

    def parent(self, tag_name=None, attribute_name=None, attribute_value=None):
        """Return the parent element of every element matching the criteria.

        The result has one entry per match, so a shared parent appears once
        for each matching child, and an entry is ``None`` when the match has
        no parent.
        """
        matches = self._select(tag_name, attribute_name, attribute_value)
        return [element.getparent() for element in matches]

    def children(self, tag_name=None, attribute_name=None, attribute_value=None):
        """Return, for every matching element, the list of its direct children.

        Returns:
            A list of lists: one inner list of child elements per match.
        """
        matches = self._select(tag_name, attribute_name, attribute_value)
        # list(element) is the supported replacement for the deprecated
        # element.getchildren().
        return [list(element) for element in matches]

    def sibling(self, tag_name=None, attribute_name=None, attribute_value=None):
        """Return the following siblings of every element matching the criteria.

        Only siblings that come after a match are returned; preceding
        siblings are not included.
        """
        return self._select(
            tag_name, attribute_name, attribute_value,
            suffix='/following-sibling::*',
        )

    def findalltags(self, tag_name):
        """Describe every ``tag_name`` element as a tag/attributes dict.

        Returns:
            A list of ``{'tag': str, 'attributes': dict}`` entries.
        """
        return [self._describe(element) for element in self.tree.xpath(f'//{tag_name}')]

    def findallbyattribute(self, attribute_name=None, attribute_value=None):
        """Describe every element selected by attribute name and/or value.

        Args:
            attribute_name: Attribute that must be present. Combined with
                ``attribute_value`` it must also have exactly that value.
            attribute_value: When given alone, matches elements on which any
                attribute has this value.

        Returns:
            A list of ``{'tag': str, 'attributes': dict}`` entries; empty
            when neither argument is given.
        """
        if attribute_name and attribute_value:
            query = f'//*[@{attribute_name}={self._xpath_literal(attribute_value)}]'
        elif attribute_name:
            query = f'//*[@{attribute_name}]'
        elif attribute_value:
            query = f'//*[@*={self._xpath_literal(attribute_value)}]'
        else:
            return []

        return [self._describe(element) for element in self.tree.xpath(query)]


In [7]:
import os
from urllib.parse import urlencode

# Article to scrape; it is fetched through the ScraperAPI proxy endpoint
# rather than requested directly.
url = "https://edition.cnn.com/2024/02/09/climate/atlantic-circulation-collapse-weather-climate/index.html"

# Read the ScraperAPI key from the environment so the secret never lives in
# the notebook. Any key that was ever hardcoded in this notebook should be
# treated as leaked and rotated.
API_KEY = os.environ.get("SCRAPERAPI_KEY")
if not API_KEY:
    raise RuntimeError("Set the SCRAPERAPI_KEY environment variable to your ScraperAPI key.")

params = {'api_key': API_KEY, 'url': url}
# HTTPS keeps the API key out of plaintext traffic; the timeout bounds the
# wait so a stalled request cannot hang the kernel indefinitely.
response = requests.get('https://api.scraperapi.com/', params=urlencode(params), timeout=70)
response.raise_for_status()
html_content = response.text
parser = HTMLParser(html_content)



# parent() returns lxml element objects rather than plain strings.
# On each element, use .tag for the tag name, .items() for the attribute pairs, and .text_content() for the text.

# print(parser.textbytag(attribute_name="class",attribute_value="paragraph inline-placeholder"))
# parents=parser.parent(attribute_name="class",attribute_value="paragraph inline-placeholder")
# for parent in parents:
#     print(parent,parent.text_content(),parent.tag,parent.items())






In [3]:
# Display the whitespace-normalised text that HTMLParser.all_text() extracts from the fetched page.
parser.all_text()



In [4]:
# data=parser.findallbyattribute("class")
# for info in data:
#     print(info,info.text_content(),info.tag,info.items())
