In [13]:
import sys
import re

f= open(r'stopwords_en.txt')
fh = f.read()
stop_words = []
for word in fh:
  stop_words.append(word)
print(len(stop_words))

3586


In [38]:
from bs4 import BeautifulSoup
from dateutil.parser import parse as date_parser


class CustomExtractor:
    def __init__(self):
        self.stopwords = set(stop_words)

    def calculate_gravity_score(self, tag):
        # Your custom logic to calculate the gravity score
        # This example uses the length of text content as a score
        text_content = tag.get_text(strip=True)
        return len(text_content)

    def walk_siblings(self, node):
        # Iterate over siblings
        siblings = []
        for sibling in node.find_all_next():
            if sibling.name and sibling.name != 'text':
                siblings.append(sibling)
        return siblings

    def is_highlink_density(self, node):

        # For simplicity, this example checks if the node contains more than 5 links
        links = node.find_all('a')
        return len(links) > 5

    def is_boostable(self, node):
        para = "p"
        steps_away = 0
        minimum_stopword_count = 5
        max_stepsaway_from_node = 3

        nodes = self.walk_siblings(node)
        for current_node in nodes:
            current_node_tag = current_node.name
            if current_node_tag == para:
                if steps_away >= max_stepsaway_from_node:
                    return False
                paragraph_text = current_node.get_text(strip=True)
                word_stats = self.get_stopword_count(paragraph_text)
                if word_stats > minimum_stopword_count:
                    return True
                steps_away += 1
        return False

    def get_stopword_count(self, text):
        words = [word.lower() for word in text.split()]
        sm = 0
        for word in words:
          if(word in self.stopwords):
            sm+=1
        return sm


    def calculate_best_node(self, soup):
        top_node = None
        top_node_score = 0
        nodes_to_check = self.nodes_to_check(soup)
        # print(nodes_to_check)
        starting_boost = 1.0
        cnt = 0
        i = 0
        parent_nodes = []
        nodes_with_text = []


        for node in nodes_to_check:
            text_node = node.get_text(strip=True)
            word_stats = self.get_stopword_count(text_node)

            high_link_density = self.is_highlink_density(node)
            if word_stats >= 2 and not high_link_density:
                nodes_with_text.append(node)

        nodes_number = len(nodes_with_text)
        negative_scoring = 0
        bottom_negativescore_nodes = nodes_number * 0.25


        for node in nodes_with_text:

            boost_score = 0.0
            if self.is_boostable(node):
                if cnt >= 0:
                    boost_score = 1.0 / starting_boost * 50
                    starting_boost += 1

            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = bottom_negativescore_nodes - (nodes_number - i)
                    boost_score = -pow(booster, 2)
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = 5.0


            text_node = node.get_text(strip=True)
            word_stats = self.get_stopword_count(text_node)
            upscore = word_stats + boost_score

            parent_node = node.parent
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)

            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)

            parent_parent_node = parent_node.parent
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)
            cnt += 1
            i += 1

        for e in parent_nodes:
            score = self.get_score(e)

            if score > top_node_score:
                top_node = e
                top_node_score = score

            if top_node is None:
                top_node = e

        return top_node

    def nodes_to_check(self, soup):
        nodes_to_check = []
        for tag in ['p', 'pre', 'td','div']:
            items = soup.find_all(tag)
            nodes_to_check += items
        return nodes_to_check

    def update_score(self, node, score):
        if 'score' not in node:
            node['score'] = len(node.get_text(strip=True))
        node['score'] += score

    def update_node_count(self, node, count):
        if 'count' not in node:
            node['count'] = 0
        node['count'] += count

    def get_score(self, node):
        return node.get('score', 0)
 
    def get_authors(self, doc):
        def contains_digits(d):
            for char in d:
                if char.isdigit():
                    return True
            return False

        def unique_list(lst):
            count = {}
            list = []
            for item in lst:
                if item.lower() in count:
                    continue
                count[item.lower()] = 1
                list.append(item.title())
            return list
    
        def parse(str):
            
            str = ''.join(char for char in str if char != '<' and char != '>')

            str = str.replace('By:', '').replace('From:', '').strip()
            
            name_tokens = [s.strip() for s in re.split(r"[^\w\'\-\.]", str) if s]

    
            authors = []
            current = []
            delimiters = ['and', ',', '']
    
            for token in name_tokens:
                if token in delimiters:
                    if len(current) > 0:
                        authors.append(' '.join(current))
                        current = []
                elif not contains_digits(token):
                    current.append(token)
    
            valid_name = (len(current) >= 2)
            if valid_name:
                authors.append(' '.join(current))
    
            return authors
    
        attributes = ['name', 'rel', 'itemprop', 'class', 'id']
        variables = ['author', 'byline', 'dc.creator', 'byl']
        matches = []
        authors = []
    
        for attr in attributes:
            for val in variables:
                found = doc.find_all(attrs={attr: val})
                matches.extend(found)
        

        for match in matches:
            content = ''
            if match.tag == 'meta':
                content_value = match.get('content')
                if len(content_value) > 0:
                    content = content_value[0]
            else:
                content = match.get_text() or ''
            if len(content) > 0:
                authors.extend(parse(content))
    
        return unique_list(authors)
    
    def publishing_date(self, url, doc):
        def parse_date(str):
            if str:
                try:
                    return date_parser(str)
                except (ValueError, OverflowError, AttributeError, TypeError):
                    return None
    
        
        STRICT_DATE_REGEX = re.compile(r'\/(\d{4})\/(\d{2})\/(\d{2})\/')
        date_match = STRICT_DATE_REGEX.search(url)
        if date_match:
            str = date_match.group(0)
            datetime = parse_date(str)
            if datetime:
                return datetime
    
        
        date_tags = [
            {'attribute': ('property', 'rnews:datePublished'), 'content': 'content'},
            {'attribute': ('property', 'article:published_time'), 'content': 'content'},
            {'attribute': ('name', 'OriginalPublicationDate'), 'content': 'content'},
            {'attribute': ('itemprop', 'datePublished'), 'content': 'datetime'},
            {'attribute': ('property', 'og:published_time'), 'content': 'content'},
            {'attribute': ('name', 'article_date_original'), 'content': 'content'},
            {'attribute': ('name', 'publication_date'), 'content': 'content'},
            {'attribute': ('name', 'sailthru.date'), 'content': 'content'},
            {'attribute': ('name', 'PublishDate'), 'content': 'content'},
            {'attribute': ('pubdate', 'pubdate'), 'content': 'datetime'},
            {'attribute': ('name', 'publish_date'), 'content': 'content'},
        ]
    
        
        for known_meta_tag in date_tags:
            meta_tags = doc.find_all(attrs={known_meta_tag['attribute'][0]: known_meta_tag['attribute'][1]})
            if meta_tags:
                str = meta_tags[0].get(known_meta_tag['content'])
                datetime = parse_date(str)
                if datetime:
                    return datetime
    
        # If none of the strategies work, return None
        return None


In [42]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlencode

url ="https://edition.cnn.com/2024/01/14/asia/profile-lai-ching-te-taiwan-new-president-intl-hnk/index.html"
# proxies = {
# "http": "http://scraperapi:8355bf750256f87924cb321115d06996@proxy-server.scraperapi.com:8001"
# }
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# response = requests.get(url,headers=headers,proxies=proxies,verify=False)
API_KEY = "8355bf750256f87924cb321115d06996"
params = {'api_key': API_KEY, 'url': url}
response = requests.get('http://api.scraperapi.com/', params=urlencode(params))
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# print(soup)
# Create an instance of CustomExtractor
extractor = CustomExtractor()


authors_list=extractor.get_authors(soup);
print("article of author are")

if authors_list:
    
    for author in authors_list:
        print(author)
       
else:
    print("No authors found.")
date=extractor.publishing_date(url,soup)

print("date of publication:",date)

# Calculate best node based on custom gravity scores
best_node = extractor.calculate_best_node(soup)

# Now, you can access the best node and its gravity score
if best_node:
    print(f"Best Node: {best_node['class']}, Gravity Score: {best_node.get('score', 0)}")
    print(best_node.get_text(strip=True))
else:
    print("No best node found.")


article of author are
By Nectar Gan Cnn
date of publication: 2024-01-14 00:00:00
Best Node: ['article__content'], Gravity Score: 9941.0
CNN—Lai Ching-te, a former doctor from a poor mining family, was propelled into politics by a military crisis in the Taiwan Strait 27 years ago.Now, the soft-spoken political veteran is tasked with preventing another one from happening as the newly elected leader of the self-ruled island that China’s Communist Party has vowed to one day absorb.On Saturday, Lai, 64, the current vice president from the ruling Democratic Progressive Party (DPP),won a widely watched electionto become Taiwan’s next president.His victory handed the DPP a historic third consecutive term, delivering a snub toyears of growing threatsfrom Taiwan’s much-larger authoritarian neighbor, China.“The election has shown the world the commitment of the Taiwanese people to democracy, which I hope China can understand,” Lai told thousands of jubilant supporters at a rally after his win.Lai