In [43]:
import sys
import re

f= open(r'stopwords_en.txt')
fh = f.read()
stop_words = []
for word in fh:
  stop_words.append(word)
print(len(stop_words))

3586


In [60]:
from bs4 import BeautifulSoup
from dateutil.parser import parse as date_parser


class CustomExtractor:
    def __init__(self):
        self.stopwords = set(stop_words)

    def calculate_gravity_score(self, tag):
        # Your custom logic to calculate the gravity score
        # This example uses the length of text content as a score
        text_content = tag.get_text(strip=True)
        return len(text_content)

    def walk_siblings(self, node):
        # Iterate over siblings
        siblings = []
        for sibling in node.find_all_next():
            if sibling.name and sibling.name != 'text':
                siblings.append(sibling)
        return siblings

    def is_highlink_density(self, node):

        # For simplicity, this example checks if the node contains more than 5 links
        links = node.find_all('a')
        return len(links) > 5

    def is_boostable(self, node):
        para = "p"
        steps_away = 0
        minimum_stopword_count = 5
        max_stepsaway_from_node = 3

        nodes = self.walk_siblings(node)
        for current_node in nodes:
            current_node_tag = current_node.name
            if current_node_tag == para:
                if steps_away >= max_stepsaway_from_node:
                    return False
                paragraph_text = current_node.get_text(strip=True)
                word_stats = self.get_stopword_count(paragraph_text)
                if word_stats > minimum_stopword_count:
                    return True
                steps_away += 1
        return False

    def get_stopword_count(self, text):
        words = [word.lower() for word in text.split()]
        sm = 0
        for word in words:
          if(word in self.stopwords):
            sm+=1
        return sm


    def calculate_best_node(self, soup):
        top_node = None
        top_node_score = 0
        nodes_to_check = self.nodes_to_check(soup)
        # print(nodes_to_check)
        starting_boost = 1.0
        cnt = 0
        i = 0
        parent_nodes = []
        nodes_with_text = []


        for node in nodes_to_check:
            text_node = node.get_text(strip=True)
            word_stats = self.get_stopword_count(text_node)

            high_link_density = self.is_highlink_density(node)
            if word_stats >= 2 and not high_link_density:
                nodes_with_text.append(node)

        nodes_number = len(nodes_with_text)
        negative_scoring = 0
        bottom_negativescore_nodes = nodes_number * 0.25


        for node in nodes_with_text:

            boost_score = 0.0
            if self.is_boostable(node):
                if cnt >= 0:
                    boost_score = 1.0 / starting_boost * 50
                    starting_boost += 1

            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = bottom_negativescore_nodes - (nodes_number - i)
                    boost_score = -pow(booster, 2)
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = 5.0


            text_node = node.get_text(strip=True)
            word_stats = self.get_stopword_count(text_node)
            upscore = word_stats + boost_score

            parent_node = node.parent
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)

            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)

            parent_parent_node = parent_node.parent
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)
            cnt += 1
            i += 1

        for e in parent_nodes:
            score = self.get_score(e)

            if score > top_node_score:
                top_node = e
                top_node_score = score

            if top_node is None:
                top_node = e

        return top_node

    def nodes_to_check(self, soup):
        nodes_to_check = []
        for tag in ['p', 'pre', 'td','div']:
            items = soup.find_all(tag)
            nodes_to_check += items
        return nodes_to_check

    def update_score(self, node, score):
        if 'score' not in node:
            node['score'] = len(node.get_text(strip=True))
        node['score'] += score

    def update_node_count(self, node, count):
        if 'count' not in node:
            node['count'] = 0
        node['count'] += count

    def get_score(self, node):
        return node.get('score', 0)
 
    def get_authors(self, doc):
        def contains_digits(d):
            for char in d:
                if char.isdigit():
                    return True
            return False

        def unique_list(lst):
            count = {}
            list = []
            for item in lst:
                if item.lower() in count:
                    continue
                count[item.lower()] = 1
                list.append(item.title())
            return list
    
        def parse(str):
            
            str = ''.join(char for char in str if char != '<' and char != '>')

            str = str.replace('By:', '').replace('From:', '').strip()
            
            names = [s.strip() for s in re.split(r"[^\w\'\-\.]", str) if s]

    
            authors = []
            current = []
            delimiters = ['and', ',', '']
    
            for name in names:
                if name in delimiters:
                    if len(current) > 0:
                        authors.append(' '.join(current))
                        current = []
                elif not contains_digits(name):
                    current.append(name)
    
            valid_name = (len(current) >= 2)
            if valid_name:
                authors.append(' '.join(current))
    
            return authors
        patterns=[re.compile(r'(?:author.*?name|name.*?author)', re.IGNORECASE)]
        attributes = ['name', 'rel', 'itemprop', 'class', 'id']
        variables = ['author', 'byline', 'dc.creator', 'byl','group-info']
        matches = []
        authors = []
    
        for attr in attributes:
            for val in variables+patterns:
                found = doc.find_all(attrs={attr: val})
                matches.extend(found)
        

        for match in matches:
            content = ''
            if match.tag == 'meta':
                content_value = match.get('content')
                if len(content_value) > 0:
                    content = content_value[0]
            else:
                content = match.get_text() or ''
            if len(content) > 0:
                authors.extend(parse(content))
    
        return unique_list(authors)
    
    def publishing_date(self, url, doc):
        def parse_date(str):
            if str:
                try:
                    return date_parser(str)
                except (ValueError, OverflowError, AttributeError, TypeError):
                    return None
    
        
        STRICT_DATE_REGEX = re.compile(r'\/(\d{4})\/(\d{2})\/(\d{2})\/')
        date_pattern = re.compile(r'(\d{2} [a-zA-Z]{3} \d{4}) (\d{2}:\d{2}[APMapm]{2})')
        date_match = STRICT_DATE_REGEX.search(url)
        if date_match:
            str = date_match.group(0)
            datetime = parse_date(str)
            if datetime:
                return datetime
    
        
        date_tags = [
            {'attribute': ('property', 'rnews:datePublished'), 'content': 'content'},
            {'attribute': ('property', 'article:published_time'), 'content': 'content'},
            {'attribute': ('name', 'OriginalPublicationDate'), 'content': 'content'},
            {'attribute': ('itemprop', 'datePublished'), 'content': 'datetime'},
            {'attribute': ('property', 'og:published_time'), 'content': 'content'},
            {'attribute': ('name', 'article_date_original'), 'content': 'content'},
            {'attribute': ('name', 'publication_date'), 'content': 'content'},
            {'attribute': ('name', 'sailthru.date'), 'content': 'content'},
            {'attribute': ('name', 'PublishDate'), 'content': 'content'},
            {'attribute': ('pubdate', 'pubdate'), 'content': 'datetime'},
            {'attribute': ('name', 'publish_date'), 'content': 'content'},
        ]



        patterns = re.compile(r'(?:article.*publish|publish.*article|\bdate\b|\btime\b)')

        for tags in date_tags:
            meta_tags = doc.find_all(attrs={tags['attribute'][0]: tags['attribute'][1]})
            if meta_tags:
                str = meta_tags[0].get(tags['content'])
                datetime = parse_date(str)
                if datetime:
                    return datetime
        
        additional_date_tag = doc.find('div', class_=lambda c: c and patterns.search(c))
        if additional_date_tag:
            str = additional_date_tag.get_text(strip=True)
            match = date_pattern.search(str)
            if match:
                date_str, time_str = match.groups()

                # Combine date and time, then parse using dateutil.parser
                datetime_str = f"{date_str} {time_str}"
                datetime_obj = date_parser(datetime_str) 
            return datetime_obj
        

    
        # If none of the strategies work, return None
        return None
    def split_title(self,title, splitter, hint=None):
        """Split the title to best part possible"""
        large_text_length = 0
        large_text_index = 0
        title_pieces = title.split(splitter)
        if hint and hint!='':
            filter_regex = re.compile(r'[^a-zA-Z0-9\ ]')
            hint = filter_regex.sub('', hint).lower()
    
        # find the largest title piece
        for i, title_piece in enumerate(title_pieces):
            current = title_piece.strip()
            #Immediately break if any part matches
            if hint and hint in filter_regex.sub('', current).lower():
                large_text_index = i
                break
            if len(current) > large_text_length:
                large_text_length = len(current)
                large_text_index = i
    
    #     Even if no part matches with hint(h1) if prints simply the longest part as the parts
    #     are usually of independent meaning
        title = title_pieces[large_text_index]
        return title    
    def get_title(self,soup):
        """Explicit rules:
        1. title == h1, no need to split
        2. h1 similar to og:title, use h1
        3. title contains h1, title contains og:title, len(h1) > len(og:title), use h1
        4. title starts with og:title, use og:title
        5. use title, after splitting
        """
        title = ''
        title_element = soup.title
    
        # no title found
        if title_element is None or len(title_element) == 0:
            print("Error")
            return title
    
        # title elem found
        title_text = title_element.text
        used_delimeter = False
    
    #     title from h1
            # - extract the longest text from all h1 elements
        # - too short texts (fewer than 2 words) are discarded
        # - clean double spaces
    #     h1_element = soup.find_all('h1')[0]
    #     title_text_h1 = h1_element.text
        title_text_h1=''
        title_element_h1_list = soup.find_all('h1')
        title_text_h1_list = [tag.get_text(strip=True) for tag in title_element_h1_list]
        if title_text_h1_list:
            title_text_h1_list.sort(key=len, reverse=True)
            #longest title
            title_text_h1 = title_text_h1_list[0]
            # clean double spaces
            title_text_h1 = ' '.join([x for x in title_text_h1.split() if x])
        #title from meta tag(not user-visible)
        meta_tag_content = soup.find({'meta': {'property': 'og:title'}})
        if not meta_tag_content:
            meta_tag_content = soup.find({'meta': {'name': 'og:title'}})
        title_text_meta = meta_tag_content.get('content', '')  # Empty string if no meta tag found
        # Further filtering of unwanted characters
        # Alphanumeric characters, punctuation and alphanumeric
        filter_regex = re.compile(r'[^a-zA-Z0-9\ ]')
        filter_title_text = filter_regex.sub('', title_text).lower()
        filter_title_text_h1 = filter_regex.sub('', title_text_h1).lower()
        filter_title_text_meta = filter_regex.sub('', title_text_meta).lower()
        
        # Case1: If both matches don't do anything
        if title_text_h1 == title_text:
            used_delimeter = True
        # Case2: h1 and meta tag matches(either of h1 or meta)
        elif filter_title_text_h1 and filter_title_text_h1 == filter_title_text_meta:
            title_text = title_text_h1
            used_delimeter = True
        # Case3: If both h1 and meta are a substring of title_text(use h1)
        elif filter_title_text_h1 and filter_title_text_h1 in filter_title_text and filter_title_text_meta in filter_title_text  and len(title_text_h1) > len(title_text_meta):
            title_text = title_text_h1
            used_delimeter = True
        # Case4: If title_text startswith meta text(replace with meta)
        elif filter_title_text_meta and filter_title_text_meta != filter_title_text and filter_title_text.startswith(filter_title_text_meta):
            title_text = title_text_meta
            used_delimeter = True
        
        # If none of the above condition is matched, means a delimiter must be present between them
        # Now individually parts separated by delimiter has to be checked and now we check with h1 tag only(no meta tag)-Observation based
        if not used_delimeter and '|' in title_text:
            title_text = self.split_title(title_text, '|', title_text_h1)
            used_delimeter = True
    
        # self.split title with -
        if not used_delimeter and '-' in title_text:
            title_text = self.split_title(title_text, '-', title_text_h1)
            used_delimeter = True
    
        # self.split title with _
        if not used_delimeter and '_' in title_text:
            title_text = self.split_title(title_text, '_', title_text_h1)
            used_delimeter = True
    
        # self.split title with /
        if not used_delimeter and '/' in title_text:
            title_text = self.split_title(title_text, '/', title_text_h1)
            used_delimeter = True
    
        # self.split title with »
        if not used_delimeter and ' » ' in title_text:
            title_text = self.split_title(title_text, ' » ', title_text_h1)
            used_delimeter = True
        return title_text
        
    


In [61]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlencode

url="https://cnalifestyle.channelnewsasia.com/entertainment/nurul-aini-sofian-roslan-fatin-amira-suria-actress-scandal-383591#mdcrecs_s"
proxies = {
"http": "http://scraperapi:8355bf750256f87924cb321115d06996@proxy-server.scraperapi.com:8001"
}
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

response = requests.get(url,headers=headers,proxies=proxies,verify=False)
API_KEY = "8355bf750256f87924cb321115d06996"
params = {'api_key': API_KEY, 'url': url}
response = requests.get('http://api.scraperapi.com/', params=urlencode(params))
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# print(soup)
# Create an instance of CustomExtractor
extractor = CustomExtractor()


authors_list=extractor.get_authors(soup);
print("authors  are")

if authors_list:
    
    for author in authors_list:
        print(author)
       
else:
    print("No authors found.")
date=extractor.publishing_date(url,soup)

print("date of publication:",date)

title_list=extractor.get_title(soup);
print("title  is/are")

print(extractor.get_title(soup))


# Calculate best node based on custom gravity scores
best_node = extractor.calculate_best_node(soup)

print("\n\n")
# Now, you can access the best node and its gravity score
if best_node:
    print(f"Best Node: {best_node['class']}, Gravity Score: {best_node.get('score', 0)}")
    print(best_node.get_text(strip=True))
else:
    print("No best node found.")




authors  are
Jiamun Koh
22 Jan 2024 10:01AM(Updated: 22 Jan 2024 03:53PM)
date of publication: 2024-01-22 10:01:00
title  is/are
Actress Nurul Aini responds to viral video of husband allegedly at hotel with another actress



Best Node: ['dialog-off-canvas-main-canvas'], Gravity Score: 7789.21875
Edition MenuEdition:Go to CNACNA LuxuryHamburger MenuCloseMain navigationEntertainmentEntertainmentEntertainmentZoe Tay celebrates her 56th birthday with residents from Lee Ah Mooi Old Age HomeEntertainmentAdditional tickets for Bruno Mars, Taylor Swift concerts in Singapore to go on sale on Jan 25EntertainmentActress Nurul Aini responds to viral video of husband checking into hotel with another actressEntertainmentYES 933 DJ Hazelle Teo spent S$40,000 to produce her Chinese New Year songCelebrityCulture & TrendsMusicTelevision & MoviesWomenWomenWomenHow Malay bridal entrepreneur Yumi Ayummi became a hit among millennials and Gen Z newlyweds-to-beWomenMeet the 39-year-old professional dancer w