In [1]:
import os
import re
from decimal import Decimal

import pandas as pd
from bs4 import BeautifulSoup
from natsort import natsorted

In [2]:
def parse_val(text):
    """Convert a human-readable count such as "2.5K" or "1,234" into an int.

    The first number found in ``text`` is used; anything around it is
    ignored. A ``K``, ``M`` or ``B`` directly after the number (either
    case) scales it by a thousand, a million or a billion.

    Args:
        text: String containing the count; may be ``None`` or empty.

    Returns:
        The count as an int, truncated toward zero. ``0`` when ``text`` is
        falsy or contains no digits.
    """
    if not text:
        return 0
    text = text.upper().strip()

    # First alternative: comma-grouped thousands ("1,234", "12,345.6").
    # Second alternative: plain integer or decimal ("18", "2.5").
    match = re.search(
        r'(\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+\.?\d*)([KMB]?)',
        text,
    )
    if not match:
        return 0

    digits = match.group(1).replace(',', '')
    scale = {'K': 10**3, 'M': 10**6, 'B': 10**9}.get(match.group(2), 1)

    # Decimal keeps the scaling exact; with floats, 1.005 * 1000 evaluates to
    # 1004.9999999999999 and int() would truncate it to 1004.
    return int(Decimal(digits) * scale)

In [3]:
def parse_reaction_count(story):
    """Return the total number of reactions shown on one post element.

    Two layouts are recognised:

    1. A span reading like "Person A, Person B and 18K others": the result
       is the "others" count plus one for every comma-separated name in
       front of the last " and ".
    2. A div whose text matches "All reactions" with a count span (class
       ``xt0b8zv``) somewhere under the same parent.

    Args:
        story: Element exposing a BeautifulSoup-style ``find`` method
            (``parse_html`` passes the elements returned by ``soup.select``).

    Returns:
        The reaction count as an int, or ``0`` when neither layout is found.
    """
    # Case 1: "Person A, Person B and 18K others"
    node_others = story.find('span', string=lambda x: x and 'others' in x.lower())
    if node_others:
        text = node_others.get_text(strip=True)
        # Split on the LAST " and " so only the trailing "<count> others"
        # segment is parsed as a number; digits inside a name are ignored.
        names_part, separator, others_part = text.rpartition(' and ')
        if not separator:
            # No named reactors precede the count, so there is nothing to add.
            return parse_val(text)
        named_reactors = [name for name in names_part.split(',') if name.strip()]
        return parse_val(others_part) + len(named_reactors)

    # Case 2: "All reactions:" label with the shorthand count in a nearby span
    all_reactions_label = story.find('div', string=re.compile("All reactions", re.I))
    if all_reactions_label:
        container = all_reactions_label.find_parent()
        # NOTE(review): 'xt0b8zv' looks like a generated class name and may
        # change between page versions - confirm against current markup.
        count_node = container.find('span', class_='xt0b8zv') if container else None
        if count_node:
            return parse_val(count_node.get_text(strip=True))

    return 0

In [4]:
def _parse_engagement_count(story, keyword):
    """Return the count from the first span in ``story`` mentioning ``keyword``.

    Only spans whose text contains both ``keyword`` (case-insensitive) and at
    least one digit are considered, so a bare label without a number is
    skipped. The text is converted with ``parse_val``, which also handles
    shorthand such as "1.2K". Returns ``0`` when no such span exists.
    """
    node = story.find(
        'span',
        string=lambda s: s and keyword in s.lower() and any(ch.isdigit() for ch in s),
    )
    return parse_val(node.get_text(strip=True)) if node else 0


def parse_html(file_path):
    """Extract content and engagement counts for every post in a saved page.

    Each ``div[data-focus="feed_story"]`` element in the file is treated as
    one post. The extracted values are printed (content, reactions, comments,
    shares, then a blank separator) and collected.

    Args:
        file_path: Path to a UTF-8 encoded HTML file.

    Returns:
        A list with one dict per post, with keys ``'content'`` (str, "N/A"
        when no text node is found), ``'reactions'``, ``'comments'`` and
        ``'shares'`` (ints). The list is empty when the file has no posts.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    data_list = []

    for story in soup.select('div[data-focus="feed_story"]'):
        # --- Content: try each known container for the post text in turn ---
        content_node = (story.find('span', {'data-ad-rendering-role': 'description'}) or
                        story.find('div', {'data-ad-comet-preview': 'message'}) or
                        story.find('div', {'data-ad-preview': 'message'}))
        content = content_node.get_text(strip=True) if content_node else "N/A"

        record = {
            'content': content,
            'reactions': parse_reaction_count(story),
            # Match on the singular stem so "1 comment" / "1 share" count too.
            'comments': _parse_engagement_count(story, 'comment'),
            'shares': _parse_engagement_count(story, 'share'),
        }

        print(record['content'])
        print(record['reactions'])
        print(record['comments'])
        print(record['shares'])
        print('\n')

        # Accumulate instead of returning inside the loop, so every post in
        # the file is processed rather than only the first one.
        data_list.append(record)

    return data_list

In [6]:
# Root folder holding one sub-folder per harvested source, each containing
# the saved HTML pages.
directory_path = '../harvested/'
entries = os.listdir(directory_path)

# os.listdir returns names in arbitrary order; natural sorting makes the run
# order (and therefore the printed output) reproducible.
for entry in natsorted(entries):
    entry_path = os.path.join(directory_path, entry)
    if not os.path.isdir(entry_path):
        # Stray files next to the source folders cannot be listed.
        continue

    files = natsorted(os.listdir(entry_path))

    print(entry)
    for file_name in files:
        relative_path = os.path.join(entry_path, file_name)
        # Only regular HTML files are parsed; sub-folders and other file
        # types would make parse_html fail on open() or on UTF-8 decoding.
        is_html = file_name.lower().endswith(('.html', '.htm'))
        if not (is_html and os.path.isfile(relative_path)):
            continue
        print(relative_path)
        parse_html(relative_path)

nebresultandnews0
../harvested/nebresultandnews0/[0].html
एमालेको झण्डा जलाएको विरोधमा युवा संघले देशभर राँ!के जु!लुस निकाल्ने भएको छ।
3100
201
8


../harvested/nebresultandnews0/[1].html
Stay Healthy: काठमाडौं उपत्यकामा अत्यधिक वायु प्रदूषण बढेको छ। बिनाकाम घर बाहिर नजानुहोला, जानैपरे अनिवार्य मास्क लगाउने गरौं !
1200
28
2


../harvested/nebresultandnews0/[2].html
सर्वोच्च अदालतले संसद विघटनविरुद्धको रिटलाई चुनावको भोलिपल्ट अर्थात् फागुन २२ गते तारेख दिएको छ।
5100
145
32


../harvested/nebresultandnews0/[3].html
Balen Shah को नागरिकता रद्द गर्न माग गर्दै Yubaraj Saphal ले गृह मन्त्रालयमा उजुरी दिएका छन्।
9600
590
37


../harvested/nebresultandnews0/[4].html
मनका धनी: TikToker Coolboyyy ले क!लेजोका बि!रामीलाई उपचारका लागि १ ला!ख रुपै!याँ सह!योग गरेका छन्।
7400
79
2


../harvested/nebresultandnews0/[5].html
गरिबीलाइ रोकेन चुनाबले: गएको 7 महिनामा 4 लाख 72 हजार नेपालीले रोज’गारीका लागि नेपाल छोडेका छन।
5500
138
28


../harvested/nebresultandnews0/[6].html
Kathmandu ma clz padhne kt lai pi