In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
from natsort import natsorted

def parse_val(text):
    """Convert a shorthand count such as "2.5K", "18k others" or "1,234" to an int.

    The first number found in ``text`` is used. It may carry comma thousands
    separators and may be directly followed by a ``K``, ``M`` or ``B`` suffix
    (case-insensitive), which scales it by a thousand, a million or a billion.

    Args:
        text: String that may contain a count. ``None`` and ``""`` are accepted.

    Returns:
        The count as an ``int`` (any remaining fraction is truncated), or 0
        when ``text`` is empty or contains no digits.
    """
    # Local import keeps this block self-contained; Decimal gives exact scaling.
    from decimal import Decimal

    if not text:
        return 0
    text = text.upper().strip()

    # The comma-grouped alternative is tried first so "1,234" is not cut off at
    # the comma. Commas that merely separate names ("A, B and 18K others") are
    # not followed by three digits and therefore never match it.
    match = re.search(r'(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+\.?\d*)([KMB]?)', text)
    if not match:
        return 0

    digits = match.group(1).replace(',', '')
    scale = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}.get(match.group(2), 1)

    # Binary floats can land just below the true product (32.3 * 1000 is
    # 32299.999...), and int() would then truncate to the wrong value.
    return int(Decimal(digits) * scale)

def parse_reaction_count(story):
    """Return the reaction total for one story element, or 0 if none is found.

    Two layouts are recognised, checked in this order:

    1. A ``<span>`` whose string contains "others", e.g.
       "Person A, Person B and 18K others". The result is the parsed number
       plus one for every comma-separated name in front of " and ".
    2. A ``<div>`` whose string matches "All reactions" (case-insensitive).
       The count is read from a ``<span class="xt0b8zv">`` located under that
       div's parent.

    Args:
        story: Parsed element to search (must offer a BeautifulSoup-style
            ``find``).

    Returns:
        The reaction count as returned by ``parse_val`` (plus the name count
        in layout 1), or 0 when neither layout matches.
    """
    def mentions_others(s):
        return s and 'others' in s.lower()

    # Layout 1: "<names> and <N> others"
    others_span = story.find('span', string=mentions_others)
    if others_span:
        label = others_span.get_text(strip=True)
        leading_names = label.split(' and ')[0]
        # Every comma separates two names, so names = commas + 1.
        return parse_val(label) + leading_names.count(',') + 1

    # Layout 2: "All reactions: <N>"
    reactions_div = story.find('div', string=re.compile("All reactions", re.I))
    if not reactions_div:
        return 0

    # The shorthand count sits in a span under the same parent
    # (class name taken from the sample markup -- may change; verify).
    count_span = reactions_div.find_parent().find('span', class_='xt0b8zv')
    return parse_val(count_span.get_text(strip=True)) if count_span else 0

def _count_from_node(node):
    """Return the count shown in ``node``'s text, or 0 when ``node`` is None."""
    if node is None:
        return 0
    # Commas are dropped first so "1,234 comments" is read as 1234; parse_val
    # then expands K/M/B shorthand ("1.2K comments" -> 1200).
    return parse_val(node.get_text(strip=True).replace(',', ''))


def parse_html(file_path):
    """Extract content, timestamp and engagement counts for every feed story.

    Each ``div[data-focus="feed_story"]`` element in the file yields one
    record. The fields are also printed, one per line, as a progress log.

    Args:
        file_path: Path to a UTF-8 encoded HTML file.

    Returns:
        A list with one dict per story, with keys ``content``, ``timestamp``,
        ``reactions``, ``comments`` and ``shares``. The list is empty when the
        file contains no stories.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    data_list = []
    for story in soup.select('div[data-focus="feed_story"]'):
        # --- Content --- (first matching container wins)
        content_node = (story.find('span', {'data-ad-rendering-role': 'description'}) or
                        story.find('div', {'data-ad-comet-preview': 'message'}) or
                        story.find('div', {'data-ad-preview': 'message'}))
        content = content_node.get_text(strip=True) if content_node else "N/A"

        # --- Reactions ---
        reaction_count = parse_reaction_count(story)

        # --- Timestamp ---
        # Prefer the timestamp inside this story so stories do not all share
        # the document's first one; fall back to a document-level lookup in
        # case the marker span sits outside the story element.
        ts_node = (story.find('span', class_='custom-timestamp') or
                   soup.find('span', class_='custom-timestamp'))
        timestamp = ts_node.get_text(strip=True) if ts_node else "Unknown"

        # --- Comments / Shares ---
        comments_count = _count_from_node(
            story.find('span', string=lambda x: x and 'comments' in x.lower()))
        shares_count = _count_from_node(
            story.find('span', string=lambda x: x and 'share' in x.lower()))

        print(content)
        print(timestamp)
        print(reaction_count)
        print(comments_count)
        print(shares_count)
        print('\n')

        # A dict keeps fields labelled and ordered; a set would drop
        # duplicates (e.g. equal comment and share counts) and lose the order.
        data_list.append({
            'content': content,
            'timestamp': timestamp,
            'reactions': reaction_count,
            'comments': comments_count,
            'shares': shares_count,
        })

    return data_list

In [2]:
directory_path = '../harvested/'
# os.listdir() order is arbitrary; natural sorting makes runs reproducible.
entries = natsorted(os.listdir(directory_path))

for e in entries:
    entry_path = os.path.join(directory_path, e)
    # Stray files in the root (e.g. .DS_Store) are skipped: calling
    # os.listdir() on them would raise NotADirectoryError.
    if not os.path.isdir(entry_path):
        continue
    files = natsorted(os.listdir(entry_path))

    print(e)
    for f in files:
        relative_path = os.path.join(entry_path, f)
        # Only regular HTML files are parsed; nested directories and other
        # file types would make parse_html() fail on open/decoding.
        if not (os.path.isfile(relative_path) and f.lower().endswith(('.html', '.htm'))):
            continue
        print(relative_path)
        parse_html(relative_path)

officialroutineofnepalbanda
../harvested/officialroutineofnepalbanda/[0].html
Event Alert: IDP ले भोली (21st Feb) UK Admission Day organize गर्दै छ जहाँ २०+ high ranking universities बाट directly interact गरी scholarships, top universities, र top courses का बारेमा जान्कारी पाउन सक्नु हुन्छ । Starting from 11 am to 4 pm.#CollabAdRegistration:https://forms.office.com/r/NLDSyPQPEh
Unknown
0
0
0


../harvested/officialroutineofnepalbanda/[1].html
मन शान्त बनाउने आरती: Sandhya Aarati in Pashupatinath Mandir!Photo. Nabin Sapkota
Friday, February 20, 2026 at 7:16 PM
3600
77
7


../harvested/officialroutineofnepalbanda/[2].html
"म तपाईँको शासक बन्न होइन, तपाईँकै छोरा र सेवक बनेर हिंजोसम्म सडकमा ल-ड्ने म तिनै आवाजलाइ संसदसम्म पुर्याउन आएको हुँ।" भन्दै बागलुङ - २ बाट उम्मेदवारी दिएका सोम शर्माले आफ्नो घरदैलो अभियान सन्चालन गरिरहेका छन ।
Friday, February 20, 2026 at 7:25 PM
1300
61
15


../harvested/officialroutineofnepalbanda/[3].html
लुकेका माया र त्यागका कथा हरेक गाउँ, शहर, टोलमा छन् , सुरुको 