In [58]:
import json
import time
import re
from urllib.parse import urljoin, urlparse, parse_qs, urlencode
import requests
from bs4 import BeautifulSoup

In [59]:
# Run configuration read by main().
# Cookie file that main() loads into the HTTP session.
COOKIES_FILE = "cookies.json"         # file produced by fb_save_cookies.py
# Path of the JSON file that main() writes the parsed posts to.
OUTPUT_FILE = "hnmu_posts_3.json"
# URL of the mbasic timeline page that main() fetches and parses.
PAGE_M_BASIC = "https://mbasic.facebook.com/profile.php?id=61555234277669&v=timeline&__mmr=1&_rdr"

In [60]:
# ---------- cookie loader (supports Selenium output) ----------
def load_cookies_to_session(session, cookies_file):
    """Copy cookies from a JSON file into ``session.cookies``.

    The file must contain an iterable of dict-like entries. Each entry is read
    through its ``name``, ``value``, ``domain`` and ``path`` keys (``path``
    defaults to ``"/"``). Entries with an empty/missing name or a ``None``
    value are skipped. A one-line summary is printed when done.

    Args:
        session: Object exposing a ``cookies`` jar with ``set(name, value, **kw)``
            and ``len()`` support (e.g. ``requests.Session``).
        cookies_file: Path of the UTF-8 encoded JSON cookie file.

    Returns:
        None.
    """
    with open(cookies_file, "r", encoding="utf-8") as fh:
        entries = json.load(fh)

    loaded = 0
    for entry in entries:
        name, value = entry.get("name"), entry.get("value")
        if value is None or not name:
            continue

        # ``domain`` is only forwarded when the entry carries a non-empty one;
        # otherwise the cookie is stored without an explicit domain.
        set_kwargs = {"path": entry.get("path", "/")}
        domain = entry.get("domain")
        if domain:
            set_kwargs["domain"] = domain

        session.cookies.set(name, value, **set_kwargs)
        loaded += 1

    print(f"Loaded {loaded} cookies into session (cookie jar size: {len(session.cookies)})")

In [61]:
# ---------- check login ----------
def is_logged_in(session, test_url="https://mbasic.facebook.com/"):
    """Heuristically decide whether ``session`` is authenticated.

    Fetches ``test_url`` and inspects the lower-cased body for login-form
    phrases.

    Args:
        session: Object with a ``get(url, timeout=...)`` method returning a
            response that has ``status_code`` and ``text``.
        test_url: Page to request for the check.

    Returns:
        A ``(ok, note)`` tuple: ``(True, "ok")`` when no login marker is found,
        otherwise ``(False, reason)`` where ``reason`` describes the request
        error, the non-200 status, or the detected login page.
    """
    try:
        response = session.get(test_url, timeout=20)
    except Exception as e:
        # Any failure to perform the request counts as "not logged in".
        return False, f"request error: {e}"

    if response.status_code != 200:
        return False, f"status {response.status_code}"

    page = response.text.lower()
    # Heuristic: any of these phrases means the login form was served.
    login_markers = ("log in to facebook", "email or phone", "đăng nhập")
    if any(marker in page for marker in login_markers):
        return False, "login page detected"

    # No marker found: assume the session is logged in.
    return True, "ok"

In [62]:
# Default User-Agent: an iPhone / Mobile Safari identification string.
MOBILE_UA = (
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
)

# Headers that main() installs on the session via ``headers.update``:
# the mobile User-Agent, HTML-first Accept, Vietnamese-first Accept-Language,
# and Cache-Control/Pragma set to "no-cache".
BASE_HEADERS = {
    "User-Agent": MOBILE_UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "vi-VN,vi;q=0.9,en-US;q=0.8,en;q=0.7",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
}

def get_html(session, url, timeout=20):
    """Fetch ``url`` with ``session`` and return the response body as text.

    If the first response contains the "browser not supported" warning, the
    session's ``User-Agent`` header is replaced with an Android Chrome string
    (the change stays on the session) and the request is issued once more.
    Only the final response has its HTTP status checked.

    Args:
        session: Object with ``get(url, timeout=..., allow_redirects=...)`` and
            a mutable ``headers`` mapping (e.g. ``requests.Session``).
        url: Address to download.
        timeout: Per-request timeout passed through to ``session.get``.

    Returns:
        The text of the final response.

    Raises:
        Whatever ``raise_for_status()`` of the final response raises.
    """
    response = session.get(url, timeout=timeout, allow_redirects=True)
    body_lower = response.text.lower()

    # Warning page detected -> retry with a different (Android) User-Agent.
    warning_markers = (
        "facebook is not available on this browser",
        "/help/1570260493519783",
    )
    if any(marker in body_lower for marker in warning_markers):
        session.headers["User-Agent"] = (
            "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0 Mobile Safari/537.36"
        )
        response = session.get(url, timeout=timeout, allow_redirects=True)

    response.raise_for_status()
    return response.text


In [63]:
# ---------- helpers ----------
def clean_post_url(url):
    """Return ``url`` without tracking query parameters.

    Removed parameters: every key starting with ``__cft__`` plus ``__tn__``,
    ``comment_id``, ``mibextid`` and ``refid``. The fragment and path
    parameters (``;...``) are dropped as well, and parameters with blank
    values are discarded by ``parse_qs``.

    The result is rebuilt from the parsed components, so relative or
    scheme-less URLs stay relative instead of being turned into a malformed
    ``:///path`` string.

    Args:
        url: Absolute or relative URL string; falsy values are returned as is.

    Returns:
        The cleaned URL string, or ``url`` unchanged when it is falsy.
    """
    if not url:
        return url

    tracking_keys = {"__tn__", "comment_id", "mibextid", "refid"}
    parts = urlparse(url)
    kept = {
        key: values
        for key, values in parse_qs(parts.query).items()
        if not (key.startswith("__cft__") or key in tracking_keys)
    }
    query = urlencode(kept, doseq=True)

    # geturl() only emits "scheme://netloc" and "?" when those parts exist,
    # which keeps relative inputs well-formed.
    return parts._replace(params="", query=query, fragment="").geturl()


In [64]:
# Matches a whole string consisting of digits, optional whitespace and one unit
# letter out of s/m/h/d/w/y (case-insensitive), e.g. "3h" or "12 m".
# Presumably these are relative timestamps shown next to posts - confirm.
TIME_RE = re.compile(r"^\d+\s*(s|m|h|d|w|y)$", re.I)
# UI action labels (English and Vietnamese) that is_noise_line() discards;
# lines are lower-cased before being compared against this set.
NOISE_WORDS = {"like","reply","share","comment","send","follow","thích","trả lời","chia sẻ","bình luận"}

In [65]:
def is_noise_line(s):
    """Tell whether a text line carries no useful content.

    After stripping surrounding whitespace, a line is noise when it is empty,
    matches ``TIME_RE``, consists only of digits, equals (case-insensitively)
    an entry of ``NOISE_WORDS``, or is at most two characters long.

    Args:
        s: The text line to classify.

    Returns:
        ``True`` for noise, ``False`` otherwise.
    """
    line = s.strip()
    return (
        not line
        or bool(TIME_RE.match(line))
        or line.isdigit()
        or line.lower() in NOISE_WORDS
        or len(line) <= 2
    )

In [66]:
# ---------- parsing per-article ----------
def extract_segments_from_article(soup_article):
    """Return the noise-filtered text segments (post + inline comments) of a card.

    Every ``div``/``span`` descendant of the ``div.story_body_container``
    element (or of the whole card when that container is absent) contributes
    its whitespace-normalised text once, in document order, unless
    ``is_noise_line`` rejects it. When nothing survives, the whole card's
    normalised text is returned as a single segment.

    NOTE(review): descendants are visited recursively, so a container's text
    and the text of its nested children can both appear as separate segments.

    Args:
        soup_article: Parsed element for one post card (a BeautifulSoup tag).

    Returns:
        List of unique text segments; empty when the card has no text at all.
    """
    # Prefer the story body container when the card has one.
    container = soup_article.select_one("div.story_body_container")
    root = container if container else soup_article
    elements = root.find_all(["div", "span"], recursive=True)

    seen = set()
    collected = []
    for element in elements:
        try:
            raw = element.get_text(separator=" ", strip=True)
            normalized = re.sub(r"\s+", " ", raw).strip() if raw else ""
            if not normalized or is_noise_line(normalized) or normalized in seen:
                continue
            seen.add(normalized)
            collected.append(normalized)
        except Exception:
            # Best effort: an element that fails to render is simply skipped.
            continue

    if collected:
        return collected

    # Fallback: nothing usable found, so take the text of the whole card.
    whole_text = soup_article.get_text(separator="\n", strip=True)
    whole_text = re.sub(r"\s+", " ", whole_text).strip()
    return [whole_text] if whole_text else collected

In [67]:
# Substrings that mark an href as pointing to a post; matched against the
# lower-cased href by is_post_link().
POST_HREF_PATTERNS = (
    "/permalink.php",
    "/story.php",
    "/posts/",
    "/photo.php",
    "/video.php",
    "/groups/",
)


def is_post_link(href: str) -> bool:
    """Return whether ``href`` looks like a link to a post.

    The check is case-insensitive. Falsy input yields ``False``; links to the
    browser-warning help page are always rejected; otherwise the href must
    contain at least one entry of ``POST_HREF_PATTERNS``.
    """
    lowered = href.lower() if href else ""
    if "help/1570260493519783" in lowered:
        # Link to the warning page, never a post.
        return False
    for pattern in POST_HREF_PATTERNS:
        if pattern in lowered:
            return True
    return False


In [68]:
def parse_feed_page(html, base_url):
    """Parse one feed page into a list of post records.

    Candidate cards are the elements matching
    ``article, div[data-ft], div[role='article']``. A card is kept only when it
    contains an anchor accepted by ``is_post_link`` and yields at least one
    text segment.

    NOTE(review): the selector may match elements nested inside each other;
    no de-duplication is performed here - confirm whether that is needed.

    Args:
        html: HTML source of the page.
        base_url: URL used to resolve relative hrefs.

    Returns:
        List of dicts with keys ``post_url`` (cleaned absolute URL),
        ``segments`` (list of text segments) and ``post_text`` (segments
        joined by newlines).
    """
    document = BeautifulSoup(html, "html.parser")
    posts = []

    for card in document.select("article, div[data-ft], div[role='article']"):
        # The first anchor that passes is_post_link is treated as the permalink.
        permalink = next(
            (
                anchor
                for anchor in card.find_all("a", href=True)
                if is_post_link(anchor["href"])
            ),
            None,
        )
        if not permalink:
            # Header/footer/logo/help blocks carry no post link; skip them.
            continue

        post_url = clean_post_url(urljoin(base_url, permalink["href"]))
        segments = extract_segments_from_article(card)
        if not segments:
            continue

        posts.append({
            "post_url": post_url,
            "segments": segments,
            "post_text": "\n".join(segments)
        })

    return posts

In [69]:
def main():
    """Scrape the configured timeline page and save the parsed posts as JSON.

    Steps: build a session with ``BASE_HEADERS``, load cookies from
    ``COOKIES_FILE``, verify the login heuristically, download
    ``PAGE_M_BASIC`` once through ``get_html`` (which includes the
    User-Agent fallback), parse it, and write the result to ``OUTPUT_FILE``.

    The function prints progress messages and returns ``None``. It returns
    early, without writing the output file, when the login check or the page
    request fails.
    """
    session = requests.Session()
    session.headers.update(BASE_HEADERS)
    load_cookies_to_session(session, COOKIES_FILE)

    ok, note = is_logged_in(session)
    if not ok:
        print("Cookies không hợp lệ/đã hết hạn. Hãy chạy fb_save_cookies.py để tạo cookies mới.")
        # Surface the concrete reason (request error, status, login page).
        print("Login check failed:", note)
        return

    # Fetch the page exactly once; get_html already retries with another
    # User-Agent when the browser-warning page is served.
    print("Requesting feed:", PAGE_M_BASIC)
    try:
        html = get_html(session, PAGE_M_BASIC)
    except requests.RequestException as exc:
        # Report the HTTP status when a response exists, else the error itself.
        status = getattr(exc.response, "status_code", None)
        print("Request failed:", status if status is not None else exc)
        return

    results = parse_feed_page(html, PAGE_M_BASIC)
    print("Found posts:", len(results))

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("Saved to", OUTPUT_FILE)

In [70]:
# Entry point: run the scraper when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Loaded 8 cookies into session (cookie jar size: 8)
Requesting feed: https://mbasic.facebook.com/profile.php?id=61555234277669&v=timeline&__mmr=1&_rdr
Found posts: 0
Saved to hnmu_posts_3.json
