In [1]:
import numpy as np
import pandas as pd
import feedparser
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta, timezone
from dateutil import parser
import time
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.schema import Document as LangchainDocument

Initialize HuggingFace BGE model for text embeddings

In [2]:
# Embedding model configuration: the BAAI/bge-m3 checkpoint on the CUDA device.
model_name = "BAAI/bge-m3"
model_kwargs = dict(device='cuda')
# Ask for normalised output vectors (the original note: normalise for cosine similarity).
encode_kwargs = dict(normalize_embeddings=True)

# Shared embedding object reused by every vector store and query in this notebook.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)


# Read Labelled News for Embedding Vectors

In [3]:
# Load the labelled news used to build per-category reference embeddings.
# na_filter=False keeps empty cells as "" instead of NaN, which the empty-summary
# check below relies on.
mdata = pd.read_csv('NewsDataSave.csv', na_filter=False)

# ev_cat[category] -> [positive_vectors, negative_vectors]
# (index 0 holds rows with Negative == 0, index 1 holds rows with Negative == 1)
ev_cat = {}

for cat in mdata.Categories.unique():

    print(cat)

    # Filter once per category instead of re-filtering the whole frame for every URL.
    cat_rows = mdata.loc[mdata.Categories == cat]

    ev_list = []

    for neg_sample in [0, 1]:

        print("Positive Sample..." if neg_sample == 0 else "Negative Sample...")

        sample_rows = cat_rows.loc[cat_rows.Negative == neg_sample]

        sample_list = []

        # Walk the rows themselves rather than looking each row up again by its link:
        # a link that occurs more than once now contributes each of its own rows,
        # instead of the first matching row being embedded repeatedly.
        for title, content in zip(sample_rows['Titles'], sample_rows['Summary']):

            if content == "":
                raise ValueError("No text summary!")

            # One embedding per labelled article, from its title and summary text
            sample_list.append(embeddings.embed_query(title + ' ' + content))

        ev_list.append(sample_list)

    ev_cat[cat] = ev_list


Elderly
Positive Sample...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Negative Sample...
Housing
Positive Sample...
Negative Sample...
Land Development
Positive Sample...
Negative Sample...
Link REIT
Positive Sample...
Negative Sample...
Pension
Positive Sample...
Negative Sample...
Property
Positive Sample...
Negative Sample...
Public Finance
Positive Sample...
Negative Sample...
Urban Renewal
Positive Sample...
Negative Sample...


# Read News Sources

## Read Mingpao News

Process Mingpao RSS Feeds

In [4]:
# Mingpao daily-news RSS feeds, one URL per section
rss_list = [
    "https://news.mingpao.com/rss/pns/s00001.xml",  # Top news
    "https://news.mingpao.com/rss/pns/s00002.xml",  # Hong Kong news
    "https://news.mingpao.com/rss/pns/s00003.xml",  # Editorial
    "https://news.mingpao.com/rss/pns/s00004.xml",  # Economy
    "https://news.mingpao.com/rss/pns/s00005.xml",  # Supplement
    "https://news.mingpao.com/rss/pns/s00011.xml",  # Education
    "https://news.mingpao.com/rss/pns/s00012.xml",  # Opinion
    "https://news.mingpao.com/rss/pns/s00013.xml",  # China
    "https://news.mingpao.com/rss/pns/s00014.xml",  # International
    "https://news.mingpao.com/rss/pns/s00015.xml",  # Sports
    "https://news.mingpao.com/rss/pns/s00016.xml",  # Entertainment
    "https://news.mingpao.com/rss/pns/s00017.xml",  # English
    "https://news.mingpao.com/rss/pns/s00018.xml",  # Columnists
]

# Fetch each feed in turn and pool all of their entries into one flat list
feed_list = []
for rss in rss_list:
    feed = feedparser.parse(rss)
    feed_list.extend(feed.entries)

Filter for today's news only

In [5]:
# Current wall-clock time at UTC+8
hk_tz = timezone(timedelta(hours=8))
today = datetime.now(hk_tz)
print("Today's date:", today.date())

# Cut-off: 00:00 today at UTC+8, re-expressed as a UTC timestamp
today = today.replace(hour=0, minute=0, second=0, microsecond=0).astimezone(timezone.utc)

# Keep only the entries published at or after the cut-off
feed_list = [entry for entry in feed_list if parser.parse(entry.published) >= today]

print('No. of news:', len(feed_list))

Today's date: 2025-07-03
No. of news: 141


Convert RSS entries to LangChain document format

In [6]:
# Wrap every Mingpao RSS entry as a LangChain document; the text that gets
# embedded is the title followed by the feed summary.
documents = []
for entry in feed_list:
    # Article link with its final "/"-separated segment removed
    source_url = "/".join(entry.link.split("/")[:-1])
    documents.append(
        LangchainDocument(
            page_content=entry.title + " " + entry.summary,
            metadata={
                "source": source_url,
                "title": entry.title,
                "newspaper": "明報",
            },
        )
    )

 Build vector store for Mingpao documents

In [7]:
# Index the Mingpao documents in a FAISS store using the shared embedding model.
# Strategy names listed in the original note:
# EUCLIDEAN_DISTANCE / MAX_INNER_PRODUCT / DOT_PRODUCT / JACCARD / COSINE
db = FAISS.from_documents(
    documents=documents,
    embedding=embeddings,
    distance_strategy=DistanceStrategy.COSINE,
)

# Show the strategy recorded on the store
db.distance_strategy

<DistanceStrategy.COSINE: 'COSINE'>

## Read Oriental Daily News

Scrape Oriental Daily News Site

In [8]:
# The sitemap URL is keyed by calendar date. The other date handling in this
# notebook uses a UTC+8 clock, so do the same here instead of relying on the
# machine's local timezone (a host in another zone could otherwise request the
# wrong day's page).
today = datetime.now(timezone(timedelta(hours=8)))

url = r'https://orientaldaily.on.cc/section/sitemap/' + today.date().strftime('%Y%m%d')

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception here rather than an
# empty item list that only fails later in the notebook.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# Each <li class="item"> on the sitemap page is treated as one article entry
items = soup.find_all('li', attrs={'class': 'item'})

Scrape individual news articles

In [9]:
# Per-request timeout (seconds) so a stalled connection cannot hang the whole loop
REQUEST_TIMEOUT = 30

# One [title, link, content] triple per scraped Oriental Daily article
on_list = []

for item in items:
    anchor = item.find('a')
    # Skip sitemap entries with no usable link instead of crashing on None['href']
    if anchor is None or not anchor.get('href'):
        continue

    title = item.text
    link = r'https://orientaldaily.on.cc' + anchor['href']

    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Retry once after a short pause. Only network errors are caught, so
        # KeyboardInterrupt and programming errors still propagate; a second
        # failure is allowed to raise, as before.
        time.sleep(3)
        response = requests.get(link, timeout=REQUEST_TIMEOUT)

    soup = BeautifulSoup(response.text, 'html.parser')
    paras = soup.find_all('div', class_='paragraph')

    # Concatenate the text of every <div class="paragraph">, then drop all whitespace
    content = ''.join(p.get_text() for p in paras)
    content = ''.join(content.split())

    on_list.append([title, link, content])

In [10]:
# Report how many Oriental Daily entries were collected
news_count = len(on_list)
print('No. of news:', news_count)

No. of news: 193


Convert to LangChain document format

In [11]:
# Wrap each scraped [title, link, content] triple as a LangChain document;
# the embedded text is the title followed by the article body.
on_documents = []
for entry in on_list:
    headline, article_url, body = entry
    # Link with its final "/"-separated segment removed, then a trailing "/" appended
    source_url = "/".join(article_url.split("/")[:-1]) + "/"
    on_documents.append(
        LangchainDocument(
            page_content=headline + ' ' + body,
            metadata={
                "source": source_url,
                "title": headline,
                "newspaper": "東方日報",
            },
        )
    )

In [12]:
# Index the Oriental Daily documents in a FAISS store using the shared embedding model.
# Strategy names listed in the original note:
# EUCLIDEAN_DISTANCE / MAX_INNER_PRODUCT / DOT_PRODUCT / JACCARD / COSINE
on_db = FAISS.from_documents(
    documents=on_documents,
    embedding=embeddings,
    distance_strategy=DistanceStrategy.COSINE,
)

# Show the strategy recorded on the store
on_db.distance_strategy

<DistanceStrategy.COSINE: 'COSINE'>

## Read HK Gov News

Parse HK Government News Feed

In [13]:
# Hong Kong Government RSS feed(s) to read
rss_list_hkgov = [
    "https://www.info.gov.hk/gia/rss/general_zh.xml",  # Press releases
]

# Fetch each feed and pool the entries into one flat list
feed_list_hkgov = []
for rss in rss_list_hkgov:
    feed = feedparser.parse(rss)
    feed_list_hkgov.extend(feed.entries)

Filter for news published since the start of yesterday (00:00 Hong Kong time, UTC+8)

In [14]:
# Same moment one day ago, on a UTC+8 clock
hk_now = datetime.now(timezone(timedelta(hours=8)))
yesterday = hk_now - timedelta(days=1)
print("Yesterday's date:", yesterday.date())

# Cut-off: 00:00 of that day at UTC+8, re-expressed as a UTC timestamp
yesterday = yesterday.replace(hour=0, minute=0, second=0, microsecond=0).astimezone(timezone.utc)

# Keep only the entries published at or after the cut-off
feed_list_hkgov = [entry for entry in feed_list_hkgov if parser.parse(entry.published) >= yesterday]

print('No. of news:', len(feed_list_hkgov))

Yesterday's date: 2025-07-02
No. of news: 50


Convert to LangChain document format

In [15]:
# Wrap each HK Gov feed entry as a LangChain document; the embedded text is the
# title followed by the plain-text summary.
hkgov_documents = []
for entry in feed_list_hkgov:
    # The summary is parsed as HTML: keep only its text and remove newlines
    soup = BeautifulSoup(entry.summary, 'html.parser')
    entry_summary = soup.text.replace("\n", "")

    hkgov_documents.append(
        LangchainDocument(
            page_content=entry.title + ' ' + entry_summary,
            metadata={
                "source": entry.link,
                "title": entry.title,
                "newspaper": "新聞公報",
            },
        )
    )

In [16]:
# Index the HK Gov documents in a FAISS store using the shared embedding model.
# Strategy names listed in the original note:
# EUCLIDEAN_DISTANCE / MAX_INNER_PRODUCT / DOT_PRODUCT / JACCARD / COSINE
gia_db = FAISS.from_documents(
    documents=hkgov_documents,
    embedding=embeddings,
    distance_strategy=DistanceStrategy.COSINE,
)

# Show the strategy recorded on the store
gia_db.distance_strategy

<DistanceStrategy.COSINE: 'COSINE'>

# Perform Similarity Searches

## Search for relevant Mingpao news articles

In [17]:
# For every labelled category, query the Mingpao store with that category's
# reference direction and print the hits.
# NOTE(review): in the saved output of this notebook the printed scores are
# ascending and all below 0.7, so the threshold appears to act as an upper bound
# on a distance (lower = closer) - confirm against the FAISS wrapper in use.
for cat in mdata.Categories.unique():

    print(cat)

    # ev_cat[cat] holds [positive_vectors, negative_vectors]; sum each group
    ev_pos, ev_neg = (np.sum(vectors, axis=0) for vectors in ev_cat[cat])

    # Query direction: positives minus negatives, scaled to unit length
    ev = ev_pos - ev_neg
    ev = ev / np.linalg.norm(ev)

    hits = db.similarity_search_with_score_by_vector(ev, k=10, score_threshold=0.7)
    for doc, score in hits:
        print(str(score) + ": " + doc.metadata["title"] + " " + doc.metadata["source"])

Elderly
Housing
0.6710038: 王柏林、牛致行、宋恩榮：居屋二手市場流通窒息 買賣限制須大幅放寬 https://news.mingpao.com/pns/%e8%a7%80%e9%bb%9e/article/20250703/s00012/1751474124497
Land Development
0.6205528: 新地米埔申建2322伙 規劃署不反對  較12年前獲批71幢洋房大增31倍 https://news.mingpao.com/pns/%e7%b6%93%e6%bf%9f/article/20250703/s00004/1751474133250
Link REIT
Pension
Property
0.6610782: 前環球唱片黃劍濤928萬沽囍滙1房戶 https://news.mingpao.com/pns/%e7%b6%93%e6%bf%9f/article/20250703/s00004/1751474134191
0.66451335: 先達120呎舖150萬沽  20年蝕44% https://news.mingpao.com/pns/%e7%b6%93%e6%bf%9f/article/20250703/s00004/1751474134284
0.6896319: 德祥上鄉道工廈轉住宅 每呎補價1060元 https://news.mingpao.com/pns/%e7%b6%93%e6%bf%9f/article/20250703/s00004/1751474133612
Public Finance
Urban Renewal


## Search for relevant Oriental Daily news

In [18]:
# For every labelled category, query the Oriental Daily store with that
# category's reference direction and print the hits.
# NOTE(review): in the saved output of this notebook the printed scores are
# ascending and all below 0.7, so the threshold appears to act as an upper bound
# on a distance (lower = closer) - confirm against the FAISS wrapper in use.
for cat in mdata.Categories.unique():

    print(cat)

    # ev_cat[cat] holds [positive_vectors, negative_vectors]; sum each group
    ev_pos, ev_neg = (np.sum(vectors, axis=0) for vectors in ev_cat[cat])

    # Query direction: positives minus negatives, scaled to unit length
    ev = ev_pos - ev_neg
    ev = ev / np.linalg.norm(ev)

    hits = on_db.similarity_search_with_score_by_vector(ev, k=10, score_threshold=0.7)
    for doc, score in hits:
        print(str(score) + ': ' + doc.metadata["title"] + " " + doc.metadata["source"])

Elderly
Housing
0.50589514: 推行簡樸房  過渡屋倡增「丙類租戶」 https://orientaldaily.on.cc/content/要聞港聞/odn-20250703-0703_00176_043/
0.655644: 私人樓宇滲漏嚴重  3年接投訴13萬宗 https://orientaldaily.on.cc/content/要聞港聞/odn-20250703-0703_00176_041/
0.66969156: 收回精進3地盤  房會覓承建商 https://orientaldaily.on.cc/content/要聞港聞/odn-20250703-0703_00176_044/
Land Development
0.61173785: 議員憂變「特特區」  北都管理局告吹 https://orientaldaily.on.cc/content/要聞港聞/odn-20250703-0703_00176_201/
0.66903806: 收回精進3地盤  房會覓承建商 https://orientaldaily.on.cc/content/要聞港聞/odn-20250703-0703_00176_044/
0.6820229: 地產速報一覽 https://orientaldaily.on.cc/content/產經/odn-20250703-0703_00204_048/
Link REIT
Pension
Property
0.47273552: 地產速報一覽 https://orientaldaily.on.cc/content/產經/odn-20250703-0703_00204_048/
0.49120227: 公居屋成交：銀河苑未補價賣185萬  僅升9.7% https://orientaldaily.on.cc/content/產經/odn-20250703-0703_00204_043/
0.5411675: 二手低價：尚悅特色銀主盤1125萬拍出  7載貶42% https://orientaldaily.on.cc/content/產經/odn-20250703-0703_00204_042/
0.60631007: 港銀或需3年化解地產商信貸風險 https://orientaldaily.on.cc/

## Search for relevant HK Gov news

In [19]:
# For every labelled category, query the HK Gov store with that category's
# reference direction and print the hits.
# NOTE(review): in the saved output of this notebook the printed scores are
# ascending and all below 0.7, so the threshold appears to act as an upper bound
# on a distance (lower = closer) - confirm against the FAISS wrapper in use.
for cat in mdata.Categories.unique():

    print(cat)

    # ev_cat[cat] holds [positive_vectors, negative_vectors]; sum each group
    ev_pos, ev_neg = (np.sum(vectors, axis=0) for vectors in ev_cat[cat])

    # Query direction: positives minus negatives, scaled to unit length
    ev = ev_pos - ev_neg
    ev = ev / np.linalg.norm(ev)

    hits = gia_db.similarity_search_with_score_by_vector(ev, k=10, score_threshold=0.7)
    for doc, score in hits:
        print(str(score) + ': ' + doc.metadata["title"] + " " + doc.metadata["source"])

Elderly
0.6297854: ​立法會十題：打擊濫用公共福利及公屋 https://www.info.gov.hk/gia/general/202507/02/P2025070200297.htm
0.68255764: 立法會七題：優化強制性公積金制度 https://www.info.gov.hk/gia/general/202507/02/P2025070200345.htm
Housing
0.40112507: ​立法會十題：打擊濫用公共福利及公屋 https://www.info.gov.hk/gia/general/202507/02/P2025070200297.htm
0.45401752: 立法會九題：資助出售房屋的轉讓限制措施 https://www.info.gov.hk/gia/general/202507/02/P2025070200384.htm
0.6178713: 房委會收回三個建築地盤 https://www.info.gov.hk/gia/general/202507/02/P2025070200401.htm
0.62069786: 立法會三題：推動民宿發展 https://www.info.gov.hk/gia/general/202507/02/P2025070200411.htm
0.6592362: ​立法會十四題：滲水投訴調查聯合辦事處 https://www.info.gov.hk/gia/general/202507/02/P2025070200455.htm
Land Development
0.51930326: 立法會︰發展局局長就「設立創新體制機制，全速推進『北部都會區核心區』的發展」動議議案開場發言（只有中文） https://www.info.gov.hk/gia/general/202507/02/P2025070200703.htm
0.5913238: ​立法會二十題：跨境海上旅遊 https://www.info.gov.hk/gia/general/202507/02/P2025070200637.htm
0.597106: ​立法會︰發展局局長就「設立創新體制機制，全速推進『北部都會區核心區』的發展」動議議案總結發言（只有中文） https://www.info.gov.hk/gi