In [20]:
# TextRank extractive summarization with ROUGE evaluation

In [21]:
!pip install rouge_score



In [22]:
import numpy as np
import pandas as pd
import re
from rouge_score import rouge_scorer

In [23]:
def split_sentences(text):
    """Split ``text`` into sentences with the terminal punctuation removed.

    A sentence boundary is a run of ``.``, ``!`` or ``?`` that is followed by
    whitespace or by the end of the text. Requiring that look-ahead keeps
    dotted tokens such as ``1.000.000`` or ``vnexpress.net`` inside their
    sentence instead of cutting them into fragments.

    Args:
        text: The string to split.

    Returns:
        A list of non-empty sentences, each stripped of surrounding
        whitespace and of the punctuation that ended it. Empty or
        punctuation-only input yields an empty list.
    """
    # The look-ahead does not consume the whitespace; strip() removes it below.
    parts = re.split(r'[.!?]+(?=\s|$)', text)
    return [part.strip() for part in parts if part.strip()]

In [24]:
def sentence_similarity(s1, s2):
    """Score the lexical overlap between two sentences.

    Each sentence is lower-cased and split on whitespace into a set of unique
    words. The score is the number of shared words divided by
    ``log(|words1| + 1) + log(|words2| + 1)``.

    Args:
        s1: First sentence.
        s2: Second sentence.

    Returns:
        ``0`` when either sentence contains no words, otherwise the
        normalized overlap as a float.
    """
    vocab_a, vocab_b = (set(s.lower().split()) for s in (s1, s2))
    if not vocab_a or not vocab_b:
        return 0
    shared = len(vocab_a.intersection(vocab_b))
    # The +1 keeps the denominator positive even for one-word sentences.
    normalizer = np.log(len(vocab_a) + 1) + np.log(len(vocab_b) + 1)
    return shared / normalizer


In [25]:
def build_similarity_matrix(sentences):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: Sequence of sentence strings.

    Returns:
        An ``(n, n)`` NumPy array where entry ``[i][j]`` holds
        ``sentence_similarity(sentences[i], sentences[j])`` for ``i != j``;
        the diagonal stays at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))
    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            if row == col:
                # A sentence is never compared with itself.
                continue
            matrix[row][col] = sentence_similarity(left, right)
    return matrix

In [26]:
def pagerank(sim_matrix, eps=0.0001, d=0.85, max_iter=1000):
    """Run damped power iteration over a weighted similarity graph.

    Args:
        sim_matrix: Square NumPy array; ``sim_matrix[i][j]`` is the weight of
            the edge from node ``i`` to node ``j``.
        eps: Convergence threshold on the L2 norm of the change between two
            successive score vectors.
        d: Damping factor.
        max_iter: Upper bound on the number of iterations, so the loop also
            terminates when the threshold can never be met (for example
            ``eps <= 0`` or NaN weights).

    Returns:
        A 1-D NumPy array with one score per node. An empty matrix yields an
        empty array.
    """
    n = sim_matrix.shape[0]
    if n == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return np.zeros(0)

    scores = np.ones(n) / n
    row_sum = sim_matrix.sum(axis=1)
    # Rows with no outgoing weight would divide by zero; dividing by 1
    # instead leaves their (all-zero) contribution unchanged.
    row_sum[row_sum == 0] = 1

    for _ in range(max_iter):
        new_scores = (1 - d) / n + d * sim_matrix.T.dot(scores / row_sum)
        converged = np.linalg.norm(new_scores - scores) < eps
        # Keep the newest iterate so the converged vector is what is returned.
        scores = new_scores
        if converged:
            break
    return scores

In [27]:
def textrank(text, top_n=2):
    """Produce an extractive summary of ``text`` from its top-ranked sentences.

    The text is split into sentences, a pairwise similarity matrix is built,
    sentences are scored with ``pagerank``, and the ``top_n`` highest-scoring
    sentences are joined in descending score order.

    Args:
        text: The document to summarize.
        top_n: Maximum number of sentences to keep. Values of zero or below
            produce an empty summary.

    Returns:
        The selected sentences joined with ``". "`` and terminated with
        ``"."``, or ``""`` when there is nothing to select.
    """
    # Without this guard a negative top_n would slice from the end, and
    # top_n == 0 would return a lone ".".
    if top_n <= 0:
        return ""
    sentences = split_sentences(text)
    if len(sentences) == 0:
        return ""
    sim_matrix = build_similarity_matrix(sentences)
    scores = pagerank(sim_matrix)
    # argsort is ascending; reverse it to get the best sentences first.
    ranked = np.argsort(scores)[::-1]
    # Slicing already clamps to the number of available sentences.
    summary = [sentences[i] for i in ranked[:top_n]]
    return ". ".join(summary) + "."


In [28]:
def evaluate_summaries(df):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L for every row of ``df``.

    Args:
        df: DataFrame with a ``"Summarize"`` column (reference summaries) and
            a ``"summary_pred"`` column (generated summaries).

    Returns:
        A list with one entry per row, in row order; each entry is the value
        returned by ``RougeScorer.score(reference, generated)``.
    """
    def _as_text(value):
        # Missing cells arrive as NaN floats, which the scorer cannot
        # tokenize; treat them as empty text so the row is still scored.
        return "" if pd.isna(value) else str(value)

    # NOTE(review): rouge_score's default tokenizer appears to keep only
    # [a-z0-9] characters and the stemmer targets English -- confirm these
    # scores are meaningful for the Vietnamese text in this dataset.
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
    return [
        scorer.score(_as_text(reference), _as_text(generated))
        for reference, generated in zip(df["Summarize"], df["summary_pred"])
    ]


In [29]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
# Location of the article dataset on the mounted drive.
DATA_PATH = "/content/drive/MyDrive/datasets/Nhap_mon_AI/datadone.csv"
# Number of leading rows used for this quick evaluation run.
N_ROWS = 10

df = pd.read_csv(DATA_PATH, encoding='utf-8')

df = df.iloc[:N_ROWS, :]

# A row needs both the article text and a reference summary to be scored.
# The explicit copy makes the column assignment below write to an
# independent frame rather than to a slice of the loaded data.
df = df.dropna(subset=["Content", "Summarize"]).copy()

df["summary_pred"] = df["Content"].apply(lambda x: textrank(str(x), top_n=3))

scores = evaluate_summaries(df)

print(df)
print("\nĐiểm ROUGE:", scores)

                                                Link  \
0  https://vnexpress.net/suat-hoc-va-con-ca-46266...   
1  https://vnexpress.net/du-hoc-de-lao-dong-chui-...   
2  https://vnexpress.net/ha-noi-co-rung-4625687.html   
3  https://vnexpress.net/quyen-tu-quyet-rut-bao-h...   
4  https://vnexpress.net/tro-choi-chung-minh-tai-...   
5  https://vnexpress.net/do-tien-xuong-bien-46243...   
6  https://vnexpress.net/gianh-viec-tu-tay-robot-...   
7  https://vnexpress.net/kiem-tien-tu-dat-4623773...   
8  https://vnexpress.net/streamer-chan-dat-462346...   
9  https://vnexpress.net/nguoi-nuoc-ngoai-mua-nha...   

                             Title                Category             Author  \
0               Suất học và con cá     Giáo dục & tri thức       Võ Nhật Vinh   
1        Du học để lao động 'chui'     Giáo dục & tri thức   Nguyễn Nam Cường   
2                   Hà Nội có rừng  Chính trị & chính sách  Phạm Hoàng Phương   
3    Quyền tự quyết 'rút bảo hiểm'  Chính trị & chính sách 