<a href="https://colab.research.google.com/github/maianhtran2005/Project_AI/blob/main/Clean_chunked_filtered.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lấy dữ liệu

In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive so the dataset stored under "My Drive/AI" is reachable.
drive.mount('/content/drive')

# Read only the first 3 rows: enough to inspect the schema without loading the whole file.
df = pd.read_csv("/content/drive/My Drive/AI/data_cleaned.csv", nrows=3)

# A notebook cell renders only its last expression, so the preview has to be
# displayed explicitly; a bare `df.head(10)` in the middle of the cell is discarded.
display(df.head(10))
df.columns

Mounted at /content/drive


Index(['input', 'output', 'sentences', 'input_sentences', 'ratio'], dtype='object')

# Làm sạch

In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# === Load the raw dataset ===
df = pd.read_csv("/content/drive/My Drive/AI/data_cleaned.csv")

# === Load the stop-word list: one entry per line, blank lines skipped ===
stopwords = set()
with open("/content/drive/My Drive/AI/vietnamese-stopwords.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        entry = raw_line.strip()
        if entry:
            stopwords.add(entry)

# === Hàm làm sạch ngôn ngữ báo chí ===
def clean_text(text):
    """Strip markup and boilerplate from one raw news article.

    Steps, in order: remove HTML tags (dropping non-content elements entirely),
    remove credit / related-link lines, remove URLs, e-mail addresses and phone
    numbers, replace characters outside the allowed set with spaces, and
    collapse whitespace.

    Args:
        text: Raw article text; may be NaN/None or a non-string value.

    Returns:
        The cleaned text as a single-line string, or "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""

    # 1. Parse as HTML, drop elements that never carry article content, keep the text.
    soup = BeautifulSoup(str(text), "lxml")
    for tag in soup(["script", "style", "noscript", "iframe", "footer", "header", "nav"]):
        tag.decompose()
    text = soup.get_text(separator=" ")

    # 2. Remove whole lines that START with a credit / related-content keyword
    #    ("Nguồn ...", "Theo ...", "Ảnh ...", ...). The pattern is anchored to the
    #    beginning of a line: words such as "theo" or "ảnh" are very common inside
    #    ordinary sentences, and an unanchored match would delete everything from
    #    their first occurrence to the end of the line.
    # NOTE(review): a text that consists of a single line beginning with one of
    # these keywords is still removed entirely - confirm that this is acceptable.
    text = re.sub(
        r"(?im)^[ \t]*(?:nguồn|theo|tác giả|tin liên quan|ảnh|video|đọc thêm|bài liên quan)\b.*",
        "",
        text,
    )

    # 3. Remove URLs, e-mail addresses and phone numbers.
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"\b[\w.+-]+@[\w.-]+\.\w+\b", "", text)
    # Phone numbers: a +84 / 84 / 0 prefix followed by 8-10 more digits, optionally
    # separated by "-", "." or whitespace. The look-arounds keep the match from
    # starting or ending inside a longer number, so ordinary figures such as
    # "10.000" or "1.000.000.000" are left untouched.
    text = re.sub(
        r"(?<![\w.,])(?<!\d\s)(?:\+?84|0)(?:[-.\s]?\d){8,10}(?!\d)",
        "",
        text,
    )

    # 4. Replace every character that is not an ASCII letter/digit, a character in
    #    the À-ỹ range, basic punctuation or whitespace with a space, then collapse
    #    runs of whitespace into single spaces.
    text = re.sub(r"[^0-9A-Za-zÀ-ỹ.,?!:;\"'()\-\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# === Hàm loại bỏ từ dừng ===
def remove_stopwords(text, stopword_set=None):
    """Drop stop-words from a whitespace-tokenised string.

    Tokens are compared case-insensitively (lower-cased) against the stop-word
    set; the surviving tokens keep their original casing and order.

    Args:
        text: Input string; it is split on whitespace.
        stopword_set: Optional collection of lower-case stop-words. When omitted,
            the module-level ``stopwords`` set is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stopword_set is None:
        # Fall back to the set loaded from the stop-word file earlier in the notebook.
        stopword_set = stopwords
    return " ".join(token for token in text.split() if token.lower() not in stopword_set)

# === Clean every article in the `input` column ===
df["input_cleaned"] = df["input"].apply(clean_text)

# === Keep only the columns needed downstream and write them to Drive ===
output_path = "/content/drive/My Drive/AI/data_cleaned_processed.csv"
df_cleaned = df.loc[:, ["input_cleaned", "output", "ratio"]]
df_cleaned.to_csv(output_path, encoding="utf-8-sig", index=False)
print(f" File kết quả: {output_path}")

 File kết quả: /content/drive/My Drive/AI/data_cleaned_processed.csv


# Chuẩn hóa Unicode và tách câu

In [None]:
!pip install underthesea

Collecting underthesea
  Downloading underthesea-8.3.0-py3-none-any.whl.metadata (14 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea_core==1.0.5 (from underthesea)
  Downloading underthesea_core-1.0.5-cp312-cp312-manylinux2010_x86_64.whl.metadata (1.4 kB)
Downloading underthesea-8.3.0-py3-none-any.whl (8.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea_core-1.0.5-cp312-cp312-manylinux2010_x86_64.whl (978 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.4/978.4 kB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.3 MB/s[0m eta [36m0:0

In [None]:
import pandas as pd
import unicodedata
import re
from underthesea import sent_tokenize
from tqdm import tqdm

# === Đọc file đã làm sạch ===
df = pd.read_csv("/content/drive/My Drive/AI/data_cleaned_processed.csv")

# === Hàm chuẩn hóa Unicode ===
def normalize_and_fix_spacing(text):
    """Normalise a string to NFC and repair sentence spacing.

    Args:
        text: Any value; NaN/None is treated as missing, everything else is
            converted with ``str``.

    Returns:
        The NFC-normalised text with a space inserted after ".", "!" or "?"
        when a letter follows immediately, and with runs of whitespace
        collapsed to single spaces. Returns "" for missing input.
    """
    if pd.isna(text):
        return ""
    text = unicodedata.normalize("NFC", str(text))
    # Insert the space only when the punctuation mark is glued to a LETTER.
    # [^\W\d_] matches a word character that is neither a digit nor "_", so
    # numbers ("10.000", "3.5"), ellipses ("...") and closing quotes or brackets
    # after the mark are left intact.
    text = re.sub(r"([.!?])(?=[^\W\d_])", r"\1 ", text)
    # Collapse redundant whitespace.
    text = re.sub(r"\s+", " ", text).strip()
    return text
# === Hàm tách câu ===
def sentence_split(text):
    """Segment ``text`` with ``sent_tokenize`` and re-join the pieces with single
    spaces, so the result still occupies one line."""
    return " ".join(sent_tokenize(text))

# === Normalise and sentence-split both text columns ===
tqdm.pandas()
for column in ("input_cleaned", "output"):
    normalized = df[column].progress_apply(normalize_and_fix_spacing)
    df[column] = normalized.progress_apply(sentence_split)

# === Write the result back to Drive ===
output_path = "/content/drive/My Drive/AI/data_cleaned_final.csv"
df.to_csv(output_path, encoding="utf-8-sig", index=False)

print(f"File kết quả: {output_path}")


100%|██████████| 42833/42833 [00:04<00:00, 9512.86it/s]
100%|██████████| 42833/42833 [00:08<00:00, 4832.91it/s]
100%|██████████| 42833/42833 [00:12<00:00, 3343.76it/s]
100%|██████████| 42833/42833 [00:21<00:00, 2022.39it/s]


File kết quả: /content/drive/My Drive/AI/data_cleaned_final.csv


# Chuyển đổi sang file jsonl để train

In [None]:
import pandas as pd
import json
from tqdm import tqdm

# Minimum lengths (in characters) a pair must exceed to be kept for training.
MIN_INPUT_CHARS = 30
MIN_OUTPUT_CHARS = 10

# === Load the final cleaned dataset ===
df = pd.read_csv("/content/drive/My Drive/AI/data_cleaned_final.csv")

# === Handle missing values first ===
# Empty cells come back from the CSV as NaN. Filling them before the length
# filter guarantees the .str accessor always operates on strings (an all-NaN
# column would otherwise be float and make .str raise); the empty strings are
# then removed by the filter below.
df = df.fillna({"input_cleaned": "", "output": ""})

# === Drop pairs whose input or output is too short ===
long_enough = (df["input_cleaned"].str.len() > MIN_INPUT_CHARS) & (
    df["output"].str.len() > MIN_OUTPUT_CHARS
)
df = df[long_enough]

# === Hàm chuyển 1 dòng sang JSONL ===
def make_record(row):
    """Build one training record from a row.

    Reads ``row["input_cleaned"]`` and ``row["output"]``, converts both to
    strings and strips surrounding whitespace. The completion is prefixed with
    a single space.

    Returns:
        A dict with the keys "prompt" and "completion".
    """
    source_text, target_text = (
        str(row[column]).strip() for column in ("input_cleaned", "output")
    )
    return dict(prompt=source_text, completion=f" {target_text}")

# === Write one JSON object per line (non-ASCII characters kept as-is) ===
with open("/content/drive/My Drive/AI/train_data.jsonl", "w", encoding="utf-8") as f:
    for _, row in tqdm(df.iterrows(), total=len(df)):
        serialized = json.dumps(make_record(row), ensure_ascii=False)
        f.write(serialized + "\n")

100%|██████████| 38302/38302 [00:13<00:00, 2879.88it/s]
