# Kết nối Drive

In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Đọc dữ liệu

In [None]:
import pandas as pd

# Input TSV that the tag-filtering step reads (loaded without a header row).
file_path = "/content/drive/MyDrive/Data Mining Field-oriented/consolidated/tokenized_articles_training.tsv"
# Output TSV that the tag-filtering step writes its result to.
output_path = "/content/drive/MyDrive/Data Mining Field-oriented/consolidated/cleaned_articles_training.tsv"

# Lọc tags không xuất hiện trong content

## Tiến hành lọc tags

In [None]:
import pandas as pd

def filter_tags(row):
    """Return the tags of ``row`` that literally occur in its content.

    Matching is a case-insensitive substring test. Underscores are treated as
    spaces on BOTH sides, so an underscore-joined tag can match
    underscore-joined content.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'content'``
            string and a ``'tags'`` string of comma-separated tags.

    Returns:
        The surviving tags joined by ``','``, in their original order and
        original spelling (surrounding whitespace stripped). Returns ``''``
        when either field is not a string (e.g. NaN) or nothing matches.
    """
    content = row['content']
    tags = row['tags']
    # Missing cells arrive as NaN (a float); there is nothing to match then.
    if not isinstance(content, str) or not isinstance(tags, str):
        return ''

    def normalize(text):
        return text.replace('_', ' ').lower()

    content_norm = normalize(content)

    kept = []
    for raw_tag in tags.split(','):
        tag = raw_tag.strip()
        key = normalize(tag)
        # An empty key would match any content ('' is a substring of every
        # string), so blank entries such as "a,,b" must be skipped explicitly.
        if key and key in content_norm:
            # Append the tag itself rather than going through a lower-case
            # lookup table, which would collapse tags differing only by case.
            kept.append(tag)

    return ','.join(kept)

# Load the tab-separated articles, reading the file without a header row and
# naming the two columns afterwards.
df = pd.read_csv(file_path, header=None, sep='\t')
df.columns = ['content', 'tags']

# Replace missing cells with empty strings so filter_tags can call string
# methods on every row.
for column in ('content', 'tags'):
    df[column] = df[column].fillna('')

# Row-wise pass: compute, for each article, the tags that occur in its content.
df['filtered_tags'] = df.apply(filter_tags, axis=1)

## Xuất file kết quả

In [None]:
# Swap the original tags for the filtered ones before saving. The guard makes
# this cell safe to re-run: once 'filtered_tags' has been renamed to 'tags',
# dropping 'tags' again would throw away the filtered result and the file
# would be written with only the 'content' column.
if 'filtered_tags' in df.columns:
    df = df.drop(columns=['tags']).rename(columns={'filtered_tags': 'tags'})

df.to_csv(output_path, sep="\t", index=False)

# Xử lý nhiễu

## Xử lý thông số đặc biệt

In [None]:
# Regular expressions for noisy spans that are deleted from article text.
# They are applied one after another in the order listed here.
patterns = {
    # dd/mm/yyyy or mm/dd/yyyy
    'date': r'\b(?:0[1-9]|[12][0-9]|3[01])/(?:0[1-9]|1[0-2])/\d{4}\b|\b(?:0[1-9]|1[0-2])/(?:0[1-9]|[12][0-9]|3[01])/\d{4}\b',
    # runs of 6+ digits, optionally followed by .ddd / ,ddd groups
    'long_numbers': r'\b\d{6,}(?:[.,]\d{3})*\b(?![A-Za-z])',
    # 'isbn_pattern': r'\b(?:\d{9}[\dXx]|\d{13})\b',
    # 'ip': r'\b(?:\d{1,3}\.){3}\d{1,3}\b|\b(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}\b', # For later purposes
    # Number with optional decimal part followed by '%'. There is deliberately
    # no trailing \b: '%' is not a word character, so '%\b' only matches when a
    # letter/digit follows and would miss the usual "50% " or "50%." cases.
    'percentage': r'\b\d+(?:[.,]\d+)?%',
    # 9h, 9h30, 09:30, and N+M forms such as 90+3
    'time': r'\b\d{1,2}h(?:\d{2})?\b|\b\d{1,2}:\d{2}\b|\b\d{1,2}\+\d{1,2}\b',
    # "Ảnh:" / "Nguồn:" credit followed by one (optionally @-prefixed) word
    'comments': r'(Ảnh|Nguồn)\s*:\s*(?:@\s*)?\w+',
    'mail' : r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    'url' : r'\b(?:https?://|www\.)[^\s/$.?#].[^\s]*\b'
}

In [None]:
import re

def remove_patterns(text, patterns):
    """Delete every match of every regex in ``patterns`` from ``text``.

    Args:
        text: The text to clean. ``None`` and NaN are treated as empty text;
            any other non-string value is converted with ``str()``.
        patterns: Mapping whose values are regular-expression strings. The
            keys are ignored; the values are applied in iteration order.

    Returns:
        The text with all matches replaced by the empty string.
    """
    # NaN is the only float that is not equal to itself. Missing cells read
    # from a CSV arrive as NaN, and re.sub would raise TypeError on them.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    for pattern in patterns.values():
        text = re.sub(pattern, '', text)
    return text

In [None]:
# Re-read the tag-filtered file as plain text. dtype=str together with
# keep_default_na=False keeps empty cells as '' (and literal words such as
# "NA" or "null" as text) instead of NaN, which is a float and cannot be
# passed to regex substitution.
cleaned_path = '/content/drive/MyDrive/Data Mining Field-oriented/consolidated/cleaned_articles_training.tsv'
df = pd.read_csv(cleaned_path, sep='\t', dtype=str, keep_default_na=False)
content_list = df['content'].tolist()

# Strip every configured noise pattern from each article body.
cleaned_content_list = df['content'].apply(lambda x: remove_patterns(x, patterns))
tags_list = df['tags'].tolist()

# Write the result back over the same file.
cleaned_df = pd.DataFrame({'content': cleaned_content_list, 'tags': tags_list})
cleaned_df.to_csv(cleaned_path, sep='\t', index=False)

## Xử lý ký tự đặc biệt

In [None]:
import re

# Characters deleted outright from the text. The backslash is written as an
# explicit '\\' (the earlier "\]" spelling was an invalid escape sequence that
# merely happened to produce the same two characters). The full stop and the
# ellipsis are appended so a single translate pass removes everything.
_CHARS_TO_DELETE = "!#$%&’()*+-/<=>?@[\\]^`{|}~“”\":±'" + ".…"
_DELETE_TABLE = str.maketrans("", "", _CHARS_TO_DELETE)

# Whitespace that directly precedes a punctuation mark.
_SPACE_BEFORE_PUNCT = re.compile(r'\s+([!#$%&’()*+\-/<=>?@\[\]^`{|}~“”\":.,;])')


def remove_special_char(text):
    """Remove special characters from ``text``.

    First, whitespace directly in front of a punctuation mark is dropped, so
    that deleting the mark afterwards does not leave a doubled space. Then
    every character in ``_CHARS_TO_DELETE`` is deleted. Commas and semicolons
    are kept.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    text = _SPACE_BEFORE_PUNCT.sub(r'\1', text)
    return text.translate(_DELETE_TABLE).strip()

## Tách câu

In [None]:
def process_paragraph(paragraph):
    """Clean one paragraph sentence by sentence.

    The paragraph is split on ``'. '``, each piece is passed through
    ``remove_special_char``, and the pieces are joined back with ``'. '``.

    Args:
        paragraph: The paragraph text. ``None`` and NaN are treated as an
            empty paragraph; any other non-string value is converted with
            ``str()``.

    Returns:
        The cleaned paragraph as a string.
    """
    # NaN is the only float that is not equal to itself. Converting it with
    # str() would inject the literal text "nan" into the data.
    if paragraph is None or (isinstance(paragraph, float) and paragraph != paragraph):
        return ''
    if not isinstance(paragraph, str):
        paragraph = str(paragraph)

    sents = paragraph.split('. ')
    return '. '.join(remove_special_char(sent) for sent in sents)

def process_paragraphs(content_list):
    """Run ``process_paragraph`` on each item of ``content_list``.

    Returns a new list holding the results in the same order as the input.
    """
    cleaned = []
    for paragraph in content_list:
        cleaned.append(process_paragraph(paragraph))
    return cleaned

def find_matches(text, pattern):
    """Collect all non-overlapping matches of ``pattern`` in ``text``.

    Returns:
        A ``(count, matches)`` tuple, where ``matches`` is the list produced
        by ``findall`` and ``count`` is its length.
    """
    found = re.compile(pattern).findall(text)
    return len(found), found

# Xuất file tiền xử lý

In [None]:
# Re-read the cleaned file as plain text. dtype=str together with
# keep_default_na=False keeps empty cells as '' (and literal words such as
# "NA" or "null" as text) instead of turning them into NaN before the text
# processing below.
cleaned_path = '/content/drive/MyDrive/Data Mining Field-oriented/consolidated/cleaned_articles_training.tsv'
df = pd.read_csv(cleaned_path, sep='\t', dtype=str, keep_default_na=False)
content_list = df['content'].tolist()

# Sentence-level special-character removal for every article.
cleaned_content_list = process_paragraphs(content_list)
tags_list = df['tags'].tolist()

# Write the result back over the same file.
cleaned_df = pd.DataFrame({'content': cleaned_content_list, 'tags': tags_list})
cleaned_df.to_csv(cleaned_path, sep='\t', index=False)