### **1. Import libraries**

In [None]:
import json
import pandas as pd

### **2. Load the data**

In [None]:

# Read the raw JSON dump. The encoding is passed explicitly so the Vietnamese
# text is decoded as UTF-8 regardless of the platform's default locale.
with open('/kaggle/input/vietnamese-online-news-dataset/news_dataset.json', encoding='utf-8') as f:
    data = json.load(f)

# Turn the parsed JSON records into a flat, tabular DataFrame.
df = pd.json_normalize(data)

### **3. Inspect the data**

In [None]:
# Print a concise summary of the loaded DataFrame (columns, dtypes, non-null counts)
# to spot missing values before cleaning.
df.info()

### **4. Text cleaning (Content, Title)**

In [None]:
from IPython.display import clear_output

!pip install underthesea
!pip install swifter
!pip install -U ipywidgets
clear_output()

In [None]:
import re
import unicodedata

# clean_text() calls word_tokenize, so it must be imported in this cell; relying
# on the import in the later tokenization cell raises NameError on a fresh
# "Restart & Run All".
from underthesea import word_tokenize

stop_words = pd.read_csv('/kaggle/input/stopwords/vietnamese-stopwords-dash.txt', header=None)

# Build the stop-word lookup once. A set gives constant-time membership tests,
# instead of converting the column to a list again for every single token.
stop_word_set = set(stop_words[0])

# Matches every run of characters that is neither a space nor a letter in the
# a-z / A-Z / À-ỹ ranges (digits, punctuation, newlines, tabs, ...).
_NON_LETTER_RE = re.compile(r'[^a-zA-ZÀ-ỹ ]+')


def clean_text(text):
    """Normalize a raw text field and strip stop words from it.

    Steps: lowercase, NFKC Unicode normalization, replace every run of
    non-letter characters with a space, word-segment with underthesea, then
    drop tokens found in the stop-word list.

    Args:
        text: Raw text. Non-string values (e.g. None or NaN for a missing
            field) are treated as empty text.

    Returns:
        A single string of the remaining tokens joined by spaces (not a list).
    """
    # Missing values would otherwise crash on .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = unicodedata.normalize('NFKC', text)

    # Substitute a space rather than the empty string so that words separated
    # only by punctuation or a newline ("a,b", "a\nb") are not fused together.
    text = _NON_LETTER_RE.sub(' ', text)

    # NOTE(review): format='text' is expected to return one string in which the
    # syllables of a multi-syllable word are joined by underscores, matching the
    # "dash" stop-word file — confirm against the underthesea documentation.
    tokens = word_tokenize(text, format='text').split()

    return ' '.join(token for token in tokens if token not in stop_word_set)


# Apply the cleaning function to both text columns.
df['cleaned_content'] = df['content'].apply(clean_text)
df['cleaned_title'] = df['title'].apply(clean_text)

### **5. Word tokenization**

In [None]:
from underthesea import word_tokenize
import swifter  # imported for the `.swifter` accessor used on the Series below

# Tokenize each cleaned text column into its matching "tokenized_*" column.
for cleaned_col, tokenized_col in (
    ('cleaned_content', 'tokenized_content'),
    ('cleaned_title', 'tokenized_title'),
):
    df[tokenized_col] = df[cleaned_col].swifter.apply(word_tokenize)

### **6. Save preprocessed data**


In [None]:
# Write the processed DataFrame to a CSV file in the Kaggle working directory.
from pathlib import Path

# Create the output folder (and any missing parents); a no-op if it already exists.
output_dir = Path('/kaggle/working/preprocessed_data/')
output_dir.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame index out of the file.
csv_path = '/kaggle/working/preprocessed_data/processed_data.csv'
df.to_csv(csv_path, index=False)
