In [1]:
import warnings
warnings.simplefilter("ignore")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle

In [3]:
from pythainlp.util import normalize
from pythainlp import thai_characters
from pythainlp.corpus import thai_stopwords
from pythainlp.tokenize import word_tokenize

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline # Import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [5]:
df = pd.read_csv('prachathai-67k.csv')

In [6]:
benchmark_labels = ['การเมือง','สิทธิมนุษยชน','คุณภาพชีวิต','ต่างประเทศ','สังคม',
                    'สิ่งแวดล้อม','เศรษฐกิจ','วัฒนธรรม','แรงงาน','ความมั่นคง','ไอซีที','การศึกษา']

In [7]:
def expand_list_to_columns(row):
    classes = {c: int(c in row['labels']) for c in benchmark_labels}
    return pd.Series(classes)

In [8]:
new_cols = df.apply(expand_list_to_columns, axis=1)
df = pd.concat([df, new_cols], axis=1)

In [9]:
column_mapping = {
    "การเมือง": "politics", "สิทธิมนุษยชน": "human_rights", "คุณภาพชีวิต": "quality_of_life",
    "ต่างประเทศ": "foreign_affairs", "สังคม": "society", "สิ่งแวดล้อม": "environment",
    "เศรษฐกิจ": "economy", "วัฒนธรรม": "culture", "แรงงาน": "labor",
    "ความมั่นคง": "security", "ไอซีที": "ict", "การศึกษา": "education"
}

In [10]:
df = df.rename(columns=column_mapping)

In [11]:
df['full_body_text'] = df['title'] + df['body_text']
target_col = list(column_mapping.values())
df = df[['full_body_text'] + target_col]

In [12]:
URL_PATTERN = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
SPECIAL_CHARS_PATTERN = r'[!@#$%^&*()\-+=\[\]{};:\'",<.>/?\\|]\n'
WORDS_TO_REMOVE = ['ประชาไท', 'ฯ', '๑', '๒', '๓', '๔', '๕', '๖', '๗', '๘', '๙', '๐']
THAI_STOPWORDS = thai_stopwords()
THAI_CHARACTERS_ONLY = "".join(list(thai_characters))

In [13]:
def custom_thai_tokenizer(text):

    # 1. Normalize and clean the text
    text = normalize(text)
    text = re.sub(r'<[^>]+>', '', text) # Remove HTML
    text = re.sub(URL_PATTERN, '', text) # Remove URLs
    for word in WORDS_TO_REMOVE:
        text = text.replace(word, '')
    text = re.sub(SPECIAL_CHARS_PATTERN, '', text) # Remove special chars
    
    # 2. Keep only Thai characters and spaces
    allowed_chars_pattern = f'[^{THAI_CHARACTERS_ONLY}\s]'
    text = re.sub(allowed_chars_pattern, '', text)
    
    # 3. Tokenize text
    tokens = word_tokenize(text, keep_whitespace=False)
    
    # 4. Remove stopwords
    filtered_tokens = [token for token in tokens if token not in THAI_STOPWORDS and token.strip() != '']
    
    return filtered_tokens

In [14]:
X = df['full_body_text']
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
pipelines = {}
metrics = {}

In [16]:
for column in target_col:
    print(f"--- Training model for: {column} ---")
    
    # Define the pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            tokenizer=custom_thai_tokenizer,
            max_features=200,
            token_pattern=None # Let the custom tokenizer handle everything
        )),
        ('clf', LogisticRegression(
            solver='lbfgs',
            max_iter=5000
        ))
    ])
    
    # Fit the pipeline on the training data for the current label
    pipeline.fit(X_train, y_train[column])
    
    # Store the entire fitted pipeline
    pipelines[column] = pipeline
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    
    # Evaluation
    accuracy = accuracy_score(y_test[column], y_pred)
    precision = precision_score(y_test[column], y_pred)
    recall = recall_score(y_test[column], y_pred)
    aucroc = roc_auc_score(y_test[column], y_proba)
    
    metrics[column] = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'aucroc': aucroc}

--- Training model for: politics ---
--- Training model for: human_rights ---
--- Training model for: quality_of_life ---
--- Training model for: foreign_affairs ---
--- Training model for: society ---
--- Training model for: environment ---
--- Training model for: economy ---
--- Training model for: culture ---
--- Training model for: labor ---
--- Training model for: security ---
--- Training model for: ict ---
--- Training model for: education ---


In [17]:
metrics_df = pd.DataFrame.from_dict(metrics, orient='index')
print("\nModel Performance Metrics:")
print(metrics_df)


Model Performance Metrics:
                 accuracy  precision    recall    aucroc
politics         0.729268   0.767987  0.764870  0.799957
human_rights     0.826779   0.702520  0.379608  0.821566
quality_of_life  0.852703   0.651456  0.290225  0.839945
foreign_affairs  0.912137   0.717993  0.489098  0.914446
society          0.868537   0.562963  0.042175  0.761955
environment      0.894314   0.583463  0.242542  0.865515
economy          0.927235   0.620525  0.238751  0.889854
culture          0.936294   0.628906  0.172932  0.870061
labor            0.970393   0.808581  0.631443  0.961830
security         0.948078   0.642487  0.163158  0.852279
ict              0.962439   0.695402  0.209343  0.911095
education        0.963028   0.593583  0.206704  0.900011


In [19]:
for column, pipeline in pipelines.items():
    print(f"Saving pipeline for: {column}")
    filepath = f'./BE/MODEL/{column}_pipeline.pickle'
    with open(filepath, 'wb') as file:
        pickle.dump(pipeline, file)

Saving pipeline for: politics
Saving pipeline for: human_rights
Saving pipeline for: quality_of_life
Saving pipeline for: foreign_affairs
Saving pipeline for: society
Saving pipeline for: environment
Saving pipeline for: economy
Saving pipeline for: culture
Saving pipeline for: labor
Saving pipeline for: security
Saving pipeline for: ict
Saving pipeline for: education
