In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-word-segmentation/ws_test.txt
/kaggle/input/nlp-word-segmentation/LST20 Brief Specification.pdf
/kaggle/input/nlp-word-segmentation/ws_list.txt
/kaggle/input/nlp-word-segmentation/LST20 Annotation Guideline.pdf
/kaggle/input/nlp-word-segmentation/ws_sample_submission.csv


In [4]:
pip install pythainlp

Collecting pythainlp
  Downloading pythainlp-5.1.0-py3-none-any.whl.metadata (8.0 kB)
Downloading pythainlp-5.1.0-py3-none-any.whl (19.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pythainlp
Successfully installed pythainlp-5.1.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install attacut

Collecting attacut
  Downloading attacut-1.0.6-py3-none-any.whl.metadata (4.0 kB)
Collecting docopt>=0.6.2 (from attacut)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fire>=0.1.3 (from attacut)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nptyping>=0.2.0 (from attacut)
  Downloading nptyping-2.5.0-py3-none-any.whl.metadata (7.6 kB)
Collecting ssg>=0.0.4 (from attacut)
  Downloading ssg-0.0.8-py3-none-any.whl.metadata (762 bytes)
Collecting python-crfsuite>=0.9.6 (from ssg>=0.0.4->attacut)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading attacut-1.0.6-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.7 MB/s[0

In [8]:
import pandas as pd
from pythainlp.tokenize import word_tokenize
from tqdm import tqdm

def add_word_tags(text):
    words = word_tokenize(text, engine="attacut")
    tagged_tokens = []
    for word in words:
        if word.strip() == "":
            continue
        word_len = len(word)
        for i in range(word_len):
            if i == 0:
                tagged_tokens.append('B_WORD') 
            elif i == word_len - 1:
                tagged_tokens.append('E_WORD')  
            else:
                tagged_tokens.append('I_WORD')  
    return tagged_tokens

def process_and_write_to_csv(input_file, output_file):
    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            text = file.read()
    except Exception as e:
        print(f"เกิดข้อผิดพลาดในการอ่านไฟล์: {e}")
        return None
    
    chars = list(text)
    is_space = [c.isspace() for c in chars]
    
    non_space_chars = []
    original_positions = []
    
    for i, (char, space) in enumerate(zip(chars, is_space)):
        if not space:
            non_space_chars.append(char)
            original_positions.append(i + 1)
            
    non_space_text = ''.join(non_space_chars)
    
    chunks = []
    positions_chunks = []
    chunk_size = 10000
    
    for i in range(0, len(non_space_text), chunk_size):
        end = min(i + chunk_size, len(non_space_text))
        chunks.append(non_space_text[i:end])
        positions_chunks.append(original_positions[i:end])
    
    all_tags = []
    all_positions = []
    
    for chunk_idx, (chunk, positions) in enumerate(zip(chunks, positions_chunks)):
        words = word_tokenize(chunk, engine="attacut")
        
        char_idx = 0
        for word in words:
            if len(word) == 1:
                all_tags.append("B_WORD")
                all_positions.append(positions[char_idx])
                char_idx += 1
            else:
                all_tags.append("B_WORD")
                all_positions.append(positions[char_idx])
                char_idx += 1
                
                for _ in range(len(word) - 2):
                    all_tags.append("I_WORD")
                    all_positions.append(positions[char_idx])
                    char_idx += 1
                
                all_tags.append("E_WORD")
                all_positions.append(positions[char_idx])
                char_idx += 1
    
    if len(all_tags) != len(non_space_chars):
        if len(all_tags) < len(non_space_chars):
            missing = len(non_space_chars) - len(all_tags)
            for i in range(len(all_tags), len(non_space_chars)):
                all_tags.append("B_WORD")
                all_positions.append(original_positions[i])
        else:
            all_tags = all_tags[:len(non_space_chars)]
            all_positions = all_positions[:len(non_space_chars)]
    
    submission_df = pd.DataFrame({
        'Id': all_positions,
        'Predicted': all_tags
    })
    
    submission_df = submission_df.sort_values(by='Id').reset_index(drop=True)
    
    submission_df.to_csv(output_file, index=False)
    
    print(f"บันทึกไฟล์ {output_file} สำเร็จ")
    print(f"จำนวนแถว: {len(submission_df)}")
    
    tag_counts = submission_df['Predicted'].value_counts()
    for tag, count in tag_counts.items():
        print(f"{tag}: {count} ({count/len(submission_df)*100:.2f}%)")
    
    return submission_df

input_file = '/kaggle/input/nlp-word-segmentation/ws_test.txt'
output_file = '/kaggle/working/submissionnn.csv'

process_and_write_to_csv(input_file, output_file)

print('เสร็จ')

บันทึกไฟล์ /kaggle/working/submissionnn.csv สำเร็จ
จำนวนแถว: 35182
I_WORD: 19827 (56.36%)
B_WORD: 7785 (22.13%)
E_WORD: 7570 (21.52%)
เสร็จ
