In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root_dir, _subdirs, names_in_dir in os.walk('/kaggle/input'):
    for entry_name in names_in_dir:
        full_path = os.path.join(root_dir, entry_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd

# Helpers for reading the tab-separated token files of one dataset split.
def _natural_sort_key(file_name):
    """Return a sort key that compares digit runs in ``file_name`` by numeric value."""
    import re  # local import keeps this cell self-contained

    # re.split with a capturing group alternates text / digit chunks: even
    # positions are always text and odd positions are always digits, so two
    # keys never compare a str against an int.
    chunks = re.split(r"(\d+)", file_name)
    numeric_aware = [int(chunk) if index % 2 else chunk
                     for index, chunk in enumerate(chunks)]
    # The raw name breaks ties such as "01.txt" vs "1.txt" deterministically.
    return (numeric_aware, file_name)


def read_data_from_folder(folder_path):
    """Read every ``.txt`` file in ``folder_path`` and return its rows.

    Files are visited in natural (numeric-aware) order, so ``2.txt`` comes
    before ``10.txt``. Each non-blank line is stripped and split on tabs:

    * 4 fields are kept as-is;
    * 3 fields get the default tag ``"O"`` inserted at index 2;
    * any other field count is reported with ``print`` and skipped.

    Args:
        folder_path: Directory containing the tab-separated ``.txt`` files.

    Returns:
        A list of 4-element lists of strings, in file order then line order.
    """
    data = []
    # Sort numerically rather than lexicographically so row order follows
    # the file numbering.
    file_names = sorted(os.listdir(folder_path), key=_natural_sort_key)
    for file_name in file_names:
        if not file_name.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, file_name)

        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                line = line.strip()
                if not line:  # skip blank lines
                    continue
                parts = line.split("\t")
                if len(parts) == 4:
                    data.append(parts)
                elif len(parts) == 3:
                    # No tag column: fill in the default tag "O" at index 2.
                    parts.insert(2, "O")
                    data.append(parts)
                else:
                    print(f"Invalid line in {file_name}: {line}")
    return data

# Collect the rows of one dataset split and persist them as a CSV file.
def process_and_save_data(input_folder, output_file):
    """Read all rows under ``input_folder`` and write them to ``output_file`` as CSV.

    The CSV has the columns ``word``, ``pos``, ``tag`` and ``class`` and no
    index column. A confirmation message is printed after writing.
    """
    column_names = ["word", "pos", "tag", "class"]
    rows = read_data_from_folder(input_folder)
    pd.DataFrame(rows, columns=column_names).to_csv(
        output_file, index=False, encoding="utf-8"
    )
    print(f"Saved processed data to {output_file}")

# Dataset folders (adjust these to match the Kaggle input layout)
train_folder = "/kaggle/input/super-ai-ss-5-named-entity-recognition/train/train"
test_folder = "/kaggle/input/super-ai-ss-5-named-entity-recognition/test/test"
eval_folder = "/kaggle/input/super-ai-ss-5-named-entity-recognition/eval/eval"

# Save each split as a CSV file, in the order train -> eval -> test
for split_folder, csv_name in (
    (train_folder, "train_data.csv"),
    (eval_folder, "eval_data.csv"),
    (test_folder, "test_data.csv"),
):
    process_and_save_data(split_folder, csv_name)


In [None]:
%pip install simpletransformers

In [None]:
import pandas as pd

# Load the CSV files written earlier.
# Every column holds text, so pandas' dtype and missing-value inference is
# switched off: with the defaults, tokens such as "NA", "null" or "nan" are
# parsed as NaN instead of being kept as words.
_csv_read_options = {"dtype": str, "keep_default_na": False}
train_data = pd.read_csv('train_data.csv', **_csv_read_options)
eval_data = pd.read_csv('eval_data.csv', **_csv_read_options)
test_data = pd.read_csv('test_data.csv', **_csv_read_options)

# Preview the first rows of each split
print(train_data.head())
print(eval_data.head())
print(test_data.head())

In [None]:
# Tag names listed in id order: the position of a name is its integer id.
_TAG_NAMES = [
    'O',
    'B_ORG', 'B_PER', 'B_LOC', 'B_MEA',
    'I_DTM', 'I_ORG', 'E_ORG', 'I_PER',
    'B_TTL', 'E_PER', 'B_DES', 'E_LOC',
    'B_DTM', 'B_NUM', 'I_MEA', 'E_DTM',
    'E_MEA', 'I_LOC', 'I_DES', 'E_DES',
    'I_NUM', 'E_NUM', 'B_TRM', 'B_BRN',
    'I_TRM', 'E_TRM', 'I_TTL', 'I_BRN',
    'E_BRN', 'E_TTL', 'B_NAME',
]

# (tag, id) pairs plus the forward and reverse lookup tables built from them.
tag_list = [(name, number) for number, name in enumerate(_TAG_NAMES)]
tag_to_id = {name: number for name, number in tag_list}
id_to_tag = dict((number, name) for name, number in tag_to_id.items())


def get_tag_id(tag):
    """Return the integer id of ``tag``; tags not in the table map to 0 (``'O'``)."""
    if tag in tag_to_id:
        return tag_to_id[tag]
    return 0

In [None]:
import pandas as pd

def group_sentences_with_id(data, is_test=False, max_rows=1300000):
    """Group token rows into sentences using the ``class`` boundary markers.

    A row whose ``class`` is ``'B_CLS'`` starts a new sentence (flushing any
    sentence still open), ``'I_CLS'`` extends the open sentence and
    ``'E_CLS'`` extends and closes it. Rows with any other ``class`` value are
    ignored. A sentence still open after the last row is emitted as well.

    Args:
        data: DataFrame with ``word``, ``tag`` and ``class`` columns.
        is_test: When False (training/eval mode), rows whose tag is not a key
            of ``tag_to_id`` are skipped and ``max_rows`` is enforced. When
            True, neither filter is applied.
        max_rows: Training/eval mode only: stop once this many rows have been
            read. The default keeps the notebook's previous hard-coded cap;
            pass ``None`` to read every row.

    Returns:
        A list of ``{'sentence_id': int, 'words': [(word, tag), ...]}`` dicts
        with consecutive sentence ids starting at 0.
    """
    sentences = []
    current = []

    def close_current():
        # Ids are consecutive, so the next id is the number already emitted.
        sentences.append({'sentence_id': len(sentences), 'words': current})

    # Plain column iteration avoids building one Series per row, which
    # iterrows does and which is very slow on large frames.
    rows = zip(data['word'], data['tag'], data['class'])
    for position, (word, tag, cls) in enumerate(rows):
        if not is_test:
            # Row position equals the index label for the default RangeIndex
            # that read_csv produces.
            if max_rows is not None and position >= max_rows:
                break
            if tag not in tag_to_id:
                continue

        if cls == 'B_CLS':
            if current:
                close_current()
            current = [(word, tag)]
        elif cls == 'I_CLS':
            current.append((word, tag))
        elif cls == 'E_CLS':
            current.append((word, tag))
            close_current()
            current = []

    if current:
        close_current()

    return sentences

def create_dataframe(sentences):
    """Turn grouped sentences into a frame with ``id``, ``tokens`` and ``ner_tags``.

    Each input item is a dict with a ``sentence_id`` and a ``words`` list of
    ``(word, tag)`` pairs; the pairs are split into two parallel lists.
    """
    records = [
        {
            'id': entry['sentence_id'],
            'tokens': [word for word, _tag in entry['words']],
            'ner_tags': [tag for _word, tag in entry['words']],
        }
        for entry in sentences
    ]
    return pd.DataFrame(records)

# Training split: group rows into sentences, then build the sentence-level frame.
train_sentences = group_sentences_with_id(train_data)
train_df = create_dataframe(train_sentences)

# Evaluation split.
eval_sentences = group_sentences_with_id(eval_data)
eval_df = create_dataframe(eval_sentences)

# Test split: grouped in test mode, so no tag filtering is applied.
test_sentences = group_sentences_with_id(test_data, is_test=True)
test_df = create_dataframe(test_sentences)

print(train_df.head())

In [None]:
def convert_data_to_df(df):
    """Flatten a sentence-level frame into one row per token.

    Args:
        df: DataFrame with a ``tokens`` column (list of words per sentence)
            and a ``ner_tags`` column (list of labels per sentence).

    Returns:
        A DataFrame with the columns ``sentence_id`` (0-based position of the
        sentence in ``df``), ``words`` and ``labels``.

    Raises:
        IndexError: If a sentence has fewer labels than tokens.
    """
    sentence_id = []
    words = []
    labels = []

    # Iterate the columns directly rather than doing df['tokens'][i][j]
    # lookups: those are label-based (they fail on a non-default index) and
    # very slow on large frames.
    for position, (tokens, tags) in enumerate(zip(df['tokens'], df['ner_tags'])):
        for token_index, word in enumerate(tokens):
            sentence_id.append(position)
            words.append(word)
            labels.append(tags[token_index])

    return pd.DataFrame(
        {"sentence_id": sentence_id, "words": words, "labels": labels}
    )

In [None]:
# Flatten the sentence-level frames into token-level frames.
# NOTE(review): this rebinds `eval_data` / `train_data`, replacing the raw
# frames loaded from CSV; cells that expect the raw `word`/`tag`/`class`
# columns must not be re-run after this one without reloading the CSVs.
eval_data = convert_data_to_df(eval_df)
train_data = convert_data_to_df(train_df)

In [None]:
from simpletransformers.ner import NERModel
import pandas as pd

In [None]:
# Training options handed to the NER model.
model_args = dict(
    num_train_epochs=2,
    learning_rate=1e-4,
    max_seq_length=128,
    train_batch_size=32,
    eval_batch_size=32,
    overwrite_output_dir=True,
    save_steps=-1,
    save_model_every_epoch=False,
)

# Label names are passed in the insertion order of tag_to_id.
ner_labels = list(tag_to_id)
model = NERModel(
    'bert',
    'bert-base-multilingual-cased',
    labels=ner_labels,
    args=model_args,
)

In [None]:
# Display the token-level training frame as the cell output.
train_data

In [None]:
# Fine-tune the model on the token-level training frame.
# NOTE(review): `eval_data` is presumably only used when
# `evaluate_during_training` is enabled in the model args, which `model_args`
# does not set - confirm against the simpletransformers documentation.
model.train_model(train_data, eval_data=eval_data)

In [None]:
# Display the raw test frame as the cell output.
test_data

In [None]:
def split_into_sentences(tokens, tokens_per_sentence=24):
    """Cut ``tokens`` into consecutive slices of at most ``tokens_per_sentence`` items.

    The final slice holds whatever remains and may be shorter.
    """
    chunk_starts = range(0, len(tokens), tokens_per_sentence)
    return [tokens[start:start + tokens_per_sentence] for start in chunk_starts]

In [None]:
def cat_string(df):
    """Return every value of ``df['word']`` converted to ``str``, in order."""
    return [str(value) for value in df['word']]

# Flat list of all test words as strings, in row order.
test_txt_list = cat_string(test_data)

In [None]:
# Chunk the flat word list into fixed windows (default 24 tokens each).
# NOTE(review): these windows ignore the B_CLS/E_CLS sentence boundaries in the
# test data - confirm that fixed-size chunks are intended for prediction.
test_df_prepared = split_into_sentences(test_txt_list)

In [None]:
def format_test_data(test_data):
    """Join each token sequence in ``test_data`` into one space-separated string."""
    return list(map(" ".join, test_data))

In [None]:
# Run the model on the pre-chunked token lists.
# NOTE(review): the positional `False` is presumably `split_on_space=False`
# (inputs are already token lists) - confirm against the simpletransformers API.
predictions, raw_outputs = model.predict(test_df_prepared,False)

In [None]:
# Inspect the first ten entries of the first chunk's predictions.
predictions[0][0:10]

In [None]:
# Flatten the nested predictions into a single list holding the values of
# every per-token mapping, in prediction order.
final_test_df = []
for chunk_predictions in predictions:
    for token_prediction in chunk_predictions:
        final_test_df.extend(token_prediction.values())

In [None]:
# Load the competition's tag table; the `tag` and `class` columns are used
# further down to translate predicted tags into submission classes.
ner_list = pd.read_csv("/kaggle/input/super-ai-ss-5-named-entity-recognition/tag_list.csv")
ner_list.head()

In [None]:
# Translate every predicted tag into its submission class.
# A dict lookup replaces the nested scan over ner_list, which was O(n * m)
# and silently dropped any tag missing from the table, shifting every later
# row of the submission.
tag_to_class = {}
for tag_name, class_value in zip(ner_list["tag"], ner_list["class"]):
    # Keep the first occurrence if a tag is listed more than once.
    tag_to_class.setdefault(tag_name, str(class_value))

# Unknown tags fall back to the class of "O" so the output stays aligned
# one-to-one with the predictions.
fallback_class = tag_to_class.get("O")

Final_NER = []
unknown_tags = set()
for tags in final_test_df:
    if tags in tag_to_class:
        Final_NER.append(tag_to_class[tags])
        continue
    unknown_tags.add(tags)
    if fallback_class is None:
        raise KeyError(
            f"Predicted tag {tags!r} is missing from the tag table and no 'O' fallback exists"
        )
    Final_NER.append(fallback_class)

if unknown_tags:
    print(f"Warning: tags missing from the tag table were mapped to 'O': {sorted(map(str, unknown_tags))}")
print(Final_NER[0:10])

In [None]:
# Wrap the class list in a single-column DataFrame and display it.
final_result = pd.DataFrame(Final_NER)
final_result

In [None]:
# Load the sample submission and fill its `ne` column with the predicted classes.
# NOTE(review): assigning a DataFrame to a column presumably aligns on the
# index, so a length mismatch between `final_result` and the sample submission
# would leave NaN or drop rows instead of raising - consider checking
# len(final_result) == len(submisstion_df) first.
submisstion_df = pd.read_csv('/kaggle/input/super-ai-ss-5-named-entity-recognition/sample_submission.csv')
submisstion_df['ne'] = final_result
submisstion_df.head(15)

In [None]:
# Write only the `id` and `ne` columns to the submission file, without the index.
submisstion_df[['id', 'ne']].to_csv('submissionfinal3.csv', index = False)