# Import Libraries

In [None]:

import pandas as pd
import MeCab 
import sudachipy
import sentencepiece as spm
from sudachipy import tokenizer
from sudachipy import dictionary

In [8]:
# Read input CSV file
input_file = 'F:\\Matching_technologies\\japanese_sentences_sample.csv' 
df_input = pd.read_csv(input_file)


# Initialize Mecab, SudachiPy, SentencePiece

In [7]:
# Initialize MeCab
mecab = MeCab.Tagger()

# Initialize SudachiPy (using default dictionary)
tokenizer_obj = dictionary.Dictionary().create()

# Initialize SentencePiece
sp = spm.SentencePieceProcessor() 

In [24]:
import sentencepiece as spm

# Path to your corpus (a plain text file with sentences)
corpus_file = 'corpus.txt'

# Train the SentencePiece model
spm.SentencePieceTrainer.train(
    f'--input={corpus_file} '  # Path to your text corpus
    f'--model_prefix=mymodel '  # The prefix for the output model files
    f'--vocab_size=240 '  # Size of the vocabulary (you can change this)
    f'--character_coverage=0.9995 '  # The percentage of characters covered by the model
    f'--model_type=bpe '  # Type of model, can be 'bpe', 'unigram', etc.
)


In [35]:
import pandas as pd
import MeCab
import sudachipy
import sentencepiece as spm
from sudachipy import tokenizer
from sudachipy import dictionary

# Load the SentencePiece model (if you have trained one)
sp = spm.SentencePieceProcessor(model_file='mymodel.model')  # Use your trained model

# Initialize MeCab tokenizer
mecab = MeCab.Tagger()

# Initialize SudachiPy tokenizer (using default dictionary)
tokenizer_obj = dictionary.Dictionary().create()

def tokenize_mecab(text):
    # Tokenize with MeCab
    nodes = mecab.parse(text).splitlines()
    tokens = [node.split('\t')[0] for node in nodes if node != 'EOS']
    return tokens

def tokenize_sudachipy(text):
    # Tokenize with SudachiPy
    mode = tokenizer.Tokenizer.SplitMode.C
    tokens = tokenizer_obj.tokenize(text, mode)
    return [token.surface() for token in tokens]

def tokenize_sentencepiece(text):
    # Tokenize with SentencePiece
    tokens = sp.encode(text, out_type=str)
    return tokens

def process_text(text, tokenizer_name):
    # Tokenize and process the text using the appropriate tokenizer
    if tokenizer_name == 'MeCab':
        tokens = tokenize_mecab(text)
    elif tokenizer_name == 'SudachiPy':
        tokens = tokenize_sudachipy(text)
    elif tokenizer_name == 'SentencePiece':
        tokens = tokenize_sentencepiece(text)
    else:
        raise ValueError("Unknown tokenizer name")

    # Calculate total token count and unique token count
    total_token_count = len(tokens)
    unique_token_count = len(set(tokens))

    return tokens, total_token_count, unique_token_count

# Read the CSV file containing Japanese sentences
df = pd.read_csv('japanese_sentences_sample.csv')  # Replace with your actual CSV file
print(df.head())
print(df.columns)
# Prepare a list to collect results
results = []
# Iterate over each row (sentence) in the DataFrame
for index, row in df.iterrows():
    text = row['こんにちは']  

    # MeCab tokenization
    mecab_tokens, mecab_total, mecab_unique = process_text(text, 'MeCab')
    results.append([text, 'MeCab', mecab_tokens, mecab_total, mecab_unique])
    
    # SudachiPy tokenization
    sudachi_tokens, sudachi_total, sudachi_unique = process_text(text, 'SudachiPy')
    results.append([text, 'SudachiPy', sudachi_tokens, sudachi_total, sudachi_unique])

    # SentencePiece tokenization
    sentencepiece_tokens, sentencepiece_total, sentencepiece_unique = process_text(text, 'SentencePiece')
    results.append([text, 'SentencePiece', sentencepiece_tokens, sentencepiece_total, sentencepiece_unique])

# Convert the results into a DataFrame
df_results = pd.DataFrame(results, columns=['input_japanese_txt', 'algorithm', 'tokens', 'total_token_count', 'unique_token_count'])

# Save the results to a new CSV file
df_results.to_csv('tokenization_results.csv', index=False)

print("Tokenization results saved to 'tokenization_results.csv'")


             こんにちは
0          お元気ですか？
1      今日はいい天気ですね。
2       何を食べたいですか？
3  東京に行ったことがありますか？
4         明日は休みです。
Index(['こんにちは'], dtype='object')
Tokenization results saved to 'tokenization_results.csv'
