In [1]:
import pandas as pd
import re
from jisho_api.sentence import Sentence
from sudachipy import tokenizer
from sudachipy import dictionary
from wanakana import to_hiragana, is_japanese, is_katakana, is_hiragana, is_kanji
from deep_translator import (ChatGptTranslator, GoogleTranslator)
import string

In [2]:
tokenizer_obj = dictionary.Dictionary(dict_type="small").create()

KANJI_READING_MAPPING = {
    '私': '私[わたし]',
    '貴女': '貴女[あなた]',
    '何': '何[なに]',
    '外宇宙': '外宇宙[がいうちゅう]',
    '異星人': '異星人[いせいじん]',
    '優那': '優那[ゆうな]',
    '菜々美': '菜々美[ななみ]'
}

JAPANESE_PUNCTUATION = '　〜！？。、（）：「」『』０１２３４５６７８９ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ'

SPECIAL_CHARACTERS = '〜'

def is_japanese_extended(text):
    return is_japanese(text) and text not in string.punctuation and text not in JAPANESE_PUNCTUATION

def to_anki_format(index, kanji, reading):
    return '{}{}[{}]'.format(' ' if index > 0 else '', kanji, reading) 

def add_furigana(text):
    tokens = [m for m in tokenizer_obj.tokenize(text, tokenizer.Tokenizer.SplitMode.C)]
    parsed = ''
    token_indexes_to_skip = []
    for index, token in enumerate(tokens):   
        if index in token_indexes_to_skip:
          continue
        to_parse = is_japanese_extended(token.surface()) and not is_katakana(token.surface()) and not is_hiragana(token.surface())
        if to_parse:
            if token.surface()[-1] in SPECIAL_CHARACTERS:
                parsed += add_furigana(token.surface()[:-1]) + token.surface()[-1]
            else:
                if index > 0:
                    parsed += ' '
                reading = to_hiragana(token.reading_form())
                if token.surface() in KANJI_READING_MAPPING:
                    parsed += KANJI_READING_MAPPING[token.surface()]
                elif index < len(tokens)-1 and token.surface() + tokens[index+1].surface() in KANJI_READING_MAPPING:
                    parsed += KANJI_READING_MAPPING[tokens[index].surface() + tokens[index+1].surface()]
                    token_indexes_to_skip.append(index+1)
                else:
                    surface_index = 0
                    reading_index = 0
                    while len(token.surface()) > surface_index:
                        if is_hiragana(token.surface()[surface_index]) or is_katakana(token.surface()[surface_index]):
                            parsed += token.surface()[surface_index]
                            reading_index += 1
                            surface_index += 1
                        else:
                            next_index = -1
                            for token_index in range(surface_index, len(token.surface())):
                                if is_hiragana(token.surface()[token_index]) or is_katakana(token.surface()[token_index]):
                                    next_index = token_index
                                    break
                            if next_index < 0:
                                parsed += to_anki_format(
                                  index=surface_index, 
                                  kanji=token.surface()[surface_index:], reading=reading[reading_index:])
                                break
                            else:
                                reading_index_tail = reading_index
                                while reading[reading_index_tail] != token.surface()[next_index] or (reading_index_tail < len(reading)-1 and reading[reading_index_tail] == reading[reading_index_tail+1]):
                                    reading_index_tail += 1
                                parsed += to_anki_format(
                                  index=surface_index, 
                                  kanji=token.surface()[surface_index:next_index], reading=reading[reading_index:reading_index_tail])
                                reading_index = reading_index_tail
                            reading_length = next_index - surface_index
                            if reading_length > 0:
                                surface_index += reading_length
                            else:
                                break
        else:
            parsed += token.surface()
    return parsed

  tokenizer_obj = dictionary.Dictionary(dict_type="small").create()


In [3]:
def translate(word):
    return GoogleTranslator(source='japanese', target='english').translate(text=word)

In [4]:
def define_word(row):
    """
    Defines vocab from a single row of the DataFrame.
    
    Parameters:
        row (pd.Series): A single row of the DataFrame.
        
    Returns:
        str: The processed vocab definition.
    """
    if pd.isna(row['Definition']):
        return translate(row['Vocab'])
    else:
        return row['Definition']

In [22]:
def get_example(row):
    if pd.isna(row['Sentence']):
        r = Sentence.request(row['Vocab'])
        if r is not None:
            if r.data:  # Check if data is not empty
                sentence = r.data[0].japanese
                return 
        return None  # Return None if no match found or no data
    else:
        return row['Sentence']

In [24]:
result = Sentence.request('湯気')
if result is not None:
    jisho_sentences = []

    # Accessing metadata
    status_code = result.meta.status

    # Accessing sentence data
    for sentence_config in result.data:
        japanese_sentence = sentence_config.japanese
        english_translation = sentence_config.en_translation

        jisho_sentences.append((japanese_sentence, english_translation))

    # Creating a DataFrame
    test = pd.DataFrame(jisho_sentences, columns=['Japanese Sentence', 'English Translation'])

here


In [30]:
def replace_parentheses(input_string):
    # Replace all parentheses with brackets
    brackets_replaced = input_string.replace('(', '[').replace(')', ']')

    # Remove parentheses and content within them using regular expressions
    parentheses_removed = re.sub(r'\([^)]*\)', '', input_string)

    return brackets_replaced, parentheses_removed

In [45]:
x, y = replace_parentheses(jisho_sentences[0][0])
print(f'{x}\n{y}')

薬缶[やかん]から湯気[ゆげ]が立っている[た]
薬缶から湯気が立っている


In [9]:
vocabulary = pd.read_csv('../../data/new_vocab.csv')

In [10]:
data = []
columns = ['index', 'word', 'word_with_reading', 'definition', 'example_sentence', 'sentence_with_reading', 'sentence_translation', 'Kanji', 'v1', 'word_audio', 'setence_audio']
df = pd.DataFrame(data, columns=columns)

In [23]:
df['index'] = vocabulary['Vocab']
df['word'] = vocabulary['Vocab']
df['word_with_reading'] = vocabulary['Vocab'].apply(add_furigana)
df['definition'] = vocabulary.apply(define_word, axis=1)
df['example_sentence'] = vocabulary.apply(get_example, axis=1)
df['sentence_with_reading'] = vocabulary['Sentence'].apply(add_furigana)
# df['sentence_translation'] = vocabulary.apply(lambda row: translate(row['Sentence']) if row['Sentence_def'] == 'None' else row['Sentence_def'], axis=1)
# df['Kanji'] = True
df

Unnamed: 0,index,word,word_with_reading,definition,example_sentence,sentence_with_reading,sentence_translation,Kanji,v1,word_audio,setence_audio
0,湯気,湯気,湯気[ゆげ],Steam (cooking/kettle/bath/hot spring),薬缶(やかん)から湯気(ゆげ)が立っている(た),,,,,,
1,個体,個体,個体[こたい],Solid,個体発生(こたいはっせい)は系統発生(けいとうはっせい)を繰り返す(くりかえ),,,,,,
2,気体,気体,気体[きたい],Liquid,窒素(ちっそ)大気(たいき)約5分の4(やくごぶんのよん)占める(し)無色無臭(むしょくむしゅう),,,,,,
3,地球,地球,地球[ちきゅう],Earth,近年(きんねん)世界中(せかいじゅう)否めない(いな),,,,,,
4,球,球,球[たま],Ball (basketball/baseball/earth) 玉 is used for...,近年(きんねん)世界中(せかいじゅう)否めない(いな),,,,,,
5,降りる,降りる,降[お]りる,to get off (a bus or plane),草(くさ)に霜(しも)が降りている(おり),,,,,,
6,竹とんぼ,竹とんぼ,竹[たけ]とんぼ,Bamboo dragonfly,,,,,,,
7,虫眼鏡,虫眼鏡,虫[むし] 眼鏡[めがね],Magnifying glass,,,,,,,
8,爪きり,爪きり,爪[つめ]きり,Nail Clipper,,,,,,,
9,削る,削る,削[けず]る,To sharpen (pencil),私(わたし)は鉛筆(えんぴつ)をけずるナイフがほしい,,,,,,


In [None]:
df.to_csv('../../data/vocab.csv', encoding="utf-8", index=False, header=False)