In [1]:
import pandas as pd
import re
from jisho_api.sentence import Sentence
from jisho_api.word import Word
from sudachipy import tokenizer
from sudachipy import dictionary
from wanakana import to_hiragana, is_japanese, is_katakana, is_hiragana, is_kanji
from deep_translator import (ChatGptTranslator, GoogleTranslator)
import string

In [2]:
# Shared SudachiPy tokenizer, created once from the "small" dictionary and
# reused by add_furigana for every sentence it splits.
tokenizer_obj = dictionary.Dictionary(dict_type="small").create()

# Hand-written reading overrides: surface form -> ready-made "kanji[reading]"
# annotation. add_furigana consults this table before using the tokenizer's
# own reading.
KANJI_READING_MAPPING = {
    kanji: f'{kanji}[{kana}]'
    for kanji, kana in (
        ('私', 'わたし'),
        ('貴女', 'あなた'),
        ('何', 'なに'),
        ('外宇宙', 'がいうちゅう'),
        ('異星人', 'いせいじん'),
        ('優那', 'ゆうな'),
        ('菜々美', 'ななみ'),
    )
}

# Characters that is_japanese_extended must not treat as annotatable text:
# full-width punctuation, then full-width digits, lower-case and upper-case
# Latin letters (in that order).
JAPANESE_PUNCTUATION = ''.join((
    '　〜！？。、（）：「」『』',
    '０１２３４５６７８９',
    'ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ',
    'ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ',
))

# Trailing characters that add_furigana strips off a token before annotating
# the rest and then re-appends unchanged.
SPECIAL_CHARACTERS = '〜'

def is_japanese_extended(text):
    """Tell whether ``text`` is Japanese text worth annotating.

    Args:
        text: Surface string of a token.

    Returns:
        bool: True when ``is_japanese(text)`` holds and ``text`` contains at
        least one character that is neither ASCII punctuation
        (``string.punctuation``) nor listed in ``JAPANESE_PUNCTUATION``.
        Empty input yields False.
    """
    if not text or not is_japanese(text):
        return False
    # Check character by character: a plain ``text in "<punctuation>"`` test is
    # a substring match, so tokens such as '２０２４' or '？！' (all excluded
    # characters, but not a contiguous slice of the table) would wrongly pass.
    excluded = string.punctuation + JAPANESE_PUNCTUATION
    return not all(char in excluded for char in text)

def to_anki_format(index, kanji, reading):
    """Render one ``kanji[reading]`` furigana chunk.

    Args:
        index: Position of the chunk inside its token; any value greater than
            zero causes a single leading space to be emitted.
        kanji: The kanji run being annotated.
        reading: The kana reading placed inside the brackets.

    Returns:
        str: ``"kanji[reading]"``, prefixed with a space when ``index > 0``.
    """
    separator = ' ' if index > 0 else ''
    return f'{separator}{kanji}[{reading}]'

def _annotate_token(surface, reading):
    """Split one mixed kanji/kana token into ``kanji[reading]`` chunks.

    Args:
        surface: Token text as written (kanji possibly mixed with kana).
        reading: Hiragana reading of the whole token.

    Returns:
        str: ``surface`` with each kanji run followed by its bracketed share
        of ``reading``; kana characters are copied through unchanged.
    """
    annotated = ''
    surface_index = 0
    reading_index = 0
    while surface_index < len(surface):
        char = surface[surface_index]
        if is_hiragana(char) or is_katakana(char):
            # Kana is copied verbatim and is assumed to use up exactly one
            # character of the reading.
            annotated += char
            reading_index += 1
            surface_index += 1
            continue

        # The kanji run ends at the next kana character of the surface.
        next_index = -1
        for candidate in range(surface_index + 1, len(surface)):
            if is_hiragana(surface[candidate]) or is_katakana(surface[candidate]):
                next_index = candidate
                break

        if next_index < 0:
            # Kanji up to the end of the token: it takes the rest of the reading.
            annotated += to_anki_format(
                index=surface_index,
                kanji=surface[surface_index:],
                reading=reading[reading_index:])
            break

        # Advance through the reading until it shows the kana that follows the
        # kanji run. A kana that is immediately repeated in the reading is
        # skipped so that the match lands on the last one of the pair.
        reading_index_tail = reading_index
        while reading_index_tail < len(reading) and (
            reading[reading_index_tail] != surface[next_index]
            or (reading_index_tail < len(reading) - 1
                and reading[reading_index_tail] == reading[reading_index_tail + 1])
        ):
            reading_index_tail += 1

        if reading_index_tail >= len(reading):
            # The kana never occurs in the reading (for example a katakana
            # character inside the surface while the reading is hiragana).
            # Scanning further would raise IndexError, so annotate everything
            # that is left as a single chunk instead.
            annotated += to_anki_format(
                index=surface_index,
                kanji=surface[surface_index:],
                reading=reading[reading_index:])
            break

        annotated += to_anki_format(
            index=surface_index,
            kanji=surface[surface_index:next_index],
            reading=reading[reading_index:reading_index_tail])
        reading_index = reading_index_tail
        surface_index = next_index
    return annotated


def add_furigana(text):
    """Return ``text`` with Anki-style furigana (``kanji[reading]``) added.

    The text is tokenized with ``tokenizer_obj`` in split mode C. Tokens that
    are not Japanese (per ``is_japanese_extended``) or that consist only of
    hiragana or only of katakana are copied through unchanged. Every other
    token is annotated:

    * a token ending in one of ``SPECIAL_CHARACTERS`` is annotated recursively
      without that last character, which is then re-appended;
    * otherwise a space is inserted before the token unless it is the first
      token, and the annotation comes from ``KANJI_READING_MAPPING`` (matching
      the token alone, or the token joined with the following one, which is
      then skipped) or, failing that, from the token's own reading.

    Args:
        text: Sentence or word to annotate.

    Returns:
        str: The annotated text.
    """
    tokens = list(tokenizer_obj.tokenize(text, tokenizer.Tokenizer.SplitMode.C))
    parsed = ''
    token_indexes_to_skip = set()
    for index, token in enumerate(tokens):
        if index in token_indexes_to_skip:
            continue

        surface = token.surface()
        to_parse = (
            is_japanese_extended(surface)
            and not is_katakana(surface)
            and not is_hiragana(surface)
        )
        if not to_parse:
            parsed += surface
            continue

        if surface[-1] in SPECIAL_CHARACTERS:
            parsed += add_furigana(surface[:-1]) + surface[-1]
            continue

        if index > 0:
            parsed += ' '
        reading = to_hiragana(token.reading_form())

        if surface in KANJI_READING_MAPPING:
            parsed += KANJI_READING_MAPPING[surface]
        elif index < len(tokens) - 1 and surface + tokens[index + 1].surface() in KANJI_READING_MAPPING:
            # The override covers this token and the next one together.
            parsed += KANJI_READING_MAPPING[surface + tokens[index + 1].surface()]
            token_indexes_to_skip.add(index + 1)
        else:
            parsed += _annotate_token(surface, reading)
    return parsed

  tokenizer_obj = dictionary.Dictionary(dict_type="small").create()


In [3]:
def translate(word):
    """Translate Japanese text to English through ``GoogleTranslator``.

    Args:
        word: Japanese word or sentence to translate.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns for ``word``.
    """
    translator = GoogleTranslator(source='japanese', target='english')
    return translator.translate(text=word)

In [4]:
def scrape_def(word):
    """Look up ``word`` with ``Word.request`` and format its first sense.

    Only the first sense of the first result is used.

    Args:
        word: Vocabulary item to look up.

    Returns:
        str | None: The English definitions joined with ``"; "``, preceded by
        a part-of-speech line: ``"<pos>\\n<definitions>"`` for one part of
        speech, ``"<pos[1]> - <pos[0]>\\n<definitions>"`` for two or more, or
        just the definitions when the sense lists none. ``None`` when the
        lookup returns nothing usable (no response, no results, no senses).
    """
    response = Word.request(word)
    # A failed lookup must not abort a whole DataFrame.apply run.
    if response is None or not response.data:
        return None
    senses = response.data[0].senses
    if not senses:
        return None

    first_sense = senses[0]
    definitions = '; '.join(first_sense.english_definitions)
    parts_of_speech = first_sense.parts_of_speech
    if not parts_of_speech:
        # Nothing to put in the header line; indexing would raise IndexError.
        return definitions
    if len(parts_of_speech) == 1:
        return f"{parts_of_speech[0]}\n{definitions}"
    return f"{parts_of_speech[1]} - {parts_of_speech[0]}\n{definitions}"

In [5]:
def define_word(row):
    """Pick the definition for one vocabulary row.

    Args:
        row: Mapping-like row with ``'Vocab'`` and ``'Definition'`` entries.

    Returns:
        The row's own ``'Definition'`` when it is not missing (per
        ``pd.isna``); otherwise the result of ``scrape_def`` for ``'Vocab'``.
    """
    existing = row['Definition']
    if not pd.isna(existing):
        return existing
    return scrape_def(row['Vocab'])

In [6]:
def get_example(row):
    """Build the example-sentence triple for one vocabulary row.

    Args:
        row: Mapping-like row with ``'Vocab'``, ``'Sentence'`` and
            ``'Sentence_def'`` entries.

    Returns:
        tuple: ``(sentence, sentence_with_reading, translation)``.

        * ``'Sentence'`` present: the sentence itself, ``add_furigana`` of it,
          and ``'Sentence_def'`` (or ``translate`` of the sentence when
          ``'Sentence_def'`` is missing).
        * ``'Sentence'`` missing: the first item returned by
          ``Sentence.request`` for ``'Vocab'``, passed through
          ``replace_parentheses``, together with its English translation; or
          ``(None, None, None)`` when the lookup yields nothing.
    """
    own_sentence = row['Sentence']

    if not pd.isna(own_sentence):
        if pd.isna(row['Sentence_def']):
            return own_sentence, add_furigana(own_sentence), translate(own_sentence)
        return own_sentence, add_furigana(own_sentence), row['Sentence_def']

    response = Sentence.request(row['Vocab'])
    if response is not None and response.data:
        # Only the first hit is used; the loop returns on its first iteration.
        for hit in response.data:
            plain, with_reading = replace_parentheses(hit.japanese)
            return plain, with_reading, hit.en_translation
    return None, None, None  # no response, or a response without data

In [7]:
# Exploratory check of what Sentence.request returns for a single word.
# NOTE: `jisho_sentences`, `status_code` and `test` are only bound when the
# request returns something other than None.
result = Sentence.request('湯気')
if result is not None:
    jisho_sentences = []

    # Accessing metadata
    # (`status_code` is not read again anywhere in this cell.)
    status_code = result.meta.status

    # Accessing sentence data
    # Collect one (Japanese sentence, English translation) pair per result item.
    for sentence_config in result.data:
        japanese_sentence = sentence_config.japanese
        english_translation = sentence_config.en_translation

        jisho_sentences.append((japanese_sentence, english_translation))

    # Creating a DataFrame
    # One row per collected pair, for eyeballing the raw API output.
    test = pd.DataFrame(jisho_sentences, columns=['Japanese Sentence', 'English Translation'])

In [8]:
def replace_parentheses(input_string):
    """Derive the plain and the bracket-annotated form of a sentence.

    Args:
        input_string: Sentence in which readings are written in ASCII
            parentheses, e.g. ``"窓(まど)が"``.

    Returns:
        tuple[str, str]: ``(plain, bracketed)`` where ``plain`` has every
        ``(...)`` group removed and ``bracketed`` has each ``(`` / ``)``
        replaced by ``[`` / ``]``.
    """
    # Swap every round bracket for the matching square one.
    bracket_table = {ord('('): '[', ord(')'): ']'}
    bracketed = input_string.translate(bracket_table)

    # Drop each "(...)" group, parentheses included.
    plain = re.compile(r'\([^)]*\)').sub('', input_string)

    return plain, bracketed

In [9]:
# Input word list. The cells below read its 'Vocab', 'Definition', 'Sentence'
# and 'Sentence_def' columns; missing values are filled in by the lookups.
vocabulary = pd.read_csv('../../data/new_vocab.csv')
vocabulary

Unnamed: 0,Vocab,Definition,Sentence,Sentence_def
0,落とす,,,
1,落ちる,,,
2,出す,,,
3,出る,,,
4,入れる,,,
5,入る,,,
6,開ける,,,
7,開く,,,
8,閉める,,,
9,閉まる,,,


In [10]:
# Empty output frame with the card columns in the order they are written out.
# NOTE(review): 'setence_audio' looks like a typo for 'sentence_audio'; left
# unchanged because it is a runtime column name.
data = []
columns = ['index', 'word', 'word_with_reading', 'definition', 'example_sentence', 'sentence_with_reading', 'sentence_translation', 'Kanji', 'v1', 'word_audio', 'setence_audio']
df = pd.DataFrame(data, columns=columns)

In [11]:
# Fill the card columns from the vocabulary list. 'index' and 'word' both hold
# the raw vocabulary item.
df['index'] = vocabulary['Vocab']
df['word'] = vocabulary['Vocab']
df['word_with_reading'] = vocabulary['Vocab'].apply(add_furigana)
df['definition'] = vocabulary.apply(define_word, axis=1)

# Apply the function to each row of the DataFrame
# get_example returns a 3-tuple; result_type='expand' spreads it over the
# columns 0, 1 and 2 of `sentences`.
sentences = vocabulary.apply(get_example, axis=1, result_type='expand')
df['example_sentence'] = sentences[0]
df['sentence_with_reading'] = sentences[1]
df['sentence_translation'] = sentences[2]
# df['sentence_translation'] = vocabulary.apply(lambda row: translate(row['Sentence']) if row['Sentence_def'] == 'None' else row['Sentence_def'], axis=1)
# Every row gets the constant flag True; 'v1', 'word_audio' and
# 'setence_audio' are not assigned here and stay empty.
df['Kanji'] = True
df

Unnamed: 0,index,word,word_with_reading,definition,example_sentence,sentence_with_reading,sentence_translation,Kanji,v1,word_audio,setence_audio
0,落とす,落とす,落[お]とす,Transitive verb - Godan verb with 'su' ending\...,一つ精油お湯数滴,一つ[ひと]精油[せいゆ]お湯[おゆ]数滴[すうてき],One way to enjoy aromatherapy is to place a fe...,True,,,
1,落ちる,落ちる,落[お]ちる,Intransitive verb - Ichidan verb\nto fall; to ...,何度無くなった,何度[なんど]無くなった[な],I kept failing the exam time after time and ev...,True,,,
2,出す,出す,出[だ]す,Transitive verb - Godan verb with 'su' ending\...,行き詰まり洗い出す,行き詰まり[いきづ]洗い出す[あらいだ],If you are feeling stuck in your current state...,True,,,
3,出る,出る,出[で]る,Intransitive verb - Ichidan verb\nto leave; to...,昨夜秒速50メートル樹木倒壊する,昨夜[さくや]秒速[びょうそく]50メートル[ごじゅう]樹木[じゅもく]倒壊する[とうかい],"Last night’s ferocious typhoon, with a maximum...",True,,,
4,入れる,入れる,入[い]れる,Transitive verb - Ichidan verb\nto put in; to ...,老い外見,老い[お]外見[がいけん],"As you age, it is natural to develop wrinkles ...",True,,,
5,入る,入る,入[はい]る,Intransitive verb - Godan verb with 'ru' endin...,正門前怖い先輩先輩裏門入った,正門前[せいもんまえ]怖い[こわ]先輩[せんぱい]先輩[せんぱい]裏門[うらもん]入った[はい],There was a scary senior student in front of t...,True,,,
6,開ける,開ける,開[あ]ける,Transitive verb - Ichidan verb\nto open (a doo...,祈る合否開けた,祈る[いの]合否[ごうひ]開けた[あ],I opened the sealed envelope informing me of ...,True,,,
7,開く,開く,開[ひら]く,Intransitive verb - Godan verb with 'ku' endin...,野党各党図る連日会合開いて,野党各党[やとうかくとう]図る[はか]連日[れんじつ]会合[かいごう]開いて[ひら],Opposition political parties are meeting daily...,True,,,
8,閉める,閉める,閉[し]める,Transitive verb - Ichidan verb\nto close; to shut,息子はドアをバタンと閉めて怒りを表わした,息子[むすこ]はドアをバタンと閉めて[し]怒り[いか]を表わした[あらわ],The son demonstrated his anger by shutting the...,True,,,
9,閉まる,閉まる,閉[し]まる,Intransitive verb - Godan verb with 'ru' endin...,窓が閉まっているか確かめとけ,窓[まど]が閉まっている[しま]か確かめ[たし]とけ,See that the window is closed.,True,,,


In [12]:
# Write the cards as UTF-8 CSV without the pandas index and without a header
# row, so the file contains data rows only.
df.to_csv('../../data/vocab.csv', encoding="utf-8", index=False, header=False)

In [13]:
# Scratch lookup used by the exploratory cells that follow.
# NOTE(review): `Word` is already imported in the first cell; this import is
# redundant.
from jisho_api.word import Word
r = Word.request('仕舞う')

In [14]:
# Collect the English definitions of every sense of the first result:
# `meanings` ends up as one list of definitions per sense.
senses = r.data[0].senses
meanings = []
for sense in senses:
    english_definitions = sense.english_definitions
    meanings.append(english_definitions)
    
meanings

[['to finish', 'to stop', 'to end', 'to put an end to', 'to bring to a close'],
 ['to close (a business, etc.)',
  'to close down',
  'to shut down',
  'to shut up'],
 ['to put away', 'to put back', 'to keep', 'to store'],
 ['to do completely', 'to finish'],
 ['to do accidentally', 'to do without meaning to', 'to happen to do']]

In [15]:
# Prototype of the definition formatting used in scrape_def: join the first
# sense's definitions with "; ".
x = r.data[0].senses[0].english_definitions
result = ""
for item in x:
    result += f"{item}; "
    
# Drop the trailing "; " left by the loop.
result[:-2]

'to finish; to stop; to end; to put an end to; to bring to a close'

In [16]:
# Show the raw result list of the lookup to inspect its structure.
r.data

[WordConfig(slug='仕舞う', is_common=True, tags=[], jlpt=['jlpt-n3', 'jlpt-n1'], japanese=[Japanese(word='仕舞う', reading='しまう'), Japanese(word='終う', reading='しまう'), Japanese(word='了う', reading='しまう'), Japanese(word='蔵う', reading='しまう')], senses=[Sense(english_definitions=['to finish', 'to stop', 'to end', 'to put an end to', 'to bring to a close'], parts_of_speech=["Godan verb with 'u' ending", 'Transitive verb'], links=[], tags=['Usually written using kana alone'], restrictions=[], see_also=[], antonyms=[], source=[], info=[]), Sense(english_definitions=['to close (a business, etc.)', 'to close down', 'to shut down', 'to shut up'], parts_of_speech=["Godan verb with 'u' ending", 'Transitive verb'], links=[], tags=['Usually written using kana alone'], restrictions=[], see_also=[], antonyms=[], source=[], info=[]), Sense(english_definitions=['to put away', 'to put back', 'to keep', 'to store'], parts_of_speech=["Godan verb with 'u' ending", 'Transitive verb'], links=[], tags=['Usually writte

In [17]:
# `r` is the response of the Word.request('仕舞う') call made in an earlier cell.
# Despite its plural name, `senses` holds only the first sense of the first result.
senses = r.data[0].senses[0]

# Print the parts of speech of that first sense, in the header format used by scrape_def

parts_of_speech = senses.parts_of_speech
if len(parts_of_speech) == 1:
    print(f"{parts_of_speech[0]}")
else:
    # With two or more entries the second one is printed first.
    print(f"{parts_of_speech[1]} - {parts_of_speech[0]}")

Transitive verb - Godan verb with 'u' ending
