In [12]:
import MeCab
import gensim
import sqlite3
import Levenshtein
from scipy.stats.mstats import gmean
import functools
from typing import Callable, Iterable, List, Set, Dict, Tuple, Optional

In [13]:
model = gensim.models.FastText.load_fasttext_format("./data/ja_model.bin")

  """Entry point for launching an IPython kernel.


In [14]:
conn = sqlite3.connect("./data/wnjpn.db")

In [15]:
yomi = MeCab.Tagger("-Oyomi")
re_katakana = re.compile(r'[\u30A1-\u30F4]+')
def get_yomi(text: str) -> str:
    reading = yomi.parse(text)
    kana_readings = re_katakana.findall(reading)
    return "".join(kana_readings)

In [41]:
def get_words_in_all_synsets(
    search: str or Iterable[str], threshold: int = 1
) -> List[Tuple[str, int, float, int]]:
    placeholder = "?" if type(search) is str else ",".join([
        "?" for i in range(0, len(search))
    ])
    params = (search,threshold) if type (search) is str else [*search, threshold]
    # print(placeholder, params)
    cur = conn.execute(f"""\
select
 sub.lemma,
 sub.wordid,
 sum(1.0) as score,
 count(sub.lemma) as occurrence
from (
    select
        related_word.wordid,
        related_word.lemma
    from word base
    inner join sense attributed_sense
        on attributed_sense.wordid = base.wordid
    inner join sense all_sense
        on all_sense.synset = attributed_sense.synset
    inner join word related_word
        on related_word.wordid = all_sense.wordid
        and related_word.lang = "jpn"
    where base.lemma in ({placeholder})
) sub
group by sub.lemma, sub.wordid
having count(sub.lemma) >= (?)
order by count(sub.lemma) desc
""", params)
    return list(map(lambda x:(x[0:4]), cur.fetchall()))

# get_words_in_all_synsets("優しい", 1)
# get_words_in_all_synsets(["優しい", "好青年", "朗らか"], 2)

[('優しい', 202105, 10.0, 10),
 ('暖か', 224808, 4.0, 4),
 ('温か', 180094, 4.0, 4),
 ('温かい', 230123, 4.0, 4),
 ('暖かい', 199016, 3.0, 3),
 ('親切', 181880, 3.0, 3),
 ('ソフト', 236122, 2.0, 2),
 ('懇ろ', 180938, 2.0, 2),
 ('懇篤', 179535, 2.0, 2),
 ('柔か', 159857, 2.0, 2),
 ('柔かい', 194787, 2.0, 2),
 ('柔らか', 215954, 2.0, 2),
 ('温和', 223289, 2.0, 2),
 ('篤い', 236818, 2.0, 2),
 ('親身', 193611, 2.0, 2),
 ('軟か', 214659, 2.0, 2),
 ('軟かい', 230984, 2.0, 2),
 ('軟らか', 237902, 2.0, 2),
 ('軟らかい', 195414, 2.0, 2),
 ('静やか', 190765, 2.0, 2),
 ('しなやか', 188485, 1.0, 1),
 ('たわやか', 221855, 1.0, 1),
 ('ねんごろ', 206005, 1.0, 1),
 ('もの柔か', 205072, 1.0, 1),
 ('もの柔らか', 206297, 1.0, 1),
 ('もの静か', 221200, 1.0, 1),
 ('やさしい', 238402, 1.0, 1),
 ('エレガンス', 200537, 1.0, 1),
 ('エレガント', 230934, 1.0, 1),
 ('ソフィスティケート', 184068, 1.0, 1),
 ('マイルド', 248428, 1.0, 1),
 ('上品', 235597, 1.0, 1),
 ('世話好き', 176471, 1.0, 1),
 ('人なつこい', 164339, 1.0, 1),
 ('人懐こい', 165333, 1.0, 1),
 ('人懐っこい', 157736, 1.0, 1),
 ('優婉', 183072, 1.0, 1),
 ('優美', 213541, 1.0, 1

In [36]:
def create_values_query(neigbhor_list: Iterable[Tuple[str, float]]):
    params = functools.reduce(
        lambda acc,e: { **acc, f"w{e[0]}": e[1][0], f"s{e[0]}": e[1][1] },
        enumerate(neigbhor_list),
        {}
    )
    query_elems = [f":w{i}, :s{i}" for i in range(0, len(neigbhor_list))]
    return (f"""select {" union select ".join(query_elems)} """, params)

def get_synonym_score_for_neigbhor_list(
    neigbhor_list: Iterable[Tuple[str, float]],
    threshold: int = 1,
    includes_self: bool = True,
) -> List[Tuple[str, int, float, int]]:
    (values_query, params) = create_values_query(neigbhor_list)
    query = f"""\
with neigbhors(word, score) as (
{values_query}
),
sub(word, score, wordid, lemma) as (
select
  n.word,
  n.score,
  rel_w.wordid,
  rel_w.lemma
from neigbhors n
inner join word w
  on w.lemma = n.word
inner join sense s
  on s.wordid = w.wordid
inner join sense rel_s
  on rel_s.synset = s.synset
  {"" if includes_self else "and rel_s.wordid != s.wordid" }
inner join word rel_w
  on rel_w.wordid = rel_s.wordid
  and rel_w.lang = "jpn"
)
select
 sub.lemma,
 sub.wordid,
 sum(sub.score) as score,
 count(sub.lemma) as occurrence
from sub
group by sub.lemma, sub.wordid
having count(sub.lemma) >= :threshold
order by count(sub.lemma) desc\
"""
    cur = conn.execute(query, {**params, "threshold": threshold })
    return list(map(lambda x:(x[0:4]), cur.fetchall()))

# get_synonym_score_for_neigbhor_list([("優しい",  0.1), ("朗らか", 0.01)])

In [38]:
def list_synonyms_from_similar_words(
    search: str or Iterable[str],
    topn: int=20,
    threshold: int = 1,
    includes_self: bool = False,
) -> List[Tuple[str, float, int]]:
    neigbhor_list= model.wv.most_similar(
        positive=search,
        topn=topn,
    )
    # display(neigbhor_list)
    # similar_words = [*map(lambda x:x[0],neigbhor_list)]
    synonyms = get_synonym_score_for_neigbhor_list(
        neigbhor_list=neigbhor_list,
        threshold=threshold,
        includes_self=includes_self,
    )
    return synonyms

# list_synonyms_from_similar_words(search="朗らか")[0:30]

In [33]:
def get_mixed_synonyms(
    search: str or Iterable[str],
    search_threshold: int=1,
    similar_topn: int=20,
    similar_boost: float=0.2,
    similar_threshold: int=1,
    similar_includes_self: bool = True,
    result_topn: int=50,
) -> List[Tuple[str, float]]:
    synoyms_from_search = get_words_in_all_synsets(
        search=search,
        threshold=search_threshold,
    )
    synonyms_from_similar_words = list_synonyms_from_similar_words(
        search=search,
        topn=similar_topn,
        threshold=similar_threshold,
        includes_self=similar_includes_self,
    )
    modified_score_from_similar_words = [
        (word, wordid, similar_boost*score)
        for (word, wordid, score, _) in synonyms_from_similar_words
    ]
    score_dict = functools.reduce(
        lambda acc, x: {**acc, x[0]: (x[1] if x[0] not in acc else acc[x[0]]+ x[1]) },
        synoyms_from_search + modified_score_from_similar_words,
        {}
    )
    result_sorted = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
    return result_sorted[0:result_topn]
    
# TODO: 同じ読み or ほぼ同じ読み　の単語を集約する
get_mixed_synonyms("キュート", result_topn=10)

[('かわいい', 1.1521143794059754),
 ('可愛い', 1.0482676148414611),
 ('おしゃれ', 0.9552717208862305),
 ('愛らしい', 0.9444208502769471),
 ('シック', 0.7463299155235291),
 ('スタイリッシュ', 0.7463299155235291),
 ('小意気', 0.7463299155235291),
 ('小粋', 0.7463299155235291),
 ('瀟洒', 0.7463299155235291),
 ('御洒落', 0.7451115012168885)]

In [40]:
def get_mixed_synonyms(
    search: str or Iterable[str],
    search_threshold: int=1,
    similar_topn: int=20,
    similar_boost: float=0.2,
    similar_threshold: int=1,
    similar_includes_self: bool = True,
) -> List[Tuple[str, float]]:
    synoyms_from_search = get_words_in_all_synsets(
        search=search,
        threshold=search_threshold,
    )
    synonyms_from_similar_words = list_synonyms_from_similar_words(
        search=search,
        topn=similar_topn,
        threshold=similar_threshold,
        includes_self=similar_includes_self,
    )
    modified_score_from_similar_words = [
        (word, similar_boost*score) for (word, score, _) in synonyms_from_similar_words
    ]
    score_dict = functools.reduce(
        lambda acc, x: {**acc, x[0]: (x[1] if x[0] not in acc else acc[x[0]]+ x[1]) },
        synoyms_from_search + modified_score_from_similar_words,
        {}
    )
    return sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
    
# TODO: 同じ読み or ほぼ同じ読み　の単語を集約する
get_mixed_synonyms("人柄")

0.64528024

In [0]:

display(get_words_in_all_synsets("明るい"))
synonyms = list_synoyms_from_wordnet_dict("やる気")
display(synonyms)
list_neigbhors_from_synonyms(synonyms, topn=20)

In [0]:
def list_neigbhors_from_synonyms(
    synonyms: Iterable[Tuple[str, int]],
    topn: int = 100
):
    multiplied_list = [[word for i in range(0,occurrence)] for (word, occurrence) in synonyms ]
    search_list = [word for word_pack in multiplied_list for word in word_pack]
    # display(search_list)
    return list_neigbhors_for_search_list(search_list, topn=topn)


In [0]:
def append_neigbhorness_lift_for_each_search(
    search_list: Iterable[str],
    word: str,
    neigbhorness: float
) -> List[Tuple[str, float, Tuple[float, float]]]:
    neigbohr_and_lift = []
    for search in search_list:
        similarity = model.wv.similarity(word, search)
        lift =  neigbhorness/similarity if similarity > 0 else 0
        neigbohr_and_lift += [(similarity, lift)]
    return (word, neigbhorness, neigbohr_and_lift)

In [0]:
def filter_neigbours_by_score_func(
    neigbhorness_lift: Iterable[Tuple[str, float, Tuple[float, float]]],
    score_func: Callable[float, Iterable[Tuple[float, float]]]):


SyntaxError: unexpected EOF while parsing (<ipython-input-14-bced9ec0ec98>, line 1)

In [0]:
searches = ["明るい", "性格"]
combi = [
   (word, score, [model.wv.similarity(word, search) for search in searches])
   for (word, score) in model.wv.most_similar(
       positive=searches,
       topn=100
    )
]
coocs = [*filter(None, [
    None if len([*filter(lambda x: x > score, search_scores)]) > 0 else
    (
        word,
        score,
        min([*map(lambda x:score/x, search_scores)]),
        search_scores,
    )
    for (word, score, search_scores) in combi]
)]
display("related")
display([*sorted(coocs, key=lambda x:-x[2])][0:50])