In [97]:
import re
import MeCab
import gensim
import sqlite3
import Levenshtein
import numpy as np
from scipy.stats.mstats import gmean
import functools
from typing import Callable, Iterable, List, Set, Dict, Tuple, Optional

In [2]:
model = gensim.models.FastText.load_fasttext_format("./data/ja_model.bin")

  """Entry point for launching an IPython kernel.


In [3]:
conn = sqlite3.connect("./data/wnjpn.db")

In [151]:
yomi = MeCab.Tagger("-Oyomi")
re_all_katakana = re.compile(r'^[\u30A1-\u30F4]+$')
def get_yomi(text: str) -> str:
    # TODO return None if yumi parse failed
    reading = yomi.parse(text).replace("\n", '')
    matched = re_all_katakana.match(reading)
    if matched:
        return matched[0]
    return None

In [152]:
## returns a list of
## (wordid: int, lemma: str,score: float, occurrence: int)
def get_words_in_all_synsets(
    search: str or List[str],
    threshold: int = 1
) -> List[Tuple[int, str, float, int]]:
    placeholder = "?" if type(search) is str else ",".join([
        "?" for i in range(0, len(search))
    ])
    params = (search,threshold) if type (search) is str else [*search, threshold]
    # print(placeholder, params)
    cur = conn.execute(f"""\
select
 sub.wordid,
 sub.lemma,
 sum(1.0) as score,
 count(sub.lemma) as occurrence
from (
    select
        related_word.wordid,
        related_word.lemma
    from word base
    inner join sense attributed_sense
        on attributed_sense.wordid = base.wordid
    inner join sense all_sense
        on all_sense.synset = attributed_sense.synset
    inner join word related_word
        on related_word.wordid = all_sense.wordid
        and related_word.lang = "jpn"
    where base.lemma in ({placeholder})
) sub
group by sub.lemma, sub.wordid
having count(sub.lemma) >= (?)
order by count(sub.lemma) desc
""", params)
    return list(map(lambda x:(x[0:4]), cur.fetchall()))

# get_words_in_all_synsets("優しい", 1)
# get_words_in_all_synsets(["優しい", "好青年", "朗らか"], 2)

In [0]:
def create_values_query(
    neigbhor_list: Iterable[Tuple[str, float]]
) -> Tuple[str, Dict]:
    params = functools.reduce(
        lambda acc,e: { **acc, f"w{e[0]}": e[1][0], f"s{e[0]}": e[1][1] },
        enumerate(neigbhor_list),
        {}
    )
    query_elems = [f":w{i}, :s{i}" for i in range(0, len(neigbhor_list))]
    return (f"""select {" union select ".join(query_elems)} """, params)

## returns a list of
## (wordid: int, lemma: str,score: float, occurrence: int)
def get_synonym_score_for_neigbhor_list(
    neigbhor_list: Iterable[Tuple[str, float]],
    threshold: int = 1,
    includes_self: bool = False,
) -> List[Tuple[str, int, float, int]]:
    (values_query, params) = create_values_query(neigbhor_list)
    query = f"""\
with neigbhors(word, score) as (
{values_query}
),
sub(word, score, wordid, lemma) as (
select
  n.word,
  n.score,
  rel_w.wordid,
  rel_w.lemma
from neigbhors n
inner join word w
  on w.lemma = n.word
inner join sense s
  on s.wordid = w.wordid
inner join sense rel_s
  on rel_s.synset = s.synset
  {"" if includes_self else "and rel_s.wordid != s.wordid" }
inner join word rel_w
  on rel_w.wordid = rel_s.wordid
  and rel_w.lang = "jpn"
)
select
 sub.wordid,
 sub.lemma,
 sum(sub.score) as score,
 count(sub.lemma) as occurrence
from sub
group by sub.lemma, sub.wordid
having count(sub.lemma) >= :threshold
order by count(sub.lemma) desc\
"""
    cur = conn.execute(query, {**params, "threshold": threshold })
    return list(map(lambda x:(x[0:4]), cur.fetchall()))

# get_synonym_score_for_neigbhor_list([("優しい",  0.1), ("朗らか", 0.01)])

In [153]:
## returns a list of
## (wordid: int, lemma: str, score: float, occurrence: int)
def list_synonyms_from_similar_words(
    search: str or List[str],
    topn: int=20,
    threshold: int = 1,
    includes_self: bool = False,
) -> List[Tuple[str, int, float, int]]:
    neigbhor_list= model.wv.most_similar(
        positive=search,
        topn=topn,
    )
    # similar_words = [*map(lambda x:x[0],neigbhor_list)]
    synonyms = get_synonym_score_for_neigbhor_list(
        neigbhor_list=neigbhor_list,
        threshold=threshold,
        includes_self=includes_self,
    )
    return synonyms

# list_synonyms_from_similar_words(search=["優しい", "かわいい"])[0:30]

In [0]:
def sort_by_score(
    synonym_list: List[str],
    top_n: int = -1,
    order: str = "asc",
) -> List[Dict]:
    result_sorted = sorted(
        synonym_list,
        key=lambda x: x["s"],
        reverse=order=="desc",
    )
    if (top_n > 0 and type(top_n) == int):
        return result_sorted[0:top_n]
    return result_sorted

def get_mixed_synonyms(
    search: str or Iterable[str],
    search_threshold: int=1,
    use_similar: bool=True,
    similar_topn: int=20,
    similar_boost: float=0.2,
    similar_threshold: int=1,
    similar_includes_self: bool = False,
    add_yomi: bool = True,
) -> List[Dict]:
    synoyms_from_search = get_words_in_all_synsets(
        search=search,
        threshold=search_threshold,
    )
    synonyms_from_similar_words = list_synonyms_from_similar_words(
        search=search,
        topn=similar_topn,
        threshold=similar_threshold,
        includes_self=similar_includes_self,
    ) if use_similar else []
    aggregated_dict = functools.reduce(
        lambda acc, x: {
            **acc, x[0]: (
                {
                    "i": x[0], # wordid
                    "w": x[1], # word (lemma)
                    "s": x[2], # score
                    "occ": x[3], # occurrence in search iteself
                    "sim_occ": 0, # occurrence in similar words
                } if x[0] not in acc else {
                    **acc[x[0]],
                    "s": acc[x[0]]["s"] + x[2],
                    "occ": acc[x[0]]["occ"] + x[3],
                }
            )
        },
        synoyms_from_search,
        {}
    )
    if not use_similar:
        return aggregated_dict.values()
    return functools.reduce(
        lambda acc, x: {
            **acc, x[0]: (
                {
                    "i": x[0],
                    "w": x[1],
                    "s": similar_boost*x[2],
                    "occ": 0,
                    "sim_occ": x[3],
                } if x[0] not in acc else {
                    **acc[x[0]],
                    "s": acc[x[0]]["s"] + similar_boost*x[2],
                    "sim_occ": acc[x[0]]["sim_occ"] + x[3],
                }
            )
        },
        synonyms_from_similar_words,
        aggregated_dict,
    ).values()

# sort_by_score(get_mixed_synonyms("綺麗な", use_similar=True), 10)

In [154]:
def get_top_n_percentile(
    synonym_list: List[str],
    percentile: float = 0.95,
) -> List[Dict]:
    sorted_list = sort_by_score(synonym_list, order="desc")
    score_list = [*map(lambda x:x["s"], sorted_list)]
    total_score = sum(score_list)
    until = percentile * total_score
    curr = 0.0
    for i,s in enumerate(score_list):
        curr += s
        if (curr >= until):
            break
    return sorted_list[0:i]

# get_top_n_percentile(get_mixed_synonyms("綺麗な", use_similar=True), percentile=0.95)


In [155]:
def get_similar_yomi_table(
    yomi_list: List[str],
    prefix_weight: float = 0.05,
    threshold: float = 0.90
):
    n = len(yomi_list)
    parent = [-1] * n
    root_word = [None] * n
    sim_links = []
    for i, w_i in enumerate(yomi_list):
        sims = []
        for  j, w_j in enumerate(yomi_list[i+1:], i+1):
            if parent[j] >= 0:
                continue
            dist = Levenshtein.jaro_winkler(w_i, w_j, prefix_weight)
            if dist < threshold:
                continue
            parent[j] = i
            root_word[j] = yomi_list[i] if parent[i] < 0 else root_word[i]
            sims += [(j, w_j, dist)]
        if parent[i] < 0 or len(sims) > 0:
            sim_links += [(i, w_i, parent[i], sims)]
        else:
            sim_links += [None]
    # display(sim_links)
    # similar_yomi_dict = [(yomi_list[i], root_word[i]) for i in range (0, n)]
    translate_dict = dict([
        (yomi_list[i], root_word[i] if root_word[i] is not None else yomi_list[i])
        for i in range (0, n)
    ])
    yomi_table = functools.reduce(
        lambda acc, x: {
            **acc,
            x[1]: [x[0]] if x[1] not in acc else [*acc[x[1]], x[0]]
        }, translate_dict.items(), {})
    return yomi_table, sim_links

#get_similar_yomi_table(["キレイナ", "キレイダ", "キレイ", "キタナイ", "キレイダヨ", "キレイダヨネ", "レイダヨ", "ソケナイ", "キレタ", "ソッケナイ"])

In [164]:
def get_yomi_lookup(synonym_list: List[str]):
    with_yomi = []
    for x in synonym_list:
        yomi = get_yomi(x["w"])
        if yomi is None or len(yomi) < 1:
            continue
        with_yomi += [{**x, "y": yomi }]
    return functools.reduce(lambda acc, x: {
        **acc,
        x["y"]: (acc[x["y"]] if x["y"] in acc else []) + [x]
    }, with_yomi, {})

def get_representative_word(
    search: str or List[str],
    items: Iterable[Dict],
):
    if len(items) < 1:
        return (None, None)
    if len(items) == 1:
        return (items[0], [])
    max_score = max(map(lambda x:x["s"], items))
    ties = [*filter(lambda x:x["s"] >= max_score, items)]
    if len(ties) < 2:
        rep = ties[0]
        return (rep, [*filter(lambda x:x["i"] != rep["i"], items)])
    sim_score = [*map(lambda x: (x, np.mean(model.wv.similarity(search, x["w"]))), ties)]
    top_sim_score = max(map(lambda x:x[1], sim_score))
    score_sorted = sorted(sim_score, key=lambda x: x[1], reverse=True)
    rep = score_sorted[0][0]
    return (rep, [*filter(lambda x:x["i"] != rep["i"], items)])

def aggregate_score(items: Iterable[Dict]):
    scores = [*map(lambda x:x["s"], items)]
    return sum(scores)

def aggregate_by_yomi(
    search: str or List[str],
    synonym_list: List[str],
    top_n: int = 20,
    use_similar_yomi: bool = True,
    remove_self: bool = True,
    jaro_winkler_prefix_weight: float = 0.05,
    jaro_winkler_threshold: float = 0.95,
):
    yomi_lookup = get_yomi_lookup(synonym_list)
    unique_yomi_list = [yomi for yomi in yomi_lookup.keys() if yomi != '']
    if use_similar_yomi:
        yomi_table,_ = get_similar_yomi_table(
            unique_yomi_list,
            prefix_weight = jaro_winkler_prefix_weight,
            threshold = jaro_winkler_threshold,
        )
    else:
        yomi_table = dict([(yomi, yomi) for yomi in unique_yomi_list])
    TODO = None
    yomi_aggregated = []
    for yomi, similar_yomis in yomi_table.items():
        items = sum([yomi_lookup[sim_y] for sim_y in similar_yomis], [])
        (rep, others) = get_representative_word(search, items)
        agg_score = aggregate_score(items)
        yomi_aggregated += [{
            "i": rep["i"],
            "w": rep["w"],
            "y": rep["y"],
            "s": agg_score,
            "c": [*map(lambda x: ({"i": x["i"], "w": x["w"]}), others)]
        }]
    
    if remove_self and type(search) == str:
        self_yomi = get_yomi(search)
        yomi_aggregated = [*filter(lambda x: x["y"] != self_yomi, yomi_aggregated)]
    if remove_self and type(search) == list:
        self_yomis = [get_yomi(s) for s in search]
        yomi_aggregated = [*filter(lambda x: x["y"] not in self_yomis, yomi_aggregated)]

    agg_sorted = sorted(yomi_aggregated, key=lambda x:x["s"], reverse=True)
    if top_n <= 0:
        return agg_sorted
    return agg_sorted[0: top_n]

# tops = get_top_n_percentile(get_mixed_synonyms(["優しい"], use_similar=True), percentile=0.95)
# display(aggregate_by_yomi(["優しい"],tops))

In [176]:
def test_synonyms(search: str, top_n:int = 10):
    tops = get_top_n_percentile(get_mixed_synonyms(search, use_similar=True, similar_boost=0.1), percentile=0.95)
    display(aggregate_by_yomi(search,tops))
test_synonyms("賢い")

[{'i': 233947,
  'w': '聡い',
  'y': 'サトイ',
  's': 5.423588463664054,
  'c': [{'i': 200930, 'w': '敏い'}, {'i': 205736, 'w': 'さとい'}]},
 {'i': 220718, 'w': '利口', 'y': 'リコウ', 's': 5.368125593662262, 'c': []},
 {'i': 159313, 'w': '賢しい', 'y': 'サカシイ', 's': 4.513119414448738, 'c': []},
 {'i': 175664, 'w': '利発', 'y': 'リハツ', 's': 4.331093329191208, 'c': []},
 {'i': 194582, 'w': '賢明', 'y': 'ケンメイ', 's': 4.324983194470406, 'c': []},
 {'i': 155461, 'w': '利巧', 'y': 'リタクミ', 's': 3.5110827028751372, 'c': []},
 {'i': 180732, 'w': '明敏', 'y': 'メイビン', 's': 3.420624279975891, 'c': []},
 {'i': 228131, 'w': '怜悧', 'y': 'レイリ', 's': 3.373590224981308, 'c': []},
 {'i': 240131, 'w': '穎悟', 'y': 'エイゴ', 's': 3.373590224981308, 'c': []},
 {'i': 235106, 'w': '聡明', 'y': 'ソウメイ', 's': 3.371486783027649, 'c': []},
 {'i': 159024, 'w': '英明', 'y': 'エイメイ', 's': 3.2815588265657425, 'c': []},
 {'i': 171371, 'w': '利根', 'y': 'リコン', 's': 2.2790583789348604, 'c': []},
 {'i': 241484, 'w': '明達', 'y': 'メイタツ', 's': 2.2790583789348604, 'c'

In [0]:

display(get_words_in_all_synsets("明るい"))
synonyms = list_synoyms_from_wordnet_dict("やる気")
display(synonyms)
list_neigbhors_from_synonyms(synonyms, topn=20)

In [0]:
def list_neigbhors_from_synonyms(
    synonyms: Iterable[Tuple[str, int]],
    topn: int = 100
):
    multiplied_list = [[word for i in range(0,occurrence)] for (word, occurrence) in synonyms ]
    search_list = [word for word_pack in multiplied_list for word in word_pack]
    # display(search_list)
    return list_neigbhors_for_search_list(search_list, topn=topn)


In [0]:
def append_neigbhorness_lift_for_each_search(
    search_list: Iterable[str],
    word: str,
    neigbhorness: float
) -> List[Tuple[str, float, Tuple[float, float]]]:
    neigbohr_and_lift = []
    for search in search_list:
        similarity = model.wv.similarity(word, search)
        lift =  neigbhorness/similarity if similarity > 0 else 0
        neigbohr_and_lift += [(similarity, lift)]
    return (word, neigbhorness, neigbohr_and_lift)

In [0]:
def filter_neigbours_by_score_func(
    neigbhorness_lift: Iterable[Tuple[str, float, Tuple[float, float]]],
    score_func: Callable[float, Iterable[Tuple[float, float]]]):


SyntaxError: unexpected EOF while parsing (<ipython-input-14-bced9ec0ec98>, line 1)

In [0]:
searches = ["明るい", "性格"]
combi = [
   (word, score, [model.wv.similarity(word, search) for search in searches])
   for (word, score) in model.wv.most_similar(
       positive=searches,
       topn=100
    )
]
coocs = [*filter(None, [
    None if len([*filter(lambda x: x > score, search_scores)]) > 0 else
    (
        word,
        score,
        min([*map(lambda x:score/x, search_scores)]),
        search_scores,
    )
    for (word, score, search_scores) in combi]
)]
display("related")
display([*sorted(coocs, key=lambda x:-x[2])][0:50])