In [126]:
import gensim
import sqlite3
import Levenshtein
from scipy.stats.mstats import gmean
import functools
from typing import Callable, Iterable, List, Set, Dict, Tuple, Optional

In [26]:
model = gensim.models.FastText.load_fasttext_format("./data/ja_model.bin")

  """Entry point for launching an IPython kernel.


In [27]:
conn = sqlite3.connect("./data/wnjpn.db")

In [62]:
def list_neigbhors_for_search(
  search: str,
  topn: int=100,
  min_levenstein_dist: int = 2
) -> List[Tuple[str, float]]:
  return [
    (word, neigbhorness)
    for (word, neigbhorness) in model.wv.most_similar(
        positive=search,
        topn=topn
      )
    if Levenshtein.distance(word, search) >= min_levenstein_dist
  ]

# list_neigbhors_for_search("優しい", 20)

In [None]:
def list_neigbhors_for_search_list(
  search_list: Iterable[str],
  topn: int=100
) -> List[Tuple[str, float]]:
  return [
    (word, neigbhorness) 
    for (word, neigbhorness) in model.wv.most_similar(
        positive=search_list,
        topn=topn
      )
  ]

In [121]:
# TODO:　where in でまとめない
def get_words_in_all_synsets(
    search: str or Iterable[str], threshold: int = 1
) -> List[Tuple[str, int]]:
    placeholder = "?" if type(search) is str else ",".join([
        "?" for i in range(0, len(search))
    ])
    params = (search,threshold) if type (search) is str else [*search, threshold]
    # print(placeholder, params)
    cur = conn.execute(f"""\
select
 sub.lemma,
 count(sub.lemma) as c
from (
    select
        related_word.wordid,
        related_word.lemma
    from word base
    inner join sense attributed_sense
        on attributed_sense.wordid = base.wordid
    inner join sense all_sense
        on all_sense.synset = attributed_sense.synset
    inner join word related_word
        on related_word.wordid = all_sense.wordid
        and related_word.lang = "jpn"
    where base.lemma in ({placeholder})
) sub
group by sub.lemma
having count(sub.lemma) >= (?)
order by count(sub.lemma) desc
""", params)
    return list(map(lambda x:(x[0], x[1]), cur.fetchall()))

# get_words_in_all_synsets("優しい", 1)
# get_words_in_all_synsets(["優しい", "好青年", "朗らか"], 2)

In [122]:
def list_synonyms_from_similar_words(
    search: str,
    topn: int=20,
    min_levenstein_dist: int = 2,
    threshold: int = 2,
) -> List[Tuple[str, int]]:  
    neigbhor_list = list_neigbhors_for_search(
        search,
        topn=topn,
        min_levenstein_dist=min_levenstein_dist
    )
    similar_words = [*map(lambda x:x[0],neigbhor_list)]
    synonyms = get_words_in_all_synsets(
        search=similar_words,
        threshold=threshold,
    )
    return synonyms

# list_synonyms_from_similar_words("穏やか")

In [166]:
def get_mixed_synonyms(
    search: str,
    search_threshold: int=1,
    similar_topn: int=20,
    similar_min_levenstein_dist: int=2,
    similar_boost: float=0.2,
    similar_threshold: int=1,
) -> List[Tuple[str, float]]:
    synoyms_from_search = get_words_in_all_synsets(
        search=search,
        threshold=search_threshold,
    )
    synonyms_from_similar_words = list_synonyms_from_similar_words(
        search=search,
        topn=similar_topn,
        min_levenstein_dist=similar_min_levenstein_dist,
        threshold=similar_threshold,
    )
    modified_score_from_similar_words = [
        (word, similar_boost*occurrence) for (word, occurrence) in synonyms_from_similar_words
    ]
    score_dict = functools.reduce(
        lambda acc, x: {**acc, x[0]: (x[1] if x[0] not in acc else acc[x[0]]+ x[1]) },
        synoyms_from_search + modified_score_from_similar_words,
        {}
    )
    return sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
    #synonyms_from_similar_words + modified_score_from_similar_words
    #all_words=[*map(lambda x:x[0], synoyms_from_search)]+[*map(lambda x:x[0], synoyms_from_search)]
    #scored = dict(all

get_mixed_synonyms("素直")

[('素直', 6.2),
 ('真面目', 3.4000000000000004),
 ('まじめ', 3.0),
 ('正直', 2.8),
 ('フランク', 2.2),
 ('率直', 2.2),
 ('優しい', 2.2),
 ('従順', 2),
 ('真正直', 2.0),
 ('不器用', 1.8),
 ('無邪気', 1.8),
 ('真すぐ', 1.6),
 ('真っすぐ', 1.6),
 ('真っ直ぐ', 1.6),
 ('真直ぐ', 1.6),
 ('天真爛漫', 1.6),
 ('まとも', 1.4),
 ('公明正大', 1.4),
 ('直', 1.4),
 ('真率', 1.4),
 ('真直', 1.4),
 ('素朴', 1.4),
 ('無器用', 1.2000000000000002),
 ('ざっくばらん', 1.2),
 ('まっ正直', 1.2),
 ('ストレート', 1.2),
 ('卒直', 1.2),
 ('単刀直入', 1.2),
 ('堂々', 1.2),
 ('堂堂', 1.2),
 ('明るい', 1.2),
 ('直ぐ', 1.2),
 ('直截', 1.2),
 ('真っ正直', 1.2),
 ('真面', 1.2),
 ('簡明直截', 1.2),
 ('質実', 1.2),
 ('プレイン', 1),
 ('プレーン', 1),
 ('地味', 1),
 ('簡素', 1),
 ('粗朴', 1),
 ('粗樸', 1),
 ('質朴', 1),
 ('質樸', 1),
 ('質素', 1),
 ('飾り気のない', 1),
 ('飾り気の無い', 1),
 ('暖か', 1.0),
 ('本気', 1.0),
 ('気持ち', 1.0),
 ('温か', 1.0),
 ('温かい', 1.0),
 ('イノセント', 0.8),
 ('優しく', 0.8),
 ('天衣無縫', 0.8),
 ('意地っ張り', 0.8),
 ('意地張り', 0.8),
 ('懇ろ', 0.8),
 ('暖かい', 0.8),
 ('生真面目', 0.8),
 ('真剣', 0.8),
 ('親切', 0.8),
 ('誠実', 0.8),
 ('きかぬ気', 0.6000000000000001),
 ('ね

In [109]:

display(get_words_in_all_synsets("明るい"))
synonyms = list_synoyms_from_wordnet_dict("やる気")
display(synonyms)
list_neigbhors_from_synonyms(synonyms, topn=20)

[('明るい', 15),
 ('明い', 5),
 ('明朗', 2),
 ('朗ら', 2),
 ('朗らか', 2),
 ('陽気', 2),
 ('うれしい', 1),
 ('ご機嫌', 1),
 ('ざっくばらん', 1),
 ('ぴかぴか', 1),
 ('まじめ', 1),
 ('まっ正直', 1),
 ('まとも', 1),
 ('ストレート', 1),
 ('ハッピー', 1),
 ('フランク', 1),
 ('仕合せ', 1),
 ('仕合わせ', 1),
 ('元気', 1),
 ('公明正大', 1),
 ('卒直', 1),
 ('単刀直入', 1),
 ('喜ばしい', 1),
 ('堂々', 1),
 ('堂堂', 1),
 ('多望', 1),
 ('大喜び', 1),
 ('奕々たる', 1),
 ('奕奕たる', 1),
 ('嬉々たる', 1),
 ('嬉しい', 1),
 ('希望的', 1),
 ('幸せ', 1),
 ('幸福', 1),
 ('御機嫌', 1),
 ('心うれしい', 1),
 ('心嬉しい', 1),
 ('快活', 1),
 ('悦ばしい', 1),
 ('愉しげ', 1),
 ('明か', 1),
 ('明らか', 1),
 ('晴々しい', 1),
 ('晴れやか', 1),
 ('晴れ晴れしい', 1),
 ('晴晴しい', 1),
 ('有望', 1),
 ('有為', 1),
 ('末たのもしい', 1),
 ('末頼もしい', 1),
 ('杲々たる', 1),
 ('杲杲たる', 1),
 ('楽しい', 1),
 ('楽しげ', 1),
 ('正直', 1),
 ('洋々たる', 1),
 ('洋洋たる', 1),
 ('淡々しい', 1),
 ('淡い', 1),
 ('淡淡しい', 1),
 ('炳たる', 1),
 ('煌々たる', 1),
 ('煌煌たる', 1),
 ('率直', 1),
 ('白い', 1),
 ('皎々たる', 1),
 ('皎たる', 1),
 ('皎然たる', 1),
 ('皎皎たる', 1),
 ('皓々たる', 1),
 ('皓皓たる', 1),
 ('直', 1),
 ('直ぐ', 1),
 ('直截', 1),
 ('真すぐ', 1),
 (

[('元気', 12),
 ('真面目', 10),
 ('まじめ', 8),
 ('熱心', 6),
 ('根性', 5),
 ('熱意', 5),
 ('いつも', 4),
 ('何時も', 4),
 ('情熱', 4),
 ('意気込み', 4),
 ('本気', 4),
 ('正直', 4),
 ('熱情', 4),
 ('真剣', 4),
 ('真正直', 4),
 ('誠実', 4),
 ('いつでも', 3),
 ('しょっちゅう', 3),
 ('たえず', 3),
 ('つねに', 3),
 ('何時でも', 3),
 ('始終', 3),
 ('常に', 3),
 ('常住坐臥', 3),
 ('常常', 3),
 ('意気', 3),
 ('気力', 3),
 ('気勢', 3),
 ('活力', 3),
 ('活発', 3),
 ('盛ん', 3),
 ('真すぐ', 3),
 ('真っすぐ', 3),
 ('真っ直ぐ', 3),
 ('真直ぐ', 3),
 ('精力的', 3),
 ('絶えず', 3),
 ('血気', 3),
 ('いき込み', 2),
 ('きまって', 2),
 ('きもっ玉', 2),
 ('つねづね', 2),
 ('まとも', 2),
 ('シリアス', 2),
 ('バイタリティ', 2),
 ('バイタリティー', 2),
 ('ヴァイタリティ', 2),
 ('ヴァイタリティー', 2),
 ('人となり', 2),
 ('公明正大', 2),
 ('四六時中', 2),
 ('実体', 2),
 ('実法', 2),
 ('実直', 2),
 ('常々', 2),
 ('年がら年じゅう', 2),
 ('年がら年中', 2),
 ('年がら年百', 2),
 ('年中', 2),
 ('年百年中', 2),
 ('度胸', 2),
 ('性', 2),
 ('性合', 2),
 ('性合い', 2),
 ('性情', 2),
 ('性根', 2),
 ('性格', 2),
 ('性質', 2),
 ('意気ごみ', 2),
 ('意気組み', 2),
 ('日夕', 2),
 ('明け暮れ', 2),
 ('明るい', 2),
 ('明暮', 2),
 ('根', 2),
 ('極って', 2),
 (

[('好青年', 0.684782862663269),
 ('負けず嫌い', 0.6846553683280945),
 ('目立ちたがり', 0.6845878958702087),
 ('天真爛漫', 0.6835500597953796),
 ('朗らか', 0.6829498410224915),
 ('健気', 0.6821495294570923),
 ('意地っ張り', 0.6791480779647827),
 ('負けん気', 0.6788445711135864),
 ('エネルギッシュ', 0.6749609708786011),
 ('子煩悩', 0.671172022819519),
 ('ひたむき', 0.6680941581726074),
 ('ぶっきらぼう', 0.6680686473846436),
 ('能天気', 0.6669535040855408),
 ('物静か', 0.6657835245132446),
 ('子供らしい', 0.6629055738449097),
 ('努力家', 0.6615105271339417),
 ('優しく', 0.6596875190734863),
 ('勝気', 0.6586257219314575),
 ('快活', 0.657821536064148),
 ('一生懸命', 0.6536130905151367)]

In [None]:
def list_neigbhors_from_synonyms(
    synonyms: Iterable[Tuple[str, int]],
    topn: int = 100
):
    multiplied_list = [[word for i in range(0,occurrence)] for (word, occurrence) in synonyms ]
    search_list = [word for word_pack in multiplied_list for word in word_pack]
    # display(search_list)
    return list_neigbhors_for_search_list(search_list, topn=topn)


In [22]:
def append_neigbhorness_lift_for_each_search(
    search_list: Iterable[str],
    word: str,
    neigbhorness: float
) -> List[Tuple[str, float, Tuple[float, float]]]:
    neigbohr_and_lift = []
    for search in search_list:
        similarity = model.wv.similarity(word, search)
        lift =  neigbhorness/similarity if similarity > 0 else 0
        neigbohr_and_lift += [(similarity, lift)]
    return (word, neigbhorness, neigbohr_and_lift)

In [23]:
def filter_neigbours_by_score_func(
    neigbhorness_lift: Iterable[Tuple[str, float, Tuple[float, float]]],
    score_func: Callable[float, Iterable[Tuple[float, float]]]):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-23-71ec42b0a80e>, line 3)

SyntaxError: unexpected EOF while parsing (<ipython-input-14-bced9ec0ec98>, line 1)

In [0]:
searches = ["明るい", "性格"]
combi = [
   (word, score, [model.wv.similarity(word, search) for search in searches])
   for (word, score) in model.wv.most_similar(
       positive=searches,
       topn=100
    )
]
coocs = [*filter(None, [
    None if len([*filter(lambda x: x > score, search_scores)]) > 0 else
    (
        word,
        score,
        min([*map(lambda x:score/x, search_scores)]),
        search_scores,
    )
    for (word, score, search_scores) in combi]
)]
display("related")
display([*sorted(coocs, key=lambda x:-x[2])][0:50])