In [1]:
import gensim
import srsql
import pandas as pd
from tqdm.auto import tqdm
from statistics import harmonic_mean
from scipy import stats

In [2]:
import fasttext as ft

In [3]:
model = ft.load_model('models/fasttext_f10_10.bin')



In [62]:
# EV
# game & e-sports
# medical device
# edtech
# cloud-computing
# social-media
# internet-of-things
# fintech
# clean-water
pa_term = "social-media"

In [63]:
word = model.get_nearest_neighbors(pa_term, 500)

In [64]:
df_mo = pd.DataFrame(word, columns=['sim', 'word'])
df_mo

Unnamed: 0,sim,word
0,0.824547,political-media
1,0.797148,social-
2,0.789110,politico-media
3,0.779170,social-national
4,0.777701,social-based
...,...,...
495,0.599660,content-oriented
496,0.599635,political-party
497,0.599526,collectivism-individualism
498,0.599275,hypermedia


In [65]:
df_mo.to_csv("fasttext_result/" + pa_term + "_fasttext_score.csv", index=False)

In [66]:
w_list = list(df_mo["word"])

In [67]:
df_mo

Unnamed: 0,sim,word
0,0.824547,political-media
1,0.797148,social-
2,0.789110,politico-media
3,0.779170,social-national
4,0.777701,social-based
...,...,...
495,0.599660,content-oriented
496,0.599635,political-party
497,0.599526,collectivism-individualism
498,0.599275,hypermedia


# sql

## F10に登場する単語

In [10]:
connection = srsql.srsql('10years')
sql = "SELECT * FROM word_list"
with connection:
    with connection.cursor() as cur:
        cur.execute(sql, ())
        c_result = cur.fetchall()
df = pd.DataFrame(c_result)

## 単語-回数-企業

In [11]:
connection = srsql.srsql('10years')
sql = "SELECT * FROM count"
with connection:
    with connection.cursor() as cur:
        cur.execute(sql, ())
        c_result = cur.fetchall()
df_c = pd.DataFrame(c_result)

## 企業-総単語数

In [11]:
# connection = srsql.srsql('10years')
# sql = "SELECT * FROM company"
# with connection:
#     with connection.cursor() as cur:
#         cur.execute(sql, ())
#         c_result = cur.fetchall()
# df_sum = pd.DataFrame(c_result)

In [12]:
connection = srsql.srsql('10years')
sql = "SELECT * FROM count inner join company on count.company = company.company"
with connection:
    with connection.cursor() as cur:
        cur.execute(sql, ())
        c_result = cur.fetchall()
df_s = pd.DataFrame(c_result)

In [13]:
del c_result

# 指定単語で検索

## 単語を指定してください

In [34]:
# EV
# game & e-sports
# medical device
# edtech
# cloud-computing
# social-media
# internet-of-things
# fintech
# pa_term = "internet-of-things"

## 実行

word = model.wv.most_similar(pa_term, topn=2000)

num = 0
w_list = []
for i in tqdm(range(len(word))):
    a = word[i]
    b = a[0] in df.values
    if b == True:
        num += 1
        w_list.append(a)
        if num >= 500:
            break

In [68]:
# 親単語が出てくる企業
i = pa_term in df_s.values
if i == True:
    parents = df_s[df_s["word"] == pa_term]
parents.set_axis(list(parents["company"]), inplace=True)

In [69]:
parents

Unnamed: 0,word,count,company,id,company.company,sum
ANSS.OQ,social-media,1,ANSS.OQ,45,ANSS.OQ,69481
C.N,social-media,8,C.N,3686,C.N,8607659
ELA.A,social-media,3,ELA.A,2512,ELA.A,45715
GHC.N,social-media,8,GHC.N,1014,GHC.N,75756
WISH.OQ,social-media,2,WISH.OQ,4031,WISH.OQ,10908


In [70]:
k1 = 2
b1 = 0.75
CS = pd.DataFrame()
d1 = []
for i in tqdm(w_list): # iが対象としている子単語
    sa = df_mo.set_axis(df_mo["word"]).loc[i, 'sim']
    kids = df_s[df_s["word"] == i] # 子単語が出てくる企業
    kids.set_axis(list(kids["company"]), inplace=True)
    if len(list(kids["company"])) == 0:
        d1.append(i)
        continue
    else:
        sb = len(set(list(parents["company"])) & set(list(kids["company"]))) / len(list(kids["company"])) # sb
        fs = (2 * sa * sb) / (sa + sb) # fs
        tf = kids["count"] / kids["sum"]
        avg_dl = kids["sum"].sum() / len(list(kids["sum"]))
        cs = fs * tf * (k1 + 1) / (tf + k1 * (1 - b1 + b1 * tf / avg_dl)) # 
        CS = pd.concat([CS, cs], axis=1)
        CS.rename(columns = {0 : i}, inplace = True)

  0%|          | 0/500 [00:00<?, ?it/s]

In [71]:
sa

0.5992711782455444

## 対象単語保存

In [31]:
# w2v_score = pd.DataFrame(w_list, columns=['word', 'w2v'])

In [32]:
# w2v_score.to_csv("result/" + pa_term + "_w2v.csv", index=False)

## 保存

In [72]:
# os.chdir("C:\\Users\\SR\\python\\m1\\similarity\\")
CS.to_csv("fasttext_result/" + pa_term + "_fasttext.csv")

## 企業ランク付け

In [73]:
CS = CS.sum(axis=1).sort_values(ascending=False)

In [74]:
# 保存
CS.to_csv("fasttext_result/" + pa_term + "_fasttext_sum.csv")

# andとor検索(共通部分と全て)

In [199]:
# cloud computing
# social media
# internet things
# finance technology
# clean water
# sewer water
# iot cloud
p1 = "sewer"
p2 = "water"

In [200]:
word = model.get_nearest_neighbors(p1+p2, k=500)

In [201]:
word

[(0.8667479157447815, 'waterwater'),
 (0.8447642922401428, 'sewer'),
 (0.8342781066894531, 'water-sewer'),
 (0.8093318939208984, 'semerwater'),
 (0.8012778759002686, 'sewering'),
 (0.7978928685188293, 'sewers'),
 (0.7965499758720398, 'sewerside'),
 (0.7952089309692383, 'riverwater'),
 (0.7904564142227173, 'tywater'),
 (0.7871227860450745, 'elterwater'),
 (0.7827097177505493, 'overwater'),
 (0.7817373871803284, 'tierwater'),
 (0.7735006213188171, '-water'),
 (0.7732053995132446, 'sewer-pipes'),
 (0.7684579491615295, 'o-water'),
 (0.7680607438087463, 'sheerwater'),
 (0.7664675116539001, 'sewerman'),
 (0.7616521716117859, 'biwater'),
 (0.7582670450210571, 'sewerage'),
 (0.7574043869972229, 'water'),
 (0.7549451589584351, 'storm-water'),
 (0.7539404034614563, 'murkywater'),
 (0.7527543306350708, 'water-water'),
 (0.7509233355522156, 'tap-water'),
 (0.7473088502883911, 'underwater'),
 (0.746224045753479, 'stormwater'),
 (0.7453334331512451, 'zonderwater'),
 (0.7453058958053589, 'sewerby'),


In [202]:
df_mo = pd.DataFrame(word, columns=['sim', 'word'])
df_mo

Unnamed: 0,sim,word
0,0.866748,waterwater
1,0.844764,sewer
2,0.834278,water-sewer
3,0.809332,semerwater
4,0.801278,sewering
...,...,...
495,0.616630,water-conservation
496,0.616330,watering-hole
497,0.616142,water-operated
498,0.615764,drainpipe


In [203]:
df_mo.to_csv("fasttext_result/" + p1 + '_' + p2 + "_score.csv", index=False)

In [204]:
df_mo = df_mo.head(500)

In [205]:
w_list = list(df_mo["word"])

In [206]:
# 親単語1が出てくる企業
i = p1 in df_s.values
if i == True:
    parents1 = df_s[df_s["word"] == p1]
parents1.set_axis(list(parents1["company"]), inplace=True)

In [207]:
# 親単語2が出てくる企業
i = p2 in df_s.values
if i == True:
    parents2 = df_s[df_s["word"] == p2]
parents2.set_axis(list(parents2["company"]), inplace=True)

In [208]:
parents = set(list(set(parents1["company"]) & set(parents2["company"])))

In [209]:
k1 = 2
b1 = 0.75
CS = pd.DataFrame()
d1 = []
for i in tqdm(w_list): # 対象としている子単語
    sa = df_mo.set_axis(df_mo["word"]).loc[i, 'sim'] # 子単語のw2vスコア
    kids = df_s[df_s["word"] == i] # 子単語が出てくる企業
    kids.set_axis(list(kids["company"]), inplace=True)
    j = len(list(kids["company"]))
    if j == 0:
        d1.append(i)
        continue
    else:
        sb = len(parents & set(list(kids["company"]))) / j # sb
        fs = (2 * sa * sb) / (sa + sb) # fs
        tf = kids["count"] / kids["sum"]
        avg_dl = kids["sum"].sum() / len(list(kids["sum"]))
        cs = fs * tf * (k1 + 1) / (tf + k1 * (1 - b1 + b1 * tf / avg_dl)) # 
        CS = pd.concat([CS, cs], axis=1)
        CS.rename(columns = {0 : i}, inplace = True)

  0%|          | 0/500 [00:00<?, ?it/s]

## 保存

In [210]:
CS.to_csv("fasttext_result/" + p1 + '_and_' + p2 + ".csv")

In [211]:
CS = CS.sum(axis=1).sort_values(ascending=False)
CS.to_csv("fasttext_result/" + p1 + '_and_' + p2 + "_sum.csv")

# or検索(すべて)

In [212]:
parents = set(list(set(parents1["company"]) | set(parents2["company"])))

In [213]:
k1 = 2
b1 = 0.75
CS = pd.DataFrame()
d1 = []
for i in tqdm(w_list): # 対象としている子単語
    sa = df_mo.set_axis(df_mo["word"]).loc[i, 'sim'] # 子単語のw2vスコア
    kids = df_s[df_s["word"] == i] # 子単語が出てくる企業
    kids.set_axis(list(kids["company"]), inplace=True)
    j = len(list(kids["company"]))
    if j == 0:
        d1.append(i)
        continue
    else:
        sb = len(parents & set(list(kids["company"]))) / j # sb
        fs = (2 * sa * sb) / (sa + sb) # fs
        tf = kids["count"] / kids["sum"]
        avg_dl = kids["sum"].sum() / len(list(kids["sum"]))
        cs = fs * tf * (k1 + 1) / (tf + k1 * (1 - b1 + b1 * tf / avg_dl)) # 
        CS = pd.concat([CS, cs], axis=1)
        CS.rename(columns = {0 : i}, inplace = True)

  0%|          | 0/500 [00:00<?, ?it/s]

In [214]:
CS.to_csv("fasttext_result/" + p1 + '_or_' + p2 + ".csv")

In [215]:
CS = CS.sum(axis=1).sort_values(ascending=False)
CS.to_csv("fasttext_result/" + p1 + '_or_' + p2 + "_sum.csv")