In [8]:
#　モデルのロード

from gensim.models.keyedvectors import KeyedVectors

# 単語ベクトルモデルの読み込み
model_path = './GoogleNews-vectors-negative300.bin.gz'  # モデルのパス
model = KeyedVectors.load_word2vec_format(model_path, binary=True)  # binary=Trueはバイナリ形式のモデルの場合

In [83]:
import polars as pl

FILE_PATH = "./wordsim353/combined.csv"
df = pl.read_csv(FILE_PATH, encoding="utf-8")
df = df.with_columns((df['Human (mean)'].rank()+1).alias('Rank_Human'))

# word1と2の類似度を計算して、新しい列として追加
df1 = df["Word 1"]
df2 = df["Word 2"]
similarity = []
for w1,w2 in zip(df1,df2):
    sim_score = model.similarity(w1, w2)
    similarity.append(sim_score)
df = df.with_columns(similarity=pl.Series(similarity))

# similarityにもrankをつけていく
df = df.with_columns((df['similarity'].rank()+1).alias('Rank_similarity'))

# d^2の計算
df = df.with_columns(((df['Rank_Human']-df['Rank_similarity'])**2).alias('d^2'))
sum_d2 = df["d^2"].sum()
N = len(df['d^2'])
Spearman = 1 - (6*sum_d2)/(N*(N**2-1))
print(f"スピアマン係数: {Spearman}")
df

スピアマン係数: 0.7000217838950313


Word 1,Word 2,Human (mean),Rank_Human,similarity,Rank_similarity,d^2
str,str,f64,f64,f32,f64,f64
"""love""","""sex""",6.77,211.5,0.263938,191.0,420.25
"""tiger""","""cat""",7.35,245.5,0.517296,317.0,5112.25
"""tiger""","""tiger""",10.0,354.0,1.0,354.0,0.0
"""book""","""paper""",7.46,256.5,0.363463,260.0,12.25
"""computer""","""keyboard""",7.62,273.0,0.396392,274.0,1.0
…,…,…,…,…,…,…
"""shower""","""flood""",6.03,161.0,0.129479,83.0,6084.0
"""weather""","""forecast""",8.34,319.5,0.362721,258.0,3782.25
"""disaster""","""area""",6.25,173.5,0.145228,99.0,5550.25
"""governor""","""office""",6.34,181.5,0.296636,213.0,992.25
