# 补充属性

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the per-word feature table and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which usually means the
# workbook was written with its index included — confirm whether it should be dropped.
df = pd.read_excel("data/data_with_features.xlsx")
df.head()

Unnamed: 0,date,contest_number,word,number_of_reported_results,number_in_hard_mode,1_try,2_tries,3_tries,4_tries,5_tries,...,letter_entropy,position_rarity,keyboard_distance,hamming_neighbors,has_common_suffix,has_common_prefix,letter_freq_mean,letter_freq_min,positional_freq_mean,positional_freq_min
0,2022-12-31,560,manly,20380,1899,0,2,17,37,29,...,2.321928,12.664545,4.376584,0,1,0,0.052925,0.030641,0.091365,0.05571
1,2022-12-30,559,molar,21204,1973,0,4,21,38,26,...,2.321928,17.343179,3.747676,1,0,0,0.066072,0.030641,0.081337,0.02507
2,2022-12-29,558,havoc,20001,1919,0,2,16,38,30,...,2.321928,35.581635,4.967999,0,0,0,0.051031,0.013928,0.049582,0.013928
3,2022-12-28,557,impel,20160,1937,0,3,21,40,25,...,2.321928,28.936672,4.731095,0,0,0,0.057604,0.030641,0.054039,0.016713
4,2022-12-27,556,condo,20879,2012,0,2,17,35,29,...,1.921928,22.207613,4.793787,0,0,0,0.05337,0.030084,0.070195,0.022284


## 语义邻居数量

In [3]:
def load_glove_embeddings(glove_path):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-empty line is expected to hold a token followed by its
    whitespace-separated vector components. Blank or whitespace-only lines
    are skipped.

    Parameters
    ----------
    glove_path : str or path-like
        Location of the UTF-8 encoded GloVe text file.

    Returns
    -------
    dict
        Maps each token to a 1-D ``float32`` NumPy array. If a token occurs
        more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a number.
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.split()
            # A blank line (for example a trailing empty line at the end of
            # the file) carries no token; skip it instead of failing on
            # parts[0].
            if not parts:
                continue
            # First field is the token, the remaining fields are the vector.
            embeddings[parts[0]] = np.array(parts[1:], dtype=np.float32)
    print(f"Loaded {len(embeddings)} word vectors.")
    return embeddings

glove_path = "data/glove.6B/glove.6B.300d.txt"  # replace with your own path
embeddings = load_glove_embeddings(glove_path)

Loaded 400000 word vectors.


In [4]:
# Words to look up, in the row order of df.
wordle_words = df['word'].tolist()

In [5]:
# Look up the embedding of a single word
def get_vector(word, embeddings):
    """Return the vector stored under the lowercased ``word``.

    ``None`` is returned for an out-of-vocabulary word; handling that case
    is left to the caller.
    """
    return embeddings.get(word.lower())

# Split the Wordle words into those that have a vector and those that do not (OOV).
word_vectors, oov_words = {}, []

for w in wordle_words:
    vec = get_vector(w, embeddings)
    if vec is None:
        oov_words.append(w)
        continue
    word_vectors[w] = vec

print("OOV words:", oov_words)
print("Have vectors for:", len(word_vectors), "words")

OOV words: []
Have vectors for: 359 words


In [6]:
# Cosine similarity between two vectors
def cosine_sim(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    When either vector has zero length the similarity is undefined; ``0.0``
    is returned in that case instead of dividing by zero.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / norm_product if norm_product != 0.0 else 0.0

In [None]:
from tqdm import tqdm

# Cosine-similarity threshold: a GloVe token with similarity >= tau counts as a neighbor.
tau = 0.6

# Parallel lists of the in-vocabulary Wordle words and their vectors
# (the semantic-density cell further down iterates over these as well).
words = list(word_vectors.keys())
vecs = [word_vectors[w] for w in words]


def count_semantic_neighbors(word, embeddings, tau):
    """Count GloVe tokens whose cosine similarity to ``word`` is at least ``tau``.

    The word is lowercased before the lookup, the same normalization
    ``get_vector`` applies, and its own entry is excluded from the count.

    Parameters
    ----------
    word : str
        Word to evaluate; matched case-insensitively.
    embeddings : dict
        Mapping from token to vector, as returned by ``load_glove_embeddings``.
    tau : float
        Similarity threshold (inclusive).

    Returns
    -------
    int
        Number of other tokens with similarity >= ``tau``.

    Raises
    ------
    KeyError
        If the lowercased word has no entry in ``embeddings``.
    """
    key = word.lower()
    v_w = embeddings[key]
    return sum(
        1
        for other, other_vec in embeddings.items()
        # Compare against the lowercased key so the word never counts itself,
        # whatever the casing used in the data frame.
        if other != key and cosine_sim(v_w, other_vec) >= tau
    )


semantic_neighbors_count = {
    w: count_semantic_neighbors(w, embeddings, tau) for w in tqdm(words)
}

# Check the semantic neighbor count for EERIE. The dictionary is keyed by the
# data-frame spelling of each word, so look it up in lowercase and fall back to
# computing it directly when the word is not among the data-frame words.
eerie_count = semantic_neighbors_count.get("eerie")
if eerie_count is None and "eerie" in embeddings:
    eerie_count = count_semantic_neighbors("eerie", embeddings, tau)
print("EERIE semantic neighbors:", eerie_count)

In [11]:
# Attach the per-word neighbor counts as a new column; a word with no entry in
# the dictionary gets a missing value because dict.get returns None for it.
df["semantic_neighbors_count"] = df["word"].map(semantic_neighbors_count.get)

In [12]:
from heapq import nlargest

# Number of most-similar GloVe tokens averaged into each density score.
k = 10
semantic_density = {}

for w, v_w in tqdm(word_vectors.items()):
    # GloVe keys are matched in lowercase (see get_vector), so the word's own
    # entry is skipped case-insensitively; otherwise its self-similarity of
    # 1.0 would enter the top-k and inflate the score.
    own_key = w.lower()
    # Stream the similarities straight into nlargest rather than first
    # materializing a list with one entry per GloVe token.
    topk = nlargest(
        k,
        (
            cosine_sim(v_w, other_vec)
            for other, other_vec in embeddings.items()
            if other != own_key
        ),
    )
    # Mean similarity of the k nearest tokens.
    semantic_density[w] = float(sum(topk)) / k

100%|██████████| 359/359 [06:15<00:00,  1.04s/it]


In [13]:
# Attach the per-word density scores as a new column; a word with no entry in
# the dictionary gets a missing value because dict.get returns None for it.
df['semantic_density'] = df['word'].map(semantic_density.get)

In [14]:
# Write the augmented table to a separate workbook, leaving the input file
# untouched; index=False omits the row index from the output.
df.to_excel("data/Copydata_with_features.xlsx", index=False)

In [None]:
# Display the frame, which now carries the semantic_neighbors_count and
# semantic_density columns.
df