# 补充属性

## 语义难度

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Load the processed Wordle dataset; later cells add feature columns to this frame.
df = pd.read_csv("data/mcm_processed_data.csv")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,date,contest_number,word,number_of_reported_results,number_in_hard_mode,1_try,2_tries,3_tries,4_tries,5_tries,...,letter_freq_mean,letter_freq_min,positional_freq_mean,positional_freq_min,semantic_neighbors_count,semantic_density,Zipf-value,autoencoder_value,expected_attempts,semantic_difficulty
0,2022/12/31,560,manly,20380,1899,0,2,17,37,29,...,0.052925,0.030641,0.091365,0.05571,0,0.350035,3.607804,-0.406941,4.42,0.574506
1,2022/12/30,559,molar,21204,1973,0,4,21,38,26,...,0.066072,0.030641,0.081337,0.02507,0,0.329278,2.723198,-0.631226,4.2,0.711385
2,2022/12/29,558,havoc,20001,1919,0,2,16,38,30,...,0.051031,0.013928,0.049582,0.013928,1,0.402676,3.136932,-0.381673,4.44,-0.585251
3,2022/12/28,557,impel,20160,1937,0,3,21,40,25,...,0.057604,0.030641,0.054039,0.016713,0,0.334106,1.893894,-0.623626,4.21,0.679545
4,2022/12/27,556,condo,20879,2012,0,2,17,35,29,...,0.05337,0.030084,0.070195,0.022284,0,0.408353,3.626288,-0.326174,4.51,0.189932


## 语义邻居数量

In [5]:
def load_glove_embeddings(glove_path):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a token followed by its vector
    components. Blank or whitespace-only lines are skipped instead of raising
    ``IndexError``.

    Args:
        glove_path: Path to the GloVe text file (read as UTF-8).

    Returns:
        dict mapping each token to a 1-D ``np.float32`` array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float
            (propagated from NumPy).
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse.
                continue
            word = parts[0]
            # Everything after the token is the vector.
            vec = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vec
    print(f"Loaded {len(embeddings)} word vectors.")
    return embeddings

glove_path = "data/glove.6B/glove.6B.300d.txt"  # replace with your local path
# Parse the GloVe file into a token -> vector dict (prints how many vectors were loaded).
embeddings = load_glove_embeddings(glove_path)

Loaded 400000 word vectors.


In [6]:
# Target words taken from the dataset's 'word' column, in row order.
wordle_words = df['word'].tolist()

In [7]:
# Look up the embedding vector for a single word
def get_vector(word, embeddings):
    """Return the embedding stored under the lower-cased ``word``, or ``None`` if absent."""
    key = word.lower()
    if key not in embeddings:
        return None  # out-of-vocabulary; the caller decides how to handle it
    return embeddings[key]

# Split the Wordle words into those that have a vector and those that do not (OOV).
word_vectors, oov_words = {}, []

for w in wordle_words:
    vec = get_vector(w, embeddings)
    if vec is None:
        oov_words.append(w)
        continue
    word_vectors[w] = vec

print("OOV words:", oov_words)
print("Have vectors for:", len(word_vectors), "words")

OOV words: []
Have vectors for: 358 words


In [8]:
# Cosine similarity between two vectors
def cosine_sim(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    Yields ``0.0`` when the product of the two norms is zero.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return 0.0 if norm_product == 0.0 else dot_product / norm_product

In [None]:
# Cosine-similarity threshold used by the neighbor-count cell: a GloVe word with
# similarity >= tau counts as a semantic neighbor.
tau = 0.5

# Filter words and vectors together so that words[i] always corresponds to vecs[i].
# Filtering only the vector list would shift every later index as soon as one
# word is not exactly 5 letters long.
words = [w for w in word_vectors if len(w) == 5]
vecs = [word_vectors[w] for w in words]

# Same pairing for the GloVe vocabulary, restricted to 5-letter tokens.
glove_words = [w for w in embeddings if len(w) == 5]
glove_vecs = [embeddings[w] for w in glove_words]

In [10]:
# Sanity check: number of word vectors collected in `vecs`.
len(vecs)

358

In [11]:
semantic_neighbors_count = {}

from tqdm import tqdm

# Loop-invariant work hoisted out of the per-word loop: only 5-letter GloVe
# tokens are neighbor candidates, so select them once.
five_letter_candidates = [
    (cand, cand_vec) for cand, cand_vec in embeddings.items() if len(cand) == 5
]

for w in tqdm(words):
    # Look the vector up by word instead of by position in `vecs`, so the result
    # does not depend on `words` and `vecs` being index-aligned.
    v_w = word_vectors[w]
    # Vectors were fetched under the lower-cased word, so self-exclusion must
    # compare against the same lower-cased key.
    self_key = w.lower()
    count = 0
    # The loop variable is deliberately not called `glove_vecs`, which would
    # overwrite the list of that name built in an earlier cell.
    for cand, cand_vec in five_letter_candidates:
        if cand == self_key:
            continue  # a word is not its own neighbor
        if cosine_sim(v_w, cand_vec) >= tau:
            count += 1
    semantic_neighbors_count[w] = count


100%|██████████| 358/358 [01:14<00:00,  4.81it/s]


In [12]:
# Attach the per-word neighbor counts; a word absent from the dict gets a
# missing value because dict.get returns None for it.
df["semantic_neighbors_count"] = df["word"].map(semantic_neighbors_count.get)

In [13]:
from heapq import nlargest

# Number of most-similar 5-letter GloVe words averaged into the density score.
k = 10
semantic_density = {}

# Loop-invariant work hoisted out of the per-word loop: only 5-letter GloVe
# tokens are candidates, so select them once.
five_letter_candidates = [
    (cand, cand_vec) for cand, cand_vec in embeddings.items() if len(cand) == 5
]

for w in tqdm(words):
    # Look the vector up by word instead of by position in `vecs`, so the result
    # does not depend on `words` and `vecs` being index-aligned.
    v_w = word_vectors[w]
    # Vectors were fetched under the lower-cased word; exclude that same key.
    self_key = w.lower()
    # The comprehension variables are deliberately not called `glove_vecs`,
    # which would overwrite the list of that name built in an earlier cell.
    sims = [
        cosine_sim(v_w, cand_vec)
        for cand, cand_vec in five_letter_candidates
        if cand != self_key
    ]
    topk = nlargest(k, sims)
    # Average over the similarities actually found: dividing by k when fewer
    # than k candidates exist would silently pad the mean with zeros.
    semantic_density[w] = float(sum(topk)) / len(topk) if topk else 0.0

100%|██████████| 358/358 [01:03<00:00,  5.60it/s]


In [14]:
# Attach the per-word density; a word absent from the dict gets a missing
# value because dict.get returns None for it.
df['semantic_density'] = df['word'].map(semantic_density.get)

In [15]:
# Z-score the log1p-scaled neighbor count and the density, average them with
# equal weight, and flip the sign so fewer/weaker neighbors give a higher score.
neighbors_log = np.log1p(df['semantic_neighbors_count'])
neighbors_z = neighbors_log.sub(neighbors_log.mean()).div(neighbors_log.std())
density_col = df['semantic_density']
density_z = density_col.sub(density_col.mean()).div(density_col.std())
df['semantic_difficulty'] = -(0.5 * neighbors_z + 0.5 * density_z)

In [16]:
# Write the frame, including the new semantic columns, back to the CSV it was
# read from (row index omitted).
df.to_csv("data/mcm_processed_data.csv", index=False)

## 词频难度

In [None]:
# Reload the dataset from an Excel workbook, rebinding `df`.
# NOTE(review): the file name ends in ".csv.xlsx" and is not the CSV written at
# the end of the semantic-difficulty section — confirm this is the intended input.
df = pd.read_excel("data/mcm_processed_data.csv.xlsx")
df.head()

In [None]:
# SUBTLEX-US frequency table; the merge cell that follows reads its 'Word' and
# 'Zipf-value' columns.
dt = pd.read_excel("data/SUBTLEX-US frequency list with PoS and Zipf information.xlsx")
dt.head()

In [None]:
# Attach the SUBTLEX Zipf frequency to every Wordle word.
# Assumes df is keyed by 'word' and dt provides 'Word' and 'Zipf-value'.

# Keep a single row per 'Word' so the left join cannot duplicate rows of df.
zipf_lookup = dt[['Word', 'Zipf-value']].drop_duplicates(subset='Word')

# Drop any 'Zipf-value' left over from an earlier run of this cell (or already
# stored in the input file); otherwise pandas would emit 'Zipf-value_x' and
# 'Zipf-value_y' instead of one column.
df = df.drop(columns=['Zipf-value'], errors='ignore').merge(
    zipf_lookup,
    left_on='word',
    right_on='Word',
    how='left'
)

# 'Word' only served as the join key.
df = df.drop(columns=['Word'])

In [None]:
def _zscore(series):
    """Standardize a Series using its own mean and ``Series.std()``."""
    return (series - series.mean()) / series.std()


# Recompute the combined semantic score on the reloaded frame: equal-weight
# average of the two z-scores, with the sign flipped.
neighbors_log = np.log1p(df['semantic_neighbors_count'])
neighbors_z = _zscore(neighbors_log)
density_z = _zscore(df['semantic_density'])
df['semantic_difficulty'] = -(0.5 * neighbors_z + 0.5 * density_z)

In [None]:
# Preview the frame after the Zipf merge and the recomputed semantic_difficulty.
df.head()

In [None]:
# Export the enriched frame to an Excel workbook (row index omitted).
df.to_excel("data/Copydata_with_features.xlsx", index=False)

## 汉明距离

In [None]:
# Read the allowed-guess word list, one word per line.
# An explicit encoding keeps the read independent of the platform default.
with open('data/allowed.txt', 'r', encoding='utf-8') as f:
    # Normalize case and surrounding whitespace; drop blank lines so no empty
    # string ends up in the candidate list.
    allowed = [line.strip().lower() for line in f if line.strip()]
# Print a short summary instead of dumping the entire list into the cell output.
print(f"{len(allowed)} allowed words, e.g. {allowed[:10]}")

In [None]:
def hamming_distance(w1, w2):
    """Count the positions at which ``w1`` and ``w2`` differ.

    For inputs of unequal length, every position that exists in only one of
    them counts as a mismatch. Without this, ``zip`` silently ignores the extra
    tail, so a 4-letter string could look like a distance-1 neighbor of a
    5-letter word.

    Args:
        w1: First sequence (typically a word).
        w2: Second sequence.

    Returns:
        int: Number of differing positions, including unmatched trailing positions.
    """
    mismatches = sum(c1 != c2 for c1, c2 in zip(w1, w2))
    return mismatches + abs(len(w1) - len(w2))

In [None]:
# For every target word, count the allowed words that differ from it in exactly
# one position.
neighbors = {
    w: sum(1 for v in allowed if w != v and hamming_distance(w, v) == 1)
    for w in df['word'].tolist()
}

print(neighbors)

In [None]:
# Map the neighbors dict onto df as a new column.
df['hamming_neighbors'] = df['word'].map(neighbors)
# Persist the frame to the processed CSV (row index omitted); this replaces the
# file's previous contents.
df.to_csv('data/mcm_processed_data.csv', index=False)

## Lasso回归分析 - 变量重要性

In [None]:
# Load the data
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
df = pd.read_csv("data/mcm_processed_data.csv")

# Columns that must not be used as features
exclude_cols = [
    'date', 'contest_number', 'word', 'number_of_reported_results', 'number_in_hard_mode',
    '1_try', '2_tries', '3_tries', '4_tries', '5_tries', '6_tries', '7_or_more_tries_x',
    'sum', 'autoencoder_value', 'expected_attempts',
]

# Every remaining column, in its original order, is treated as a feature
feature_cols = df.columns[~df.columns.isin(exclude_cols)].tolist()
print("特征变量:", feature_cols)
print(f"\n共 {len(feature_cols)} 个特征变量")

In [None]:
from sklearn.linear_model import LassoCV, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Regression target, inspected before any cleaning
y_all = df['autoencoder_value']

print("目标变量统计:")
print(y_all.describe())
print(f"\n缺失值数量: {y_all.isna().sum()}")

# Rows without a target are dropped rather than imputed: filling y with its
# median would make the model fit fabricated labels. With no missing targets
# this selects every row.
has_target = y_all.notna()
X = df.loc[has_target, feature_cols].copy()
y = y_all.loc[has_target].copy()

# Missing feature values are imputed with the per-column median
X = X.fillna(X.median())

# Standardize the features so the Lasso penalty treats them on a common scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"\n特征矩阵形状: {X_scaled.shape}")
print(f"目标变量形状: {y.shape}")

In [None]:
# Choose the regularization strength by 5-fold cross-validation over a log-spaced grid
alphas_to_try = np.logspace(-8, -1, 100)
lasso_cv = LassoCV(alphas=alphas_to_try, cv=5, random_state=42, max_iter=10000).fit(X_scaled, y)

print(f"交叉验证选择的最优 alpha: {lasso_cv.alpha_:.8f}")
print(f"R² 得分 (最优alpha): {lasso_cv.score(X_scaled, y):.4f}")

# Show how the fit and the sparsity change across a handful of fixed alphas.
# The R² printed here is computed on the same data the model was fit on.
print("\n不同alpha值的模型效果:")
for alpha in (0.0001, 0.001, 0.005, 0.007, 0.008, 0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1):
    lasso_test = Lasso(alpha=alpha, max_iter=10000).fit(X_scaled, y)
    n_nonzero = np.count_nonzero(lasso_test.coef_)
    r2 = lasso_test.score(X_scaled, y)
    print(f"  alpha={alpha:.4f}: R²={r2:.4f}, 非零系数={n_nonzero}")

# Refit a plain Lasso at the cross-validated alpha for the importance analysis
analysis_alpha = lasso_cv.alpha_
lasso_final = Lasso(alpha=analysis_alpha, max_iter=10000).fit(X_scaled, y)
print(f"\n最终选择 alpha={analysis_alpha} 进行变量重要性分析")
print(f"R² 得分: {lasso_final.score(X_scaled, y):.4f}")
print(f"非零系数数量: {np.count_nonzero(lasso_final.coef_)}/{len(feature_cols)}")

In [None]:
# Collect the coefficient of every feature, largest magnitude first
coef_df = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': lasso_final.coef_,
})
coef_df['abs_coefficient'] = coef_df['coefficient'].abs()
coef_df = coef_df.sort_values('abs_coefficient', ascending=False)

# Print one line per feature; the mark shows whether Lasso kept it (non-zero coefficient)
banner = "=" * 60
print(banner)
print(f"Lasso回归特征重要性排序 (alpha={analysis_alpha})")
print(banner)
for entry in coef_df.itertuples(index=False):
    status = "✗" if entry.coefficient == 0 else "✓"
    print(f"{status} {entry.feature:30s} : {entry.coefficient:+.6f}")

print("\n" + banner)
print(f"非零系数特征数: {(coef_df['coefficient'] != 0).sum()} / {len(feature_cols)}")
print(banner)

In [None]:
# Visualize feature importance.
# Configure a list of CJK-capable fonts: Matplotlib uses the first family in the
# list that is installed, so the Chinese titles do not depend on one
# macOS-only font being present.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Heiti TC', 'Arial Unicode MS', 'SimHei', 'DejaVu Sans']
# Render the minus sign with an ASCII hyphen, which every listed font provides.
plt.rcParams['axes.unicode_minus'] = False
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Only features with a non-zero coefficient are shown
nonzero_df = coef_df[coef_df['coefficient'] != 0].copy()

# Panel 1: signed coefficients, in the row order of nonzero_df
ax1 = axes[0]
colors = ['green' if x > 0 else 'red' for x in nonzero_df['coefficient']]
ax1.barh(range(len(nonzero_df)), nonzero_df['coefficient'], color=colors, alpha=0.7)
ax1.set_yticks(range(len(nonzero_df)))
ax1.set_yticklabels(nonzero_df['feature'])
ax1.set_xlabel('Lasso Coefficient')
ax1.set_title('Lasso回归系数 (非零特征)')
ax1.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax1.invert_yaxis()  # first row of nonzero_df at the top

# Panel 2: coefficient magnitudes, sorted ascending so the largest bar is at the top
ax2 = axes[1]
sorted_by_abs = nonzero_df.sort_values('abs_coefficient', ascending=True)
colors2 = ['green' if x > 0 else 'red' for x in sorted_by_abs['coefficient']]
ax2.barh(range(len(sorted_by_abs)), sorted_by_abs['abs_coefficient'], color=colors2, alpha=0.7)
ax2.set_yticks(range(len(sorted_by_abs)))
ax2.set_yticklabels(sorted_by_abs['feature'])
ax2.set_xlabel('|Lasso Coefficient|')
ax2.set_title('特征重要性 (按绝对值排序)')

plt.tight_layout()
plt.savefig('data/lasso_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n图例: 绿色=正相关, 红色=负相关")

In [None]:
# Coefficient path: refit Lasso on a log-spaced alpha grid and stack the
# coefficient vectors (one row per alpha, one column per feature)
alphas = np.logspace(-5, -1, 50)
coefs = np.array([
    Lasso(alpha=alpha, max_iter=10000).fit(X_scaled, y).coef_
    for alpha in alphas
])

# One line per feature across the alpha grid
plt.figure(figsize=(12, 6))
for feature, path in zip(feature_cols, coefs.T):
    plt.plot(alphas, path, label=feature, linewidth=1.5)

plt.xscale('log')
plt.xlabel('Alpha (正则化强度)')
plt.ylabel('系数值')
plt.title('Lasso回归系数路径图')
# Mark the alpha used for the importance analysis
plt.axvline(x=analysis_alpha, color='black', linestyle='--', label=f'选定alpha={analysis_alpha}')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
plt.tight_layout()
plt.savefig('data/lasso_coefficient_path.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Pearson correlation between all constructed indicators
import seaborn as sns
import matplotlib.pyplot as plt

# Identifier and raw-count columns are left out; only float64/int64 columns are kept
exclude_cols = [
    'date', 'contest_number', 'word', 'number_of_reported_results', 'number_in_hard_mode',
    '1_try', '2_tries', '3_tries', '4_tries', '5_tries', '6_tries', '7_or_more_tries_x',
    'sum',
]
numeric_features = []
for col in df.columns:
    if col in exclude_cols:
        continue
    if df[col].dtype in ['float64', 'int64']:
        numeric_features.append(col)

print(f"用于相关性分析的数值特征 ({len(numeric_features)} 个):")
for position, name in enumerate(numeric_features, start=1):
    print(f"{position:2d}. {name}")

# Correlation matrix of the selected columns (DataFrame.corr default method)
correlation_matrix = df[numeric_features].corr()

print(f"\n相关性矩阵形状: {correlation_matrix.shape}")
print("\n相关性矩阵前5x5:")
print(correlation_matrix.iloc[:5, :5].round(3))

In [None]:
# Draw the correlation heatmap
plt.figure(figsize=(16, 14))

# Font fallback list so the Chinese title renders; axes.unicode_minus=False
# makes tick labels use an ASCII hyphen for negative numbers
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# Build the heatmap
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))  # hide the upper triangle and diagonal; only the lower triangle is drawn
sns.heatmap(correlation_matrix, 
            mask=mask,
            annot=True, 
            cmap='coolwarm', 
            center=0,
            square=True,
            fmt='.2f',
            cbar_kws={"shrink": 0.8},
            annot_kws={'size': 8})

plt.title('所有构建指标间的皮尔森相关性矩阵', fontsize=16, pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()

# Save the figure to disk before showing it
plt.savefig('data/correlation_heatmap.png', dpi=600, bbox_inches='tight')
plt.show()

print("相关性热力图已保存到: data/correlation_heatmap.png")

In [None]:
# Find highly correlated feature pairs
print("=" * 80)
print("高相关性特征对分析 (|r| ≥ 0.5)")
print("=" * 80)

high_corr_pairs = []

# Walk the upper triangle only, so each unordered pair is visited once
for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        corr_value = correlation_matrix.iloc[i, j]
        if abs(corr_value) >= 0.5:  # threshold on |r|
            feature1 = correlation_matrix.columns[i]
            feature2 = correlation_matrix.columns[j]
            high_corr_pairs.append((feature1, feature2, corr_value))

# Strongest correlations (by absolute value) first
high_corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)

print(f"找到 {len(high_corr_pairs)} 对高相关性特征:\n")

for feature1, feature2, corr_value in high_corr_pairs:
    direction = "正相关" if corr_value > 0 else "负相关"
    print(f"{feature1:30s} ↔ {feature2:30s}: {corr_value:+.3f} ({direction})")

# Closer look at the semantic indicators
print("\n" + "=" * 80)
print("语义相关指标与其他特征的相关性:")
print("=" * 80)

semantic_features = ['semantic_neighbors_count', 'semantic_density', 'semantic_difficulty', 'Zipf-value']

for feature in semantic_features:
    if feature in correlation_matrix.columns:
        # Remove the feature's own entry before taking the top 10. Taking
        # head(10) first and skipping the self-correlation afterwards would
        # list only 9 other features.
        correlations = (
            correlation_matrix[feature]
            .drop(feature)
            .sort_values(key=abs, ascending=False)
        )
        print(f"\n{feature} 的相关性排序 (前10个):")
        for other_feature, corr_value in correlations.head(10).items():
            direction = "正相关" if corr_value > 0 else "负相关"
            print(f"  {other_feature:30s}: {corr_value:+.3f} ({direction})")

In [None]:
# Save the correlation analysis results
import os

# Make sure the output directory exists
os.makedirs('data', exist_ok=True)

# Correlation matrix -> Excel
correlation_matrix.to_excel('data/correlation_matrix.xlsx')
print("相关性矩阵已保存到: data/correlation_matrix.xlsx")

# High-correlation pairs and the semantic-indicator rankings -> text file
with open('data/high_correlation_pairs.txt', 'w', encoding='utf-8') as f:
    f.write("高相关性特征对分析 (|r| ≥ 0.5)\n")
    f.write("=" * 80 + "\n\n")

    f.write(f"找到 {len(high_corr_pairs)} 对高相关性特征:\n\n")

    for feature1, feature2, corr_value in high_corr_pairs:
        direction = "正相关" if corr_value > 0 else "负相关"
        f.write(f"{feature1:30s} ↔ {feature2:30s}: {corr_value:+.3f} ({direction})\n")

    f.write("\n" + "=" * 80 + "\n")
    f.write("语义相关指标与其他特征的相关性:\n")
    f.write("=" * 80 + "\n\n")

    for feature in semantic_features:
        if feature in correlation_matrix.columns:
            # Remove the feature's own entry before taking the top 10. Taking
            # head(10) first and skipping the self-correlation afterwards
            # would write only 9 other features.
            correlations = (
                correlation_matrix[feature]
                .drop(feature)
                .sort_values(key=abs, ascending=False)
            )
            f.write(f"\n{feature} 的相关性排序 (前10个):\n")
            for other_feature, corr_value in correlations.head(10).items():
                direction = "正相关" if corr_value > 0 else "负相关"
                f.write(f"  {other_feature:30s}: {corr_value:+.3f} ({direction})\n")

print("高相关性分析结果已保存到: data/high_correlation_pairs.txt")

# Summary statistics
print(f"\n相关性分析总结:")
print(f"- 总特征数: {len(numeric_features)}")
print(f"- 相关性矩阵大小: {correlation_matrix.shape}")
print(f"- 高相关性特征对数 (|r| ≥ 0.5): {len(high_corr_pairs)}")
print(f"- 极高相关性特征对数 (|r| ≥ 0.8): {sum(1 for _, _, r in high_corr_pairs if abs(r) >= 0.8)}")

In [None]:
# Summary table
print("\n" + "=" * 70)
print("Lasso回归分析结果汇总")
print("=" * 70)

# coef_df is already ordered by |coefficient|, so the rank is the row position;
# a feature counts as selected when its coefficient is non-zero.
summary_df = (
    coef_df[['feature', 'coefficient', 'abs_coefficient']]
    .assign(
        rank=lambda frame: range(1, len(frame) + 1),
        selected=lambda frame: frame['coefficient'] != 0,
    )
    [['rank', 'feature', 'coefficient', 'abs_coefficient', 'selected']]
    .rename(columns={
        'rank': '排名',
        'feature': '特征变量',
        'coefficient': '系数',
        'abs_coefficient': '系数绝对值',
        'selected': '被选择',
    })
)

print(summary_df.to_string(index=False))

# Save the result
summary_df.to_excel('data/lasso_feature_importance.xlsx', index=False)
print("\n结果已保存到 data/lasso_feature_importance.xlsx")

## Lasso回归分析结论

### 重要发现

**最重要的正相关变量** (系数为正，即变量值增大时autoencoder_value随之增大):
1. **semantic_density** (0.0325): 语义密度越高，autoencoder_value越大
2. **positional_freq_min** (0.0307): 最小位置字母频率越高，autoencoder_value越大
3. **has_common_prefix** (0.0259): 有常见前缀的单词
4. **ends_with_vowel** (0.0159): 以元音结尾
5. **starts_with_vowel** (0.0142): 以元音开头

**最重要的负相关变量** (系数为负):
1. **semantic_neighbors_count** (-0.0347): 语义邻居数量越多，autoencoder_value越小
2. **num_vowels** (-0.0248): 元音数量越多，autoencoder_value越小
3. **contains_y** (-0.0199): 包含字母Y
4. **hamming_neighbors** (-0.0154): 汉明距离邻居数
5. **num_multiple_letters** (-0.0154): 重复字母数量

**被Lasso筛除的变量** (系数为0，相对不重要):
- unique_letters, has_repeats, num_rare_letters, scrabble_score
- letter_entropy, position_rarity, keyboard_distance
- has_common_suffix, has_double_letter, letter_freq_mean

### 注意事项
- 整体模型R²较低 (~0.03)，说明autoencoder_value的变异主要由其他未包含的因素解释
- Lasso回归的特征选择功能帮助识别了16个相对重要的变量