In [7]:
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

In [None]:
letters = [i for i in "abcdefghijklmnopqrstuvwxyz"]


def preprocess(content: str) -> str:
    # 文本预处理：全小写，移除标点，空格分隔
    text = ""
    for i in content.lower():
        if i in letters:
            text += i
        elif text[-1] != " ":
            text += " "
    return text


file_path: str = "text.txt"
with open(file_path, mode="r", encoding="utf-8") as f:
    plaintext: str = f.read()
plaintext = preprocess(plaintext)
print(len(plaintext))

20138


In [None]:
class crypt:
    def __init__(self, seed: int | None = None):
        if seed is not None:
            random.seed(seed)
        self.seed: int | None = seed
        self.key: list[str] = [i for i in "abcdefghijklmnopqrstuvwxyz"]
        random.shuffle(self.key)
        self.inv_key: list = [0 for _ in range(26)]
        for i in range(26):
            self.inv_key[ord(self.key[i]) - ord("a")] = chr(i + ord("a"))
        # key:     ['x','y',...] 明文 ab... 加密为 xy...
        # inv_key: ['x ,'y',...] 密文 ab... 解密为 xy...

    def reveal(self):
        return self.key, self.inv_key

    def encrypt(self, plain):
        # 加密：输入明文，输出密文
        cipher = ""
        for i in plain:
            if i == " ":
                cipher += i
            else:
                cipher += self.key[ord(i) - ord("a")]
        return cipher

    def decrypt(self, cipher, inv_key=None):
        # 解密：输入密文，输出明文
        # 默认使用真实的inv_key，也可自定
        plain = ""
        if inv_key is None:
            for i in cipher:
                if i == " ":
                    plain += i
                else:
                    plain += self.inv_key[ord(i) - ord("a")]
        else:
            for i in cipher:
                if i == " ":
                    plain += i
                else:
                    plain += inv_key[ord(i) - ord("a")]
        return plain


def match_freq(cipher):
    # 通过单个字母频率排序解密 (Baseline)
    # 输入：密文，输出：推断的解密密钥
    true_freq_order = [
        "e",
        "t",
        "a",
        "o",
        "i",
        "n",
        "s",
        "h",
        "r",
        "d",
        "l",
        "c",
        "u",
        "m",
        "w",
        "f",
        "g",
        "y",
        "p",
        "b",
        "v",
        "k",
        "j",
        "x",
        "q",
        "z",
    ]  # 真实字母频率排序（高到低）
    freq = defaultdict(int)
    for i in cipher:
        if i != " ":
            freq[i] += 1
    freq = sorted(dict(freq).items(), key=lambda x: x[1], reverse=True)
    freq_order = [i[0] for i in freq]

    compare = [i for i in zip(freq_order, true_freq_order)]
    compare = sorted(compare, key=lambda x: x[0])
    inv_key = [i[1] for i in compare]
    return inv_key


def evaluate(crypt: crypt, predict_inv_key: list[str], test_text: str) -> None:
    # 评价预测的解密密钥，打印准确率、例文
    t = 0
    for i in range(26):
        if crypt.inv_key[i] == predict_inv_key[i]:
            t += 1
    print(f"Accuracy: {round(100*t/26)}%")
    print(crypt.decrypt(crypt.encrypt(test_text)))
    print(crypt.decrypt(crypt.encrypt(test_text), predict_inv_key))
    return


A = crypt(42)
cipher = A.encrypt(plaintext[:7000])
evaluate(A, match_freq(cipher), "statistical machine learning")
evaluate(A, match_freq(cipher), preprocess("Attention !is all/you? need."))

Accuracy: 15%
statistical machine learning
htotahtarol wordase leoisasf
Accuracy: 15%
attention is all you need 
ottestans ah oll cng seem 


In [None]:
def extract_freq(cipher):
    # 输入密文或明文，输出频率比例 [a-z, aa-az, ba-bz, ... za-zz] 共702个
    letter_freq = [0] * 26
    conditional_freq = [[0] * 26 for _ in range(26)]

    length = 0
    prev_char = None
    for char in cipher:
        if char == " ":
            prev_char = None
            continue

        length += 1
        letter_freq[ord(char) - ord("a")] += 1
        if prev_char is not None:
            conditional_freq[ord(prev_char) - ord("a")][ord(char) - ord("a")] += 1
        prev_char = char
    """
    将字母频率和条件频率合并为一维列表（26 + 26*26 = 702个元素）
    顺序：先字母频率（a-z），再条件频率（aa, ab, ..., az, ba, bb, ..., zz）
    """

    flatten_freq = [count for row in conditional_freq for count in row]
    for i in range(len(flatten_freq)):
        if flatten_freq[i] != 0:
            flatten_freq[i] = round(flatten_freq[i] / letter_freq[int(i / 26)], 6)
    for i in range(26):
        letter_freq[i] = round(letter_freq[i] / length, 6)
    return letter_freq + flatten_freq


def generate_dataset(plaintext, n_crypt_seed, plaintext_seed=42):
    """
    生成包含n个种子的频率特征DataFrame
    参数：n - 整数，种子范围为0到n-1
    返回：pd.DataFrame - 共n行，列数=1（inv_key）+702（频率特征）=703列
    """
    # 固定随机数种子（确保截取明文的随机性可复现，可选）

    random.seed(plaintext_seed)
    start_idx = list()
    for _ in range(n_crypt_seed):
        start_idx.append(random.randint(0, len(plaintext) - 8001))

    data = list()
    for seed in tqdm(range(n_crypt_seed)):
        # 1. 创建加密实例（使用当前种子）
        cryptor = crypt(seed=seed)
        true_inv_key = cryptor.inv_key  # 获取真实逆密钥
        sub_plaintext = plaintext[
            start_idx[seed] : start_idx[seed] + 8000
        ]  # 截取8000长度明文

        # 3. 加密生成密文
        ciphertext = cryptor.encrypt(sub_plaintext)
        freq_features = extract_freq(ciphertext)

        # 5. 组合当前样本数据（逆密钥 + 702维特征）
        row_data = [true_inv_key] + freq_features
        data.append(row_data)

    # 6. 创建DataFrame
    # 列名：第一列为'inv_key'，后续为'freq_0'到'freq_701'
    columns = ["inv_key"] + [f"freq_{i}" for i in range(702)]
    df = pd.DataFrame(data, columns=columns)

    return df


test_df = generate_dataset(plaintext, 15000)
print("DataFrame形状：", test_df.shape)
print(test_df.head())
test_df.to_csv("freq_features_data.csv", index=False, encoding="utf-8")

100%|██████████| 15000/15000 [00:46<00:00, 319.19it/s]


DataFrame形状： (15000, 703)
                                             inv_key    freq_0    freq_1  \
0  [b, w, l, n, k, f, e, g, v, r, h, p, z, x, a, ...  0.016179  0.009500   
1  [n, f, x, v, z, g, q, o, w, k, d, c, r, j, t, ...  0.066943  0.018768   
2  [e, z, y, a, g, v, s, h, t, u, o, w, m, p, i, ...  0.121002  0.000442   
3  [q, e, r, m, w, f, a, z, n, i, h, v, l, j, d, ...  0.002059  0.120753   
4  [r, f, t, x, u, i, g, z, k, y, j, b, w, e, n, ...  0.070367  0.017960   

     freq_2    freq_3    freq_4    freq_5    freq_6    freq_7    freq_8  ...  \
0  0.036663  0.066202  0.010390  0.019000  0.117411  0.018851  0.008609  ...   
1  0.003103  0.008719  0.000591  0.022019  0.001478  0.065760  0.007537  ...   
2  0.034488  0.072808  0.022992  0.008696  0.062786  0.038909  0.089315  ...   
3  0.069863  0.032946  0.007795  0.017944  0.077953  0.000588  0.070599  ...   
4  0.084057  0.002355  0.024437  0.076697  0.022670  0.000442  0.005594  ...   

   freq_692  freq_693  freq_694  fre

In [None]:
# 错误内容
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import ast  # 新增：用于解析字符串列表

df = pd.read_csv("freq_features_data.csv")

# 新增：解析inv_key列，将字符串转回列表
df["inv_key"] = df["inv_key"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# 验证解析（可选，运行后注释掉）
print("解析后示例：", df["inv_key"].iloc[0][:5])  # 应输出 ['x', 'y', ...] 前5个

X = df[[f"freq_{i}" for i in range(702)]].values

# y: 转为26x26矩阵, e.g., y[i,j] = 1 if inv_key[j] maps to chr(i+ord('a'))
y_matrix = np.zeros((len(df), 26, 26))
for idx, inv_key in enumerate(df["inv_key"]):
    for pos in range(26):
        plain_char = inv_key[pos]  # 现在是字母
        if plain_char.isalpha() and plain_char.islower():  # 新增：安全检查
            plain_idx = ord(plain_char) - ord("a")
            y_matrix[idx, plain_idx, pos] = 1  # 行:明文, 列:密文位置
        else:
            print(f"警告：无效字符 at idx {idx}, pos {pos}: {plain_char}")  # 调试用

y = y_matrix.reshape(len(df), 26 * 26)  # 扁平化为676维多输出

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)  # 加random_state复现
model = MultiOutputClassifier(DecisionTreeClassifier(max_depth=10))  # 防过拟合
model.fit(X_train, y_train)
preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)  # 多输出acc（每个位置的one-hot准确率）
print(f"DT Accuracy: {acc:.2%}")

## 尝试Logistic regression （as baseline？）

In [None]:
# 对于行 c=0..25，各自训练一个 26分类器：X -> y[:, c]
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clfs = []
y_pred_cols = []
for c in range(26):
    clf = LogisticRegression(max_iter=1000, n_jobs=None, multi_class='multinomial')  # 简单&稳
    clf.fit(X_tr, y_tr[:, c])
    clfs.append(clf)
    y_pred_cols.append(clf.predict(X_va))

y_pred = np.stack(y_pred_cols, axis=1)  # (Nval, 26)
pos_acc = (y_pred == y_va).mean()
all_acc = (y_pred == y_va).all(axis=1).mean()
print(f"Val(pos-avg)={pos_acc:.3f}, Val(all-correct)={all_acc:.3f}")


  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_pred

Val(pos-avg)=0.632, Val(all-correct)=0.000


  ret = a @ b
  ret = a @ b
  ret = a @ b


### 查看1500个 samples 平均的val

In [None]:
print(f"Val(pos-avg)={pos_acc:.3f}, Val(all-correct)={all_acc:.3f}")
# 错误代码行:
print(np.shape(y_pred_cols))

Val(pos-avg)=0.632, Val(all-correct)=0.000
(26, 1500)


### 找出1500个里最强的解码（may not be stable）

In [None]:

# 假设 y_pred 和 y_va 都是 (Nval, 26) 的 NumPy 数组

# 1. 找出每个样本有多少个预测是正确的 (一个布尔数组，形状为 (Nval, 26))
correct_predictions = (y_pred == y_va)

# 2. 统计每个样本 (行) 预测正确的字母数量
# 结果是长度为 Nval 的数组，每个元素是该样本正确预测的个数 (0-26)
num_correct_chars_per_sample = correct_predictions.sum(axis=1)

# 3. 找到拥有最多正确预测的样本的索引 (即“最强”的那一行)
best_sample_idx = np.argmax(num_correct_chars_per_sample)
max_correct_chars = num_correct_chars_per_sample[best_sample_idx]

# 4. 提取该样本的预测密钥索引
best_predicted_key_indices = y_pred[best_sample_idx].tolist()

# 5. 解码：将索引转换为字母
predicted_key_letters = ''.join([chr(i + ord('a')) for i in best_predicted_key_indices])

# 6. (可选) 提取该样本的真实密钥 (Ground Truth) 进行对比
best_true_key_indices = y_va[best_sample_idx].tolist()
true_key_letters = ''.join([chr(i + ord('a')) for i in best_true_key_indices])


print(f"在验证集中，模型预测最强的样本是第 {best_sample_idx} 行 (从0开始计数)。")
print(f"该样本有 {max_correct_chars} 个密文字母预测正确，准确率为 {max_correct_chars / 26 * 100:.2f}%。")

print("\n--- 预测密钥 (Prediction) ---")
print(f"真实明文字母 (Plain): {true_key_letters}")
print(f"预测明文字母 (Plain): {predicted_key_letters}")

在验证集中，模型预测最强的样本是第 395 行 (从0开始计数)。
该样本有 24 个密文字母预测正确，准确率为 92.31%。

--- 预测密钥 (Prediction) ---
真实明文字母 (Plain): sxaioermlvjqyhgfkcbunzpwdt
预测明文字母 (Plain): sxaioermlvjjyhgfkcbunzpwdn


### 随机一个regression的sample的正确率

In [None]:
# 假设 y_pred 是 (Nval, 26) 形状的索引数组
# 1. 提取第一个样本 (第0行) 的索引列表
pred_idx = y_pred[0].tolist()  

# 2. 将索引转换为字母
predict_inv_key = [chr(i + ord('a')) for i in pred_idx]

# 打印结果：
print(f"第一个样本的预测密钥：{''.join(predict_inv_key)}")

第一个样本的预测密钥：ljxmrinhkzwbdidcupqjmteayr


In [None]:
evaluate(A, predict_inv_key, "statistical machine learning")


Accuracy: 0%
statistical machine learning
yrurcyrczuq juzwcdj qjupdcdi
