In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 创建一个数据框来模拟用户的答题记录
np.random.seed(42)  # 保持结果的一致性

data = {
    "user_id": np.random.randint(1, 100, 500),
    "question_id": np.random.randint(1, 500, 500),
    "question_type": np.random.choice(["correct", "wrong", "unseen", "unlearned"], 500),
    "correct": np.random.randint(0, 2, 500),  # 1 表示对，0 表示错
}

df = pd.DataFrame(data)


# 利用LabelEncoder处理分类数据
le = LabelEncoder()
df["question_type_encoded"] = le.fit_transform(df["question_type"])


# 准备数据
X = df[["question_type_encoded"]]  # 特征
y = df["correct"]  # 标签

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 创建和训练模型
model = LogisticRegression()
model.fit(X_train, y_train)

# 预测和评估
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# 假设有新的题目类型数据
new_questions = pd.DataFrame(
    {"question_type": ["correct", "wrong", "unseen", "unlearned"]}
)

new_questions["question_type_encoded"] = le.transform(new_questions["question_type"])
new_predictions = model.predict_proba(new_questions[["question_type_encoded"]])

# 显示预测结果
prediction_df = pd.DataFrame(new_predictions, columns=["Prob_Wrong", "Prob_Correct"])
result_df = pd.concat([new_questions, prediction_df], axis=1)
print(result_df)


# 假设我们想要最大化正确答案的概率
result_df["Recommended_Probability"] = (
    result_df["Prob_Correct"] / result_df["Prob_Correct"].sum()
)
result_df["Recommended_Probability"] *= 100  # 转化为百分比
print(result_df[["question_type", "Recommended_Probability"]])


Accuracy: 0.54
  question_type  question_type_encoded  Prob_Wrong  Prob_Correct
0       correct                      0    0.527025      0.472975
1         wrong                      3    0.508935      0.491065
2        unseen                      2    0.514970      0.485030
3     unlearned                      1    0.521001      0.478999
  question_type  Recommended_Probability
0       correct                24.530998
1         wrong                25.469274
2        unseen                25.156259
3     unlearned                24.843470
