# 死刑舆论情感分析和倾向分析

In [None]:
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt

# 设置图片清晰度
plt.rcParams['figure.dpi'] = 300

# 正常显示中文，设置字体为宋体
plt.rcParams['font.sans-serif'] = ['SimSun']
# 加载数据
df = pd.read_csv("D:\\HuaweiMoveData\\Users\\32549\\OneDrive\\twitter_capital_data.csv")


In [None]:
# 使用 TextBlob 进行情感分析
def get_sentiment_score(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

# 应用函数到 description 列，获取情感得分
df['sentiment_score'] = df['description'].apply(get_sentiment_score)

# 定义一个函数来对情感得分进行分类
def get_sentiment_label(score):
    if score > 0.05:
        return 'Positive'
    elif score < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# 应用函数到 sentiment_score 列，获取情感标签
df['sentiment_label'] = df['sentiment_score'].apply(get_sentiment_label)

# 查看总体情感趋向分布情况
sentiment_distribution = df['sentiment_label'].value_counts(normalize=True) * 100

# 输出结果（保留两位小数）
print(sentiment_distribution.round(2))



# 绘制情感得分直方图
plt.figure()
plt.hist(df['sentiment_score'], bins=30, edgecolor='black')
plt.title('情感得分分布直方图')
plt.xlabel('情感得分')
plt.xticks(rotation=45)
plt.ylabel('频数')

# 显示图表
plt.show()

In [None]:
# 将 datetime 列转换为日期时间类型
df['datetime'] = pd.to_datetime(df['datetime'])

# 按年月统计不同情感的数量
time_series_data = df.groupby([df['datetime'].dt.to_period('M'), 'sentiment_label']).size().unstack(fill_value=0)

# 绘制时间序列图


ax = time_series_data.plot(title='死刑舆情随时间的变化趋势')
ax.set_xlabel('时间')
ax.set_ylabel('数量')
ax.legend(title='情感倾向')

# 显示图表
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("D:\\HuaweiMoveData\\Users\\32549\\OneDrive\\twitter_capital_data.csv")
df["datetime"] = pd.to_datetime(df["datetime"])
df["month_str"] = df["datetime"].dt.strftime("%Y-%m")
df["description"] = df["description"].fillna("").str.lower()

# 按月合并文本
monthly_texts = df.groupby("month_str")["description"].apply(lambda x: " ".join(x)).reset_index()

# 提取关键词频率
vectorizer = CountVectorizer(stop_words="english", max_features=20)
monthly_keywords = []


for text in monthly_texts["description"]:
    X = vectorizer.fit_transform([text])
    monthly_keywords.append(dict(zip(vectorizer.get_feature_names_out(), X.toarray()[0])))

print(monthly_keywords)

# 构建 DataFrame（行为月份，列为关键词）
keywords_df = pd.DataFrame(monthly_keywords)
keywords_df["month"] = monthly_texts["month_str"]
keywords_df.set_index("month", inplace=True)
keywords_df.fillna(0, inplace=True)

# 画堆积图
keywords_df.plot(kind="bar", stacked=True, figsize=(16, 8), colormap="tab20")
plt.title("每月死刑话题关键词频率变化图")
plt.xlabel("月份")
plt.ylabel("词频")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()





In [None]:
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import matplotlib.pyplot as plt

# 读取数据
df = pd.read_csv("D:\\HuaweiMoveData\\Users\\32549\\OneDrive\\twitter_capital_data.csv")
df["description"] = df["description"].fillna("").str.lower()

# 可选：你自己标注 is_sensational 为 True 的行，或基于关键词筛选
keywords = ["execution", "innocent", "appeal", "cop", "rape", "mass shooting", 
            "black", "racial", "jury", "wrongfully", "death row", "victim", "kill", "child", "judge"]

df["is_sensational"] = df["description"].apply(lambda x: any(k in x for k in keywords))

# 提取高关注文本
texts = df[df["is_sensational"] == True]["description"].dropna()
full_text = " ".join(texts)

# 提取词频
vectorizer = CountVectorizer(stop_words='english', max_features=100)
X = vectorizer.fit_transform([full_text])
word_freq = dict(zip(vectorizer.get_feature_names_out(), X.toarray()[0]))
print(word)

# 生成词云
wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='inferno')
wordcloud.generate_from_frequencies(word_freq)

# 可视化
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("死刑重大事件关键词词云")
plt.show()


In [None]:
## 展示每月的数据

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import re

#
df = pd.read_csv("D:\\HuaweiMoveData\\Users\\32549\\OneDrive\\twitter_capital_data.csv")
# 自定义停用词列表
stop_words = set([
    'the', 'and', 'to', 'of', 'a', 'in', 'that', 'it', 'with', 'as', 'for', 'on',
    'is', 'are', 'be', 'was', 'were', 'by', 'at', 'this', 'from', 'or', 'an', 'have',
    'not', 'but', 'has', 'had', 'its', 'their', 'they', 'we', 'you', 'i', 'he', 'she',
    'his', 'her', 'him', 'our', 'your', 'all', 'any', 'no', 'will', 'would', 'can',
    'could', 'may', 'might', 'should', 'these', 'those', 'am', 'been', 'being', 'do',
    'does', 'did', 'so', 'such', 'than', 'then', 'there', 'here', 'when', 'where',
    'why', 'how', 'what', 'which', 'who', 'whom', 'into', 'about', 'between', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
    'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
    'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very','s', 't', 'just',
    'don', 'should', 'now', 'd', 'll','m', 'o','re', 've', 'y', 'ain', 'aren', 'couldn',
    'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn','ma','mightn','mustn', 'needn',
    'shan','shouldn', 'wasn', 'weren', 'won', 'wouldn'
])

# 文本预处理函数
def preprocess_text(text):
    # 转换为小写
    text = text.lower()
    # 移除特殊字符和数字
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # 分词并去除停用词
    tokens = [word for word in text.split() if word not in stop_words]
    # 简单词干化（去除复数形式）
    tokens = [word.rstrip('s') for word in tokens]
    return tokens

# 合并 description 和 title 列的文本
combined_text = df['description'] +' '+ df['title']

# 将日期列转换为 datetime 类型
df['datetime'] = pd.to_datetime(df['datetime'])

# 按月份分组处理数据
for month, group in df.groupby(df['datetime'].dt.to_period('M')):
    print(f"月份: {month}")

    # 对当前月份的组合文本进行预处理
    processed_text = group['description'].fillna('') +' '+ group['title'].fillna('')
    processed_text = processed_text.apply(preprocess_text)

    # 将处理后的文本转换回字符串（用于 TfidfVectorizer）
    corpus = [' '.join(tokens) for tokens in processed_text]

    # 与死刑话题相关的词汇列表
    death_penalty_terms = ['death', 'penalty', 'capital', 'punishment', 'execute', 'abolish', 'convict','sentence']

    # 统计词频
    all_tokens = [token for sublist in processed_text for token in sublist]
    word_freq = Counter(all_tokens)

    # 筛选出与死刑话题相关的词频
    death_penalty_freq = {term: freq for term, freq in word_freq.items()
                          if any(t in term for t in death_penalty_terms)}

    # 按词频降序排序
    sorted_death_penalty_freq = sorted(death_penalty_freq.items(), key=lambda x: x[1], reverse=True)

    # 输出死刑相关词频结果
    print("与死刑话题相关的词频统计（降序）：")
    for term, freq in sorted_death_penalty_freq[:20]:  # 只显示前 20 个高频词
        print(f"{term}: {freq}")

    # 使用 TF - IDF 向量化
    vectorizer = TfidfVectorizer(
        max_df=0.95,  # 忽略在超过 95% 文档中出现的词
        min_df=2,  # 忽略在少于 2 篇文档中出现的词
        max_features=1000,  # 保留最高频的 1000 个词
        stop_words='english'  # 使用 sklearn 内置的英文停用词
    )

    tfidf_matrix = vectorizer.fit_transform(corpus)

    # 获取特征名称（词汇表）
    feature_names = vectorizer.get_feature_names_out()

    # 训练 LDA 模型进行主题建模
    lda = LatentDirichletAllocation(
        n_components=5,  # 主题数量
        max_iter=10,  # 最大迭代次数
        learning_method='online',
        random_state=42
    )

    lda_model = lda.fit(tfidf_matrix)

    # 输出每个主题的前 10 个关键词
    print("\n抽取的中心话题（每个主题前 10 个关键词）：")
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words_idx = topic.argsort()[-10:][::-1]  # 获取前 10 个关键词的索引
        top_words = [feature_names[i] for i in top_words_idx]
        print(f"Topic {topic_idx}: {', '.join(top_words)}")
    print("-" * 50)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 使用前面代码中的df和情感得分

# 定义关键事件
events = [
    {'month': '2017-07', 'event': '死刑执行案例增加', 'sentiment': 'positive'},
    {'month': '2017-08', 'event': '涉毒死刑案件讨论', 'sentiment': 'positive'},
    {'month': '2017-09', 'event': '争议点出现', 'sentiment': 'neutral'},
    {'month': '2017-10', 'event': '恐怖犯罪死刑案件', 'sentiment': 'positive'},
    {'month': '2017-10', 'event': '废除死刑讨论升温', 'sentiment': 'negative'}
]

# 绘制情感趋势图
plt.figure(figsize=(14, 7))
plt.plot(df.index, df['neutral_score'], 'o-', label='中性情感', color='gray', linewidth=2)
plt.plot(df.index, df['positive_score'], '^-', label='正向情感', color='green', linewidth=2)
plt.plot(df.index, df['negative_score'], 's-', label='负向情感', color='red', linewidth=2)

# 添加事件标记
for event in events:
    month_idx = list(df.index).index(event['month'])
    if event['sentiment'] == 'positive':
        score = df['positive_score'][month_idx]
        plt.annotate(event['event'], xy=(month_idx, score), xytext=(month_idx, score + 500),
                     arrowprops=dict(facecolor='green', shrink=0.05), color='green', fontweight='bold')
    elif event['sentiment'] == 'negative':
        score = df['negative_score'][month_idx]
        plt.annotate(event['event'], xy=(month_idx, score), xytext=(month_idx, score + 500),
                     arrowprops=dict(facecolor='red', shrink=0.05), color='red', fontweight='bold')
    else:
        score = df['neutral_score'][month_idx]
        plt.annotate(event['event'], xy=(month_idx, score), xytext=(month_idx, score + 500),
                     arrowprops=dict(facecolor='gray', shrink=0.05), color='gray', fontweight='bold')

plt.title('2017年6-10月死刑话题情感倾向与关键事件关联')
plt.xlabel('月份')
plt.ylabel('关键词词频总和')
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()
plt.tight_layout()
plt.show()


# 美国死刑记录假设检验

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
import matplotlib

# 设置全局字体为宋体（SimSun）
matplotlib.rcParams['font.sans-serif'] = ['SimSun']  # 中文宋体
matplotlib.rcParams['axes.unicode_minus'] = False    # 解决负号显示问题

# 读取数据
file_path = "D:\\HuaweiMoveData\\Users\\32549\\OneDrive\\executions-to-2002 (2).csv"  # 替换为你的文件路径
df = pd.read_csv(file_path)

# 选取相关字段并清洗缺失值
df_clean = df[['race', 'SexOfOffender', 'STATE OF CONVICTION']].dropna()

# 清洗字段内容：提取括号后的真实标签（部分数据格式为 "(代码) Label"）
df_clean['race'] = df_clean['race'].str.extract(r'\)\s*(.*)$')[0].fillna(df_clean['race'])
df_clean['SexOfOffender'] = df_clean['SexOfOffender'].str.extract(r'\)\s*(.*)$')[0].fillna(df_clean['SexOfOffender'])
df_clean['STATE OF CONVICTION'] = df_clean['STATE OF CONVICTION'].str.extract(r'\)\s*(.*)$')[0].fillna(df_clean['STATE OF CONVICTION'])

# ====== 可视化每个变量的分布 ======
race_counts = df_clean['race'].value_counts()
sex_counts = df_clean['SexOfOffender'].value_counts()
state_counts = df_clean['STATE OF CONVICTION'].value_counts().head(10)
# 马卡龙配色
custom_colors = [
    "#a1c6ea",  # 柔蓝
    "#fcbad3",  # 烟粉
    "#ffdac1",  # 奶杏
    "#b5ead7",  # 薄荷绿
    "#c7ceea",  # 淡紫
    "#ffb5a7",  # 温柔珊瑚
    "#b8bedd",  # 烟蓝紫
    "#f7b7a3",  # 桃粉橘
]

# 可视化每个变量的分布
plt.figure(figsize=(20, 5))

plt.subplot(1, 3, 1)
sns.barplot(x=race_counts.values, y=race_counts.index, palette=custom_colors)
plt.title('种族分布')

plt.subplot(1, 3, 2)
sns.barplot(x=sex_counts.values, y=sex_counts.index, palette=custom_colors)
plt.title('性别分布')

plt.subplot(1, 3, 3)
sns.barplot(x=state_counts.values, y=state_counts.index, palette=custom_colors)
plt.title('定罪州前10名分布')

plt.tight_layout()
plt.show()

# ====== 卡方检验分析 ======

# 1. 种族与性别的关系
race_sex_table = pd.crosstab(df_clean['race'], df_clean['SexOfOffender'])
chi2_race_sex, p_race_sex, _, _ = chi2_contingency(race_sex_table)

# 2. 种族与定罪州的关系
race_state_table = pd.crosstab(df_clean['race'], df_clean['STATE OF CONVICTION'])
chi2_race_state, p_race_state, _, _ = chi2_contingency(race_state_table)

# 3. 性别与定罪州的关系
sex_state_table = pd.crosstab(df_clean['SexOfOffender'], df_clean['STATE OF CONVICTION'])
chi2_sex_state, p_sex_state, _, _ = chi2_contingency(sex_state_table)

# 输出结果
print("卡方检验 p 值：")
print(f"1. 种族 与 性别：{p_race_sex:.4f}")
print(f"2. 种族 与 定罪州：{p_race_state:.4e}")
print(f"3. 性别 与 定罪州：{p_sex_state:.4e}")


# 聚类分析——正负两类

In [None]:
import pandas as pd
import jieba
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from matplotlib.colors import ListedColormap
import matplotlib.font_manager as fm

# 设置中文显示
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False    # 用来正常显示负号


In [None]:


# 文件读取地址
file_path = r"D:\HuaweiMoveData\Users\32549\OneDrive\大二下\数据科学与数据分析\小组作业\死刑和遗言数据.csv"

# 加载数据
try:
    data = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')

# 去除缺失值
data = data.dropna(subset=['last statement'])

# 加载停用词
with open("D:\\Users\\32549\\PycharmProjects\\数据科学聚类\\stopwords.txt", 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f.readlines()]

# 更完善的文本预处理函数
def preprocess_text(text):
    # 去除特殊字符
    text = re.sub(r'[^\w\s]', '', text)
    # 分词
    words = jieba.lcut(text)
    # 词性标注，保留形容词、副词等可能体现情感的词性
    import jieba.posseg as pseg
    words = [word for word, flag in pseg.lcut(text) if flag in ['a', 'ad', 'd'] or word not in stopwords]
    return " ".join(words)

# 对遗言进行预处理
data['last statement_preprocessed'] = data['last statement'].apply(preprocess_text)

# 使用 TF-IDF 进行文本向量化
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(data['last statement_preprocessed'])

# 增加情感分析特征
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

data['sentiment_score'] = data['last statement_preprocessed'].apply(get_sentiment)
X_sentiment = data['sentiment_score'].values.reshape(-1, 1)

# 合并特征
from scipy.sparse import hstack
X = hstack([X_tfidf, X_sentiment])

# 寻找最优聚类数
silhouette_scores = []
for n_cluster in range(2, 10):
    kmeans = KMeans(n_clusters=n_cluster, random_state=42)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    silhouette_scores.append(score)

best_n_cluster = silhouette_scores.index(max(silhouette_scores)) + 2

# 使用最优聚类数进行 K-Means 聚类
kmeans = KMeans(n_clusters=best_n_cluster, random_state=42)
data['cluster_label'] = kmeans.fit_predict(X)

kmeans = KMeans(n_clusters=best_n_cluster, random_state=42)
kmeans.fit(X)  # 训练全部数据
data['cluster_label'] = kmeans.labels_

# 分析每个聚类的特征，结合其他信息构建用户画像
for cluster in range(best_n_cluster):
    cluster_data = data[data['cluster_label'] == cluster]
    print(f"聚类 {cluster} 的用户画像：")
    print(f"平均年龄：{cluster_data['age'].mean():.2f}")
    print(f"主要种族：{cluster_data['race'].value_counts().idxmax()}")
    print(f"主要性别：{cluster_data['gender'].value_counts().idxmax()}")
    print(f"主要教育程度：{cluster_data['education level'].value_counts().idxmax()}")
    print(f"主要职业：{cluster_data['prior occupation'].value_counts().idxmax()}")
    print(f"有监狱记录比例：{cluster_data['prior prison record(0/1)'].mean():.2%}")
    print(f"平均情感得分：{cluster_data['sentiment_score'].mean():.2f}")
    print("典型遗言示例：")
    print(cluster_data['last statement'].iloc[0])
    print("-" * 50)

In [None]:
# 添加模型性能

import pandas as pd
import re
import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

# 文件读取地址
file_path = r"D:\HuaweiMoveData\Users\32549\OneDrive\大二下\数据科学与数据分析\小组作业\死刑和遗言数据.csv"

# 加载数据
try:
    data = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')

# 去除缺失值
data = data.dropna(subset=['last statement'])

# 加载停用词
with open("D:\\Users\\32549\\PycharmProjects\\数据科学聚类\\stopwords.txt", 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f.readlines()]

# 更完善的文本预处理函数
def preprocess_text(text):
    # 去除特殊字符
    text = re.sub(r'[^\w\s]', '', text)
    # 词性标注，保留形容词、副词等可能体现情感的词性
    words = [word for word, flag in pseg.lcut(text) if flag in ['a', 'ad', 'd'] or word not in stopwords]
    return " ".join(words)

# 对遗言进行预处理
data['last statement_preprocessed'] = data['last statement'].apply(preprocess_text)

# 使用 TF-IDF 进行文本向量化
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(data['last statement_preprocessed'])

# 增加情感分析特征
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

data['sentiment_score'] = data['last statement_preprocessed'].apply(get_sentiment)
X_sentiment = data['sentiment_score'].values.reshape(-1, 1)

# 合并特征
X = hstack([X_tfidf, X_sentiment])

# 寻找最优聚类数
silhouette_scores = []
for n_cluster in range(2, 10):
    kmeans = KMeans(n_clusters=n_cluster, random_state=42)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    silhouette_scores.append(score)

best_n_cluster = silhouette_scores.index(max(silhouette_scores)) + 2
print(f"最优聚类数: {best_n_cluster}")

# 使用最优聚类数进行 K-Means 聚类（对全部数据）
kmeans = KMeans(n_clusters=best_n_cluster, random_state=42)
kmeans.fit(X)
data['cluster_label'] = kmeans.labels_

# --- 新增：训练集/测试集划分及性能输出 ---
X_train, X_test = train_test_split(X, test_size=0.3, random_state=42)

kmeans_train_test = KMeans(n_clusters=best_n_cluster, random_state=42)
kmeans_train_test.fit(X_train)

train_labels = kmeans_train_test.labels_
test_labels = kmeans_train_test.predict(X_test)

train_silhouette = silhouette_score(X_train, train_labels)
test_silhouette = silhouette_score(X_test, test_labels)

print(f"训练集轮廓系数: {train_silhouette:.4f}")
print(f"测试集轮廓系数: {test_silhouette:.4f}")

# 分析每个聚类的特征，结合其他信息构建用户画像
for cluster in range(best_n_cluster):
    cluster_data = data[data['cluster_label'] == cluster]
    print(f"聚类 {cluster} 的用户画像：")
    print(f"平均年龄：{cluster_data['age'].mean():.2f}")
    print(f"主要种族：{cluster_data['race'].value_counts().idxmax()}")
    print(f"主要性别：{cluster_data['gender'].value_counts().idxmax()}")
    print(f"主要教育程度：{cluster_data['education level'].value_counts().idxmax()}")
    print(f"主要职业：{cluster_data['prior occupation'].value_counts().idxmax()}")
    print(f"有监狱记录比例：{cluster_data['prior prison record(0/1)'].mean():.2%}")
    print(f"平均情感得分：{cluster_data['sentiment_score'].mean():.2f}")
    print("典型遗言示例：")
    print(cluster_data['last statement'].iloc[0])
    print("-" * 50)


In [None]:
# 可视化
# 可视化1: 聚类结果的二维可视化 (PCA降维)
def plot_clusters_pca(X, labels, n_clusters):
    # 使用PCA降维到2D
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X.toarray() if hasattr(X, 'toarray') else X)
    
    # 创建散点图
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, 
                          cmap='viridis', alpha=0.7, s=50)
    
    # 添加图例
    legend = plt.legend(*scatter.legend_elements(),
                       title="聚类",
                       loc="upper right")
    plt.gca().add_artist(legend)
    
    # 添加聚类中心
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    centers_pca = pca.transform(kmeans.cluster_centers_)
    plt.scatter(centers_pca[:, 0], centers_pca[:, 1], 
                c='red', marker='X', s=200, label='聚类中心')
   
    plt.title(f'K-means聚类结果 (PCA降维)')
    plt.xlabel('主成分1')
    plt.ylabel('主成分2')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('clusters_pca.png', dpi=300, bbox_inches='tight')
    plt.show()

# 可视化2: 聚类中心词云
def plot_cluster_wordclouds(data, n_clusters, vectorizer):
    # 获取特征名称
    feature_names = vectorizer.get_feature_names_out()
    
    # 为每个聚类生成词云
    fig, axes = plt.subplots(1, n_clusters, figsize=(5*n_clusters, 5))
    if n_clusters == 1:
        axes = [axes]
    
    for cluster in range(n_clusters):
        # 获取该聚类的TF-IDF权重
        cluster_indices = np.where(data['cluster_label'] == cluster)[0]
        cluster_tfidf = X_tfidf[cluster_indices].toarray().sum(axis=0)
        
        # 创建词-权重字典
        word_weights = {feature_names[i]: cluster_tfidf[i] 
                       for i in range(len(feature_names)) 
                       if cluster_tfidf[i] > 0}
        
        # 生成词云
        wordcloud = WordCloud(width=400, height=400, 
                             background_color='white',
                             max_words=100,
                             contour_width=3,
                             contour_color='steelblue')
        wordcloud.generate_from_frequencies(word_weights)
        
        # 显示词云
        axes[cluster].imshow(wordcloud, interpolation='bilinear')
        axes[cluster].set_title(f'聚类 {cluster} 的关键词')
        axes[cluster].axis('off')
    
    plt.tight_layout()
    plt.savefig('cluster_wordclouds.png', dpi=300, bbox_inches='tight')
    plt.show()



# 可视化3: 情感分析可视化
def plot_sentiment_analysis(data, n_clusters):
    plt.figure(figsize=(12, 6))
    
    # 箱线图
    sns.boxplot(x='cluster_label', y='sentiment_score', data=data, palette='viridis')
    plt.title('各聚类的情感得分分布')
    plt.xlabel('聚类')
    plt.ylabel('情感得分')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('sentiment_boxplot.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # 直方图
    plt.figure(figsize=(14, 6))
    for cluster in range(n_clusters):
        sns.histplot(data[data['cluster_label'] == cluster]['sentiment_score'], 
                    kde=True, label=f'聚类 {cluster}', alpha=0.5)
    plt.title('各聚类的情感得分分布直方图')
    plt.xlabel('情感得分')
    plt.ylabel('频率')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('sentiment_histogram.png', dpi=300, bbox_inches='tight')
    plt.show()

# 执行可视化
plot_clusters_pca(X, data['cluster_label'], best_n_cluster)
plot_cluster_wordclouds(data, best_n_cluster, vectorizer)
plot_cluster_statistics(data, best_n_cluster)
plot_sentiment_analysis(data, best_n_cluster)


# 聚类分析1——使用bert模型正负两类 更优

正在使用BERT进行文本向量化...


100%|██████████| 29/29 [02:12<00:00,  4.58s/it]


聚类数 2, 轮廓系数: 0.1700
聚类数 3, 轮廓系数: 0.1376
聚类数 4, 轮廓系数: 0.1046
聚类数 5, 轮廓系数: 0.1037
聚类数 6, 轮廓系数: 0.0732
聚类数 7, 轮廓系数: 0.0684
聚类数 8, 轮廓系数: 0.0907
聚类数 9, 轮廓系数: 0.0886
最优聚类数: 2

聚类 0 的用户画像：
样本数量：229
平均年龄：39.28
主要种族：White
主要性别：male
主要教育程度：11
主要职业：laborer
有监狱记录比例：53.28%
平均情感得分：0.20
代表性关键词： family, im, know, like, love, sorry, thank, want, would, yes
典型遗言示例：
  1. Yes Warden, I would like to tell the family of the victim that I could never figure out the words to...
  2. Yes ma’am, I want to thank y’all. I love y’all for supporting me. I want to apologize for the wrong ...
  3. Yes, I just want to thank (pause) I don’t want to leave you baby, see you when you get there. I love...
--------------------------------------------------------------------------------

聚类 1 的用户画像：
样本数量：224
平均年龄：40.00
主要种族：White
主要性别：male
主要教育程度：12
主要职业：laborer
有监狱记录比例：49.55%
平均情感得分：0.16
代表性关键词： family, god, know, like, love, sorry, thank, want, would, yall
典型遗言示例：
  1. There is not a day that goes by that I don't regret my

# 聚类分析——正向分析

In [None]:
import pandas as pd
import jieba
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
import numpy as np

# 文件读取地址
file_path = r"D:\HuaweiMoveData\Users\32549\OneDrive\大二下\数据科学与数据分析\小组作业\死刑和遗言数据.csv"

# 加载数据
try:
    data = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')

# 去除缺失值
data = data.dropna(subset=['last statement'])

# 文本预处理函数
def preprocess_text(text):
    # 去除特殊字符
    text = re.sub(r'[^\w\s]', '', text)
    # 分词
    words = jieba.lcut(text)
    # 假设没有停用词表，这里简单去除单个字符的词
    words = [word for word in words if len(word) > 1]
    return " ".join(words)

# 对遗言进行预处理
data['last statement_preprocessed'] = data['last statement'].apply(preprocess_text)

# TF-IDF 向量化
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['last statement_preprocessed'])

# 划分训练集和测试集（用于验证聚类性能）
X_train, X_test, data_train, data_test = train_test_split(X, data, test_size=0.3, random_state=42)

# 聚类数（可根据实际情况调优）
num_clusters = 5

# 训练集上训练 K-Means
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
train_labels = kmeans.fit_predict(X_train)

# 测试集聚类标签（预测）
test_labels = kmeans.predict(X_test)

# 计算轮廓系数
train_silhouette = silhouette_score(X_train, train_labels)
test_silhouette = silhouette_score(X_test, test_labels)

print(f"训练集轮廓系数: {train_silhouette:.4f}")
print(f"测试集轮廓系数: {test_silhouette:.4f}")

# 对所有数据重新打标签用于聚类画像分析
data['cluster_label'] = kmeans.predict(X)

# 分析每个聚类的特征，结合其他信息构建用户画像
for cluster in range(num_clusters):
    cluster_data = data[data['cluster_label'] == cluster]
    print(f"\n聚类 {cluster} 的用户画像：")
    print(f"平均年龄：{cluster_data['age'].mean():.2f}")
    print(f"主要种族：{cluster_data['race'].value_counts().idxmax()}")
    print(f"主要性别：{cluster_data['gender'].value_counts().idxmax()}")
    print(f"主要教育程度：{cluster_data['education level'].value_counts().idxmax()}")
    print(f"主要职业：{cluster_data['prior occupation'].value_counts().idxmax()}")
    print(f"有监狱记录比例：{cluster_data['prior prison record(0/1)'].mean():.2%}")
    print("典型遗言示例：")
    print(cluster_data['last statement'].iloc[0])
    print("-" * 50)


In [None]:
# 可视化1: 聚类结果的二维可视化 (PCA降维)
def plot_clusters_pca(X, labels, n_clusters):
    # 使用PCA降维到2D
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X.toarray())
    
    # 创建散点图
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, 
                          cmap='viridis', alpha=0.7, s=50)
    
    # 添加图例
    legend = plt.legend(*scatter.legend_elements(),
                       title="聚类",
                       loc="upper right")
    plt.gca().add_artist(legend)
    
    # 添加聚类中心
    centers_pca = pca.transform(kmeans.cluster_centers_)
    plt.scatter(centers_pca[:, 0], centers_pca[:, 1], 
                c='red', marker='X', s=200, label='聚类中心')
  
    plt.title(f'K-means聚类结果 (PCA降维)')
    plt.xlabel('主成分1')
    plt.ylabel('主成分2')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('clusters_pca.png', dpi=300, bbox_inches='tight')
    plt.show()

# 可视化2: 聚类中心词云
def plot_cluster_wordclouds(data, n_clusters, vectorizer):
    # 获取特征名称
    feature_names = vectorizer.get_feature_names_out()
    
    # 为每个聚类生成词云
    fig, axes = plt.subplots(1, n_clusters, figsize=(5*n_clusters, 5))
    if n_clusters == 1:
        axes = [axes]
    
    for cluster in range(n_clusters):
        # 获取该聚类的TF-IDF权重
        cluster_indices = np.where(data['cluster_label'] == cluster)[0]
        cluster_tfidf = X[cluster_indices].toarray().sum(axis=0)
        
        # 创建词-权重字典
        word_weights = {feature_names[i]: cluster_tfidf[i] 
                       for i in range(len(feature_names)) 
                       if cluster_tfidf[i] > 0}
        
        # 生成词云
        wordcloud = WordCloud(width=400, height=400, 
                             background_color='white',
                             max_words=100,
                             contour_width=3,
                             contour_color='steelblue')
        wordcloud.generate_from_frequencies(word_weights)
        
        # 显示词云
        axes[cluster].imshow(wordcloud, interpolation='bilinear')
        axes[cluster].set_title(f'聚类 {cluster} 的关键词')
        axes[cluster].axis('off')
    
    plt.tight_layout()
    plt.savefig('cluster_wordclouds.png', dpi=300, bbox_inches='tight')
    plt.show()

# 可视化3: 聚类统计特征对比
def plot_cluster_statistics(data, n_clusters):
    # 准备数据
    stats = {}
    numeric_cols = ['age', 'prior prison record(0/1)']
    categorical_cols = ['race', 'gender', 'education level', 'prior occupation']
    
    for col in numeric_cols:
        stats[col] = [data[data['cluster_label'] == cluster][col].mean() 
                     for cluster in range(n_clusters)]
    
    # 创建雷达图
    categories = list(stats.keys())
    N = len(categories)
    
    # 创建角度
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # 闭合雷达图
    
    # 创建图表
    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
    
    # 绘制每个聚类的雷达图
    for cluster in range(n_clusters):
        values = [stats[col][cluster] for col in categories]
        values += values[:1]  # 闭合雷达图
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=f'聚类 {cluster}')
        ax.fill(angles, values, alpha=0.1)
    
    # 设置坐标轴
    ax.set_thetagrids(np.degrees(angles[:-1]), categories)
    ax.set_title('各聚类的统计特征对比', size=15, y=1.1)
    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    
    plt.savefig('cluster_statistics_radar.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # 为分类特征创建柱状图
    for col in categorical_cols:
        plt.figure(figsize=(12, 6))
        sns.countplot(x=col, hue='cluster_label', data=data, palette='viridis')
        plt.title(f'不同聚类的{col}分布')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'cluster_{col}_distribution.png', dpi=300, bbox_inches='tight')
        plt.show()

# 执行可视化
plot_clusters_pca(X, data['cluster_label'], num_clusters)
plot_cluster_wordclouds(data, num_clusters, vectorizer)
plot_cluster_statistics(data, num_clusters)


In [6]:
import pandas as pd
import jieba
import re
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from transformers import RobertaTokenizer, RobertaModel
import torch
import numpy as np
from snownlp import SnowNLP  # 提前导入SnowNLP

# 文件读取地址
file_path = r"D:\HuaweiMoveData\Users\32549\OneDrive\大二下\数据科学与数据分析\小组作业\死刑和遗言数据.csv"

# 加载数据
try:
    data = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')

# 去除缺失值
data = data.dropna(subset=['last statement'])

def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    words = jieba.lcut(text)
    words = [word for word in words if len(word) > 1]  # 保留双字词以上
    return " ".join(words)

data['last statement_preprocessed'] = data['last statement'].apply(preprocess_text)

# ----------------------- BERT向量化 -----------------------
# 加载优化版中文RoBERTa模型（全词掩码+更多训练数据）
tokenizer = RobertaTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
model = RobertaModel.from_pretrained("hfl/chinese-roberta-wwm-ext")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 批量获取句子嵌入（使用CLS token）
def get_bert_embeddings(texts, batch_size=16):  # 减小batch_size
    embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            encoded = tokenizer(
                batch, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(device)
            outputs = model(**encoded)
            cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # 取CLS向量
            embeddings.extend(cls_embedding)
    return np.array(embeddings)

# 生成BERT特征矩阵
print("生成RoBERTa特征...")
X_bert = get_bert_embeddings(data['last statement_preprocessed'].tolist())

# ----------------------- 聚类分析 -----------------------
# 划分训练集/测试集（仅用于评估聚类稳定性）
X_train, X_test, data_train, data_test = train_test_split(
    X_bert, data, test_size=0.3, random_state=42
)

# 辅助函数：中文情感分析（提前定义）
def get_sentiment_mean(cluster_data):
    sentiments = [SnowNLP(text).sentiments for text in cluster_data['last statement_preprocessed']]
    return np.mean(sentiments).round(2)

# 寻找最优聚类数（轮廓系数法）
silhouette_scores = []
for n_cluster in range(2, 10):
    kmeans = KMeans(n_clusters=n_cluster, random_state=42, n_init=10)
    train_labels = kmeans.fit_predict(X_train)
    test_labels = kmeans.predict(X_test)
    train_score = silhouette_score(X_train, train_labels)
    test_score = silhouette_score(X_test, test_labels)
    silhouette_scores.append((n_cluster, train_score, test_score))
    print(f"聚类数{n_cluster}: 训练集得分{train_score:.4f}, 测试集得分{test_score:.4f}")

# 选择训练集+测试集得分最均衡的聚类数
best_n_cluster = max(silhouette_scores, key=lambda x: (x[1]+x[2])/2)[0]
print(f"\n最优聚类数: {best_n_cluster}")

# 重新训练全量数据
kmeans = KMeans(n_clusters=best_n_cluster, random_state=42, n_init=10)
data['cluster_label'] = kmeans.fit_predict(X_bert)

# ----------------------- 聚类画像分析 -----------------------
for cluster in range(best_n_cluster):
    cluster_data = data[data['cluster_label'] == cluster]
    print(f"\n聚类 {cluster} 分析（样本量:{len(cluster_data)}）:")
    print(f"  平均年龄: {cluster_data['age'].mean():.1f} 岁")
    print(f"  主要种族: {cluster_data['race'].value_counts().idxmax()}")
    gender_dist = cluster_data['gender'].value_counts(normalize=True).mul(100).round(1)
    print(f"  性别分布: {gender_dist.to_dict()}%")  # 修正输出格式
    print(f"  教育程度: {cluster_data['education level'].value_counts().idxmax()}")
    print(f"  犯罪特征: 监狱记录比例{cluster_data['prior prison record(0/1)'].mean():.1%}")
    print(f"  情感倾向: 平均长度{cluster_data['last statement_preprocessed'].str.len().mean():.1f}字 / 平均情感得分{get_sentiment_mean(cluster_data)}")
    
    # 提取高频关键词（TF-IDF top10）
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(max_features=10)
    try:
        tfidf_matrix = tfidf.fit_transform(cluster_data['last statement_preprocessed'])
        keywords = tfidf.get_feature_names_out()
        print(f"  核心关键词: {', '.join(keywords)}")
    except Exception as e:
        print(f"  关键词提取失败: {e}")
    
    # 典型遗言示例
    print("  典型遗言:")
    for text in cluster_data['last statement'].head(2):
        print(f"    - {text[:80]}...")
    print("-"*80)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


TypeError: expected str, bytes or os.PathLike object, not NoneType