In [None]:
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import re
import seaborn as sns
from scipy.stats import spearmanr
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu

In [None]:
# Show the names of the matplotlib style sheets available in this environment.
style_names = plt.style.available
print(style_names)

In [None]:
# Load the indicator CSV files.
# All five files are read from ~/Desktop/human_project_data/indicator_csv.
desktop_path = os.path.join(os.path.expanduser('~'), 'Desktop')
directory_path = os.path.join(desktop_path, 'human_project_data', 'indicator_csv')

# File names in the order df1 .. df5.
indicator_file_names = (
    'p1 food item answer indicator.csv',
    'p2 food item answer indicator.csv',
    'p2only food item answer indicator.csv',
    'p2 food dim answer indicator.csv',
    'p2only food dim answer indicator.csv',
)

file_path1, file_path2, file_path3, file_path4, file_path5 = [
    os.path.join(directory_path, file_name) for file_name in indicator_file_names
]

df1, df2, df3, df4, df5 = [
    pd.read_csv(csv_path)
    for csv_path in (file_path1, file_path2, file_path3, file_path4, file_path5)
]

In [None]:
# Violin + strip plots of the "Identified number" column, with pairwise
# Mann-Whitney U tests between the groups.
sns.reset_defaults()


def _mannwhitney_report(name_a, sample_a, name_b, sample_b):
    """Run a Mann-Whitney U test on two samples, print the p-value and return it.

    The printed message names the two samples that were actually compared, so
    the output cannot drift out of sync with the test being run.
    """
    _, p_value = mannwhitneyu(sample_a, sample_b)
    print(f'Mann-Whitney U test between {name_a} and {name_b}: p-value = {p_value}')
    return p_value


def _violin_with_summary(samples, title, grid_alpha):
    """Draw a violin + strip plot for several samples and mark summary statistics.

    Parameters
    ----------
    samples : dict
        Maps a group label (shown on the x axis) to a 1-D array of values.
        Insertion order determines the left-to-right order of the violins.
    title : str
        Figure title.
    grid_alpha : float
        Opacity of the background grid lines.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns "Group" and "Value" that was plotted.
    """
    long_df = pd.DataFrame({
        "Group": [label for label, values in samples.items() for _ in values],
        "Value": np.concatenate(list(samples.values())),
    })

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(x="Group", y="Value", data=long_df, inner=None, color='grey',
                   linewidth=1.5, alpha=0.3, ax=ax)
    sns.stripplot(x="Group", y="Value", data=long_df, jitter=True, size=5,
                  color="blue", alpha=0.6, ax=ax)

    # Overlay median / mean / quartiles; categorical x positions are 0, 1, 2, ...
    for x_pos, values in enumerate(samples.values()):
        summary_markers = (
            ("Median", "red", np.median(values)),
            ("Mean", "green", np.mean(values)),
            ("Q1 (25th Percentile)", "purple", np.percentile(values, 25)),
            ("Q3 (75th Percentile)", "orange", np.percentile(values, 75)),
        )
        for label, color, y_value in summary_markers:
            # Only the first group carries labels so each legend entry appears once.
            ax.scatter(x_pos, y_value, color=color, label=label if x_pos == 0 else "")

    ax.legend()
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=grid_alpha)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel("Group", fontsize=14)
    ax.set_ylabel("Counts", fontsize=14)
    plt.show()
    return long_df


# --- Food item ---------------------------------------------------------------
group1 = df1['Identified number'].to_numpy()  # first group
group2 = df2['Identified number'].to_numpy()  # second group
group3 = df3['Identified number'].to_numpy()  # third group

p_value_1_2 = _mannwhitney_report('group1', group1, 'group2', group2)
p_value_1_3 = _mannwhitney_report('group1', group1, 'group3', group3)
p_value_2_3 = _mannwhitney_report('group2', group2, 'group3', group3)

df = _violin_with_summary(
    {"p1p2-p1": group1, "p1p2-p2": group2, "p2-only": group3},
    "Violin Plot of identified number of food item",
    grid_alpha=0.6,
)

# --- Food dimension ----------------------------------------------------------
group4 = df4['Identified number'].to_numpy()
group5 = df5['Identified number'].to_numpy()

# The original message said "group1 and group2" here although groups 4 and 5 are compared.
p_value_4_5 = _mannwhitney_report('group4', group4, 'group5', group5)

df = _violin_with_summary(
    {"p1p2-p2": group4, "p2-only": group5},
    "Violin Plot of identified number of food dimension",
    grid_alpha=0.7,
)


In [None]:
# Identified number vs. score for df2 (the group plotted as "p2" in the title).
from scipy.stats import pearsonr  # not used in this cell; kept in case other cells rely on the name

x = df2['Identified number'].squeeze()
y = df2['Score'].squeeze()
corr, p_value = spearmanr(x, y)  # Spearman rank correlation
data = pd.DataFrame({"x": x, "y": y})
# Number of rows sharing each (x, y) point; drives marker size and colour.
data["count"] = data.groupby(["x", "y"])["x"].transform("count")

fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(x="x", y="y", size="count", sizes=(20, 200), hue="count",
                palette="viridis", data=data, ax=ax, legend="brief")
ax.legend(
    title="Count",             # legend title
    bbox_to_anchor=(1.05, 1),  # legend box position (x, y), just outside the axes
    loc="upper left",          # corner of the legend placed at the anchor
    borderaxespad=0.           # padding between legend and axes
)
# Linear fit line only; the points are already drawn by scatterplot above.
sns.regplot(x=x, y=y, ax=ax, scatter=False, line_kws={'color': 'grey'})
ax.set_title('scatter plot identified number of food item with score for p2', fontsize=15)
ax.set_xlabel("Identified number ", fontsize=14)
ax.set_ylabel('The highest score of subject', fontsize=14)
# The coefficient comes from spearmanr, so it is labelled as Spearman (was "Pearson").
ax.text(0.1, 0.99, f'Spearman corr: {corr:.2f},P-value:{p_value:.2f}', fontsize=13,
        color='grey', ha='left', va='top', transform=ax.transAxes)
ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
plt.show()



In [None]:
# Mean false positive rate per group, with pairwise t-tests between the groups.
sns.set_theme()

FPR_COLUMN = 'False positive rate'


def _fpr_with_group(frame, label):
    """Return the false-positive-rate column of `frame` as a DataFrame tagged with a 'group' column."""
    return frame[FPR_COLUMN].to_frame().assign(group=label)


def _stack_fpr_groups(labelled_frames):
    """Concatenate labelled FPR frames and drop rows whose rate is missing.

    Missing values are removed so the t-tests below do not return NaN.
    """
    return pd.concat(labelled_frames).dropna(subset=[FPR_COLUMN])


def _plot_and_test_fpr(data, title, ylim=None):
    """Bar + strip plot of the false positive rate per group, plus pairwise t-tests.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with the FPR column and a 'group' column.
    title : str
        Figure title.
    ylim : tuple of (float, float), optional
        y-axis limits; left to matplotlib's automatic choice when None.

    Returns
    -------
    dict
        Maps each compared pair ``(group_a, group_b)`` to its t-test p-value.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    # NOTE(review): `ci=None` is reported as deprecated by seaborn >= 0.12 in favour
    # of `errorbar=None`; kept as-is so the cell still runs on older seaborn versions.
    sns.barplot(data=data, x='group', y=FPR_COLUMN, ci=None, alpha=0.6,
                edgecolor='black', ax=ax)
    sns.stripplot(data=data, x='group', y=FPR_COLUMN, color='lightcoral',
                  alpha=0.7, jitter=True, ax=ax)

    groups = data['group'].unique()
    p_values = {}
    for i, g1 in enumerate(groups):
        for g2 in groups[i + 1:]:
            sample1 = data.loc[data['group'] == g1, FPR_COLUMN]
            sample2 = data.loc[data['group'] == g2, FPR_COLUMN]
            _, p_value = ttest_ind(sample1, sample2)
            # Name both groups of the pair; the original printed groups[i] using the
            # comparison index, which does not identify which pair was tested.
            print(f'{g1} vs {g2}_p_value:{p_value}')
            p_values[(g1, g2)] = p_value

    ax.set_title(title, fontsize=16)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('value', fontsize=14)
    ax.set_xlabel('Group', fontsize=14)
    fig.tight_layout()
    plt.show()
    return p_values


# --- Food item ---------------------------------------------------------------
fpr1 = _fpr_with_group(df1, 'p1p2-p1')
fpr2 = _fpr_with_group(df2, 'p1p2-p2')
fpr3 = _fpr_with_group(df3, 'p2-only')
data = _stack_fpr_groups([fpr1, fpr2, fpr3])
mean_values = data.groupby('group')[FPR_COLUMN].mean()
print(mean_values)  # the per-group mean was computed but never shown before

# Title now matches the plotted quantity (it previously said "Accuracy").
p_values = _plot_and_test_fpr(
    data, 'False positive rate for food item identification', ylim=(0, 1))

# --- Food dimension ----------------------------------------------------------
fpr4 = _fpr_with_group(df4, 'p1p2-p2')
fpr5 = _fpr_with_group(df5, 'p2-only')
# Rows with a missing rate are dropped here too, consistent with the food-item data.
data = _stack_fpr_groups([fpr4, fpr5])
mean_values = data.groupby('group')[FPR_COLUMN].mean()
print(mean_values)

p_values = _plot_and_test_fpr(
    data, 'False positive rate for food dimension identification')


In [None]:
# False positive rate vs. score for df3 (the group plotted as "p2only" in the title).

from scipy.stats import pearsonr  # not used in this cell; kept in case other cells rely on the name
sns.reset_defaults()

x = df3['False positive rate'].squeeze()
y = df3['Score'].squeeze()
# Keep only complete (x, y) pairs. The group-comparison cell of this notebook drops
# NaN from this column, so missing rates are expected; spearmanr would otherwise
# propagate them and return NaN.
data = pd.DataFrame({"x": x, "y": y}).dropna()
corr, p_value = spearmanr(data["x"], data["y"])  # Spearman rank correlation
# Number of rows sharing each (x, y) point; drives marker size and colour.
data["count"] = data.groupby(["x", "y"])["x"].transform("count")

fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(x="x", y="y", size="count", sizes=(20, 200), hue="count",
                palette="viridis", data=data, ax=ax, legend="brief")
ax.legend(
    title="Count",             # legend title
    bbox_to_anchor=(1.05, 1),  # legend box position (x, y), just outside the axes
    loc="upper left",          # corner of the legend placed at the anchor
    borderaxespad=0.           # padding between legend and axes
)
# Linear fit line only; the points are already drawn by scatterplot above.
sns.regplot(x=data["x"], y=data["y"], ax=ax, scatter=False, line_kws={'color': 'grey'})
ax.set_title('False positive rate food item identification with score for p2only', fontsize=15)
ax.set_xlabel("False positive rate ", fontsize=14)
ax.set_ylabel('The highest score of subject', fontsize=14)
# The coefficient comes from spearmanr, so it is labelled as Spearman (was "Pearson").
ax.text(0.1, 0.99, f'Spearman corr: {corr:.2f},P-value:{p_value:.2f}', fontsize=13,
        color='grey', ha='left', va='top', transform=ax.transAxes)
ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
plt.show()

In [None]:
# correct answer 和 full score的关系

