In [3]:
import pandas as pd
import numpy as np
import os

# Seed NumPy's global RNG so the sequence of shuffled label vectors is repeatable.
np.random.seed(0)

# Group sizes and the number of label-permuted files to write.
N_HC = 63
N_SLE = 124
N_PERMUTATIONS = 10000

# Output files are numbered from 1; the placeholder receives that number.
OUTPUT_TEMPLATE = 'D:\\yanyi\\perm\\60BLRpos\\your_modified_file_{}.csv'

# Load the data table; the file has no header row.
df = pd.read_csv('D:/yanyi/PCNtoolkit-demo-main/data/BLR60/Zsumpos.csv', header=None)  # replace with the actual file path

# Unshuffled label vector: every 'HC' entry followed by every 'SLE' entry.
base_tags = np.concatenate((np.repeat('HC', N_HC), np.repeat('SLE', N_SLE)))

# One label is needed per data row; fail before any file is written rather
# than with a pandas length-mismatch error inside the loop.
if len(df) != len(base_tags):
    raise ValueError(
        f"Expected {len(base_tags)} rows ({N_HC} HC + {N_SLE} SLE), found {len(df)}."
    )

# The output directory does not change between iterations, so create it once.
os.makedirs(os.path.dirname(OUTPUT_TEMPLATE), exist_ok=True)

for i in range(N_PERMUTATIONS):
    # permutation() shuffles a copy, so base_tags keeps its original order.
    shuffled_tags = np.random.permutation(base_tags)

    # assign() returns a new frame with the labels as the last column; df itself
    # is never modified, so no clean-up is needed before the next iteration.
    df.assign(Tag=shuffled_tags).to_csv(
        OUTPUT_TEMPLATE.format(i + 1), index=False, header=False
    )

In [4]:

import pandas as pd
import os
import glob

# Input folder (label-permuted files) and output folder (same files with
# per-group column averages appended).
source_folder = 'D:/yanyi/perm/60BLRpos'
destination_folder = 'D:/yanyi/perm/60BLRpos2'

# Create the output folder if it is not there yet.
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Every CSV file directly inside the input folder.
csv_files = glob.glob(os.path.join(source_folder, '*.csv'))

# (output column name, label value) for each group, in output column order.
GROUP_COLUMNS = (('SLE Averages', 'SLE'), ('HC Averages', 'HC'))

for file_path in csv_files:
    df = pd.read_csv(file_path, header=None)

    # The last column holds the group label; all other columns are data.
    labels = df.iloc[:, -1]
    features = df.iloc[:, :-1]

    # Column-wise mean over the rows of each group, ignoring missing values.
    # Each resulting Series is indexed by the data-column labels.
    averages_df = pd.DataFrame({
        column_name: features[labels == group].mean(axis=0, skipna=True)
        for column_name, group in GROUP_COLUMNS
    })

    # Side-by-side concat aligns on the index, so the averages land in the
    # leading rows of two extra columns to the right of the original table.
    final_df = pd.concat([df, averages_df], axis=1)

    # Same file name, written to the output folder (header row included).
    new_file_path = os.path.join(destination_folder, os.path.basename(file_path))
    final_df.to_csv(new_file_path, index=False)

In [5]:
# NOTE(review): the cells in view import the standard-library `glob` module,
# not `glob2` (a separate third-party package), so this install looks
# unnecessary for them -- confirm nothing else in the notebook imports glob2.
pip install glob2

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import glob
import os

# Folder with the per-permutation files. Each file is expected to have a
# header row and to end with a label column followed by the 'SLE Averages'
# and 'HC Averages' columns.
folder_path = 'D:/yanyi/perm/60BLRpos2'

# Averages column -> combined output file. Both groups are collected in one
# pass over the files; the SLE file is written last.
output_paths = {
    'HC Averages': 'D:/yanyi/perm/60BLRresultpos/combined_hc_average.csv',
    'SLE Averages': 'D:/yanyi/perm/60BLRresultpos/combined_sle_average.csv',
}

# Columns that are not data columns: the label column and the two averages.
N_EXTRA_COLUMNS = 3

# Sorted so the column order of the combined files is deterministic.
file_paths = sorted(glob.glob(os.path.join(folder_path, '*.csv')))
if not file_paths:
    raise FileNotFoundError(f"No CSV files found in '{folder_path}'.")

collected = {column_name: [] for column_name in output_paths}

for file_path in file_paths:
    # header=0: the first line holds column names. Reading it as data would
    # put the name text into the values and turn the column into strings.
    df = pd.read_csv(file_path, header=0)

    # The averages columns hold one value per data column, stored in the
    # leading rows; any rows after that are padding and are dropped here.
    n_features = df.shape[1] - N_EXTRA_COLUMNS

    for column_name in output_paths:
        collected[column_name].append(
            df[column_name].iloc[:n_features].reset_index(drop=True)
        )

for column_name, output_path in output_paths.items():
    # Single concat per group: one column per input file, one row per data column.
    combined_df = pd.concat(collected[column_name], axis=1)

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_df.to_csv(output_path, index=False, header=False)

In [9]:
import os

import numpy as np
import pandas as pd

# Inputs: one row per feature, one column per permuted dataset.
file_path1 = 'D:/yanyi/perm/60BLRresultpos/combined_hc_average.csv'   # HC averages
file_path2 = 'D:/yanyi/perm/60BLRresultpos/combined_sle_average.csv'  # SLE averages
output_csv_path = 'D:/yanyi/perm/resultdelta/random_diffs60HCBLRpos.csv'

# Number of random (SLE - HC) differences drawn per feature.
N_DRAWS = 10000

# Neither file has a header row.
data1 = pd.read_csv(file_path1, header=None)
data2 = pd.read_csv(file_path2, header=None)

# Both files must describe the same set of features.
if len(data1) != len(data2):
    raise ValueError("两个CSV文件的行数必须相同。")

num_rows = len(data1)  # number of features

# Plain float arrays; this raises immediately if a file contains non-numeric text.
hc_values = data1.to_numpy(dtype=float)
sle_values = data2.to_numpy(dtype=float)

# One output column per feature, one output row per draw.
results = np.empty((N_DRAWS, num_rows), dtype=float)

# No seed is set here: the draws come from NumPy's global RNG in whatever
# state earlier cells left it.
# NOTE(review): each draw pairs an SLE average and an HC average from
# independently chosen columns. If the columns of the two files correspond to
# the same permutations, subtracting matching columns would give the usual
# permutation-test null distribution -- confirm which is intended.
for i in range(num_rows):
    # Each index is bounded by the width of the array it is used on, so the
    # two files may have different numbers of columns.
    sle_idx = np.random.randint(0, sle_values.shape[1], size=N_DRAWS)
    hc_idx = np.random.randint(0, hc_values.shape[1], size=N_DRAWS)
    results[:, i] = sle_values[i, sle_idx] - hc_values[i, hc_idx]

results_df = pd.DataFrame(results)

# Write the differences without row or column labels.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
results_df.to_csv(output_csv_path, index=False, header=False)

print(f"{N_DRAWS}行 x {num_rows}列的随机减法结果已保存至 '{output_csv_path}' 文件。")

10000行 x 148列的随机减法结果已保存至 'D:/yanyi/perm/resultdelta/random_diffs60HCBLRpos.csv' 文件。


In [10]:
import pandas as pd

# Inputs: the random differences (one column per feature) and the observed
# differences (first row is used, one value per feature).
file_path1 = 'D:/yanyi/perm/resultdelta/random_diffs60HCBLRpos.csv'
file_path2 = 'D:/yanyi/perm/60BLRresultpos/pos_realdelta.csv'
output_csv_path = 'D:/yanyi/perm/resultdelta/comparison_results60HCBLRpos.csv'

# Number of feature columns both files must have.
EXPECTED_COLUMNS = 148

# Neither file has a header row.
data1 = pd.read_csv(file_path1, header=None)
data2 = pd.read_csv(file_path2, header=None)

if data1.shape[1] != EXPECTED_COLUMNS or data2.shape[1] != EXPECTED_COLUMNS:
    raise ValueError(f"CSV文件必须各有{EXPECTED_COLUMNS}列。")

# Per-feature threshold: the first row of the observed-differences file.
thresholds = data2.iloc[0].to_numpy()

# For every feature, count the random differences that are greater than or
# equal to its threshold. The comparison broadcasts the threshold row over all
# rows of data1; summing down the rows gives one count per column.
greater_equal_counts = (data1.to_numpy() >= thresholds).sum(axis=0)

# One row per feature, indexed 0..EXPECTED_COLUMNS-1.
results_df = pd.DataFrame(
    {'GreaterThanCount': greater_equal_counts},
    index=range(EXPECTED_COLUMNS),
)

# The index is written too, so each count stays tied to its feature number.
results_df.to_csv(output_csv_path, index=True)

print(f"每一列大于给定阈值的数据个数已保存至 '{output_csv_path}' 文件。")

每一列大于给定阈值的数据个数已保存至 'D:/yanyi/perm/resultdelta/comparison_results60HCBLRpos.csv' 文件。


In [9]:
import numpy as np
from statsmodels.stats.multitest import multipletests

# Uncorrected p-values, one per feature, entered by hand.
p_values = [0.01639836,0.00579942,0.01619838,0.239576042,0.202679732,0.077092291,0.195480452,0.194980502,0.615738426,0.198180182,0.00269973,0.184881512,1,0.237776222,0.00479952,0.00129987,0.615338466,0.676132387,0.412658734,0.083391661,0.193780622,0.206279372,0.02179782,0.01509849,0.02249775,0.04119588,0.445855414,0.093090691,0.196680332,0.00229977,0.601139886,1,0.188081192,0.0169983,0.249675032,0.00829917,0.205879412,0.077892211,1,0.434656534,0.611438856,0.00069993,0.185581442,1,0.01509849,0.00469953,1,0.765223478,0.420957904,0.440755924,0.442255774,0.00109989,0.02859714,0.00019998,0.454054595,0.00039996,0.141785821,9.999E-05,0.01959804,0.03529647,0.086291371,9.999E-05,0.454754525,1,0.245475452,0.435656434,0.210378962,0.00039996,0.03529647,0.086791321,0.084991501,0.084691531,0.00119988,0.453054695,0.621437856,0.00579942,0.191980802,0.00089991,1,0.02379762,0.605339466,0.076792321,0.444455554,1,0.01529847,0.247575242,0.575442456,0.412158784,0.03769623,0.02469753,0.834516548,1,0.244175582,0.187581242,0.248675132,0.228877112,0.02089791,0.091790821,0.080391961,0.0349965,0.136886311,0.00129987,0.835116488,0.02159784,0.01669833,1,0.01309869,0.142085791,0.256474353,0.02279772,0.240275972,0.00509949,0.611238876,1,0.843715628,0.00029997,1,0.191180882,0.01809819,0.00149985,0.96470353,0.938006199,0.189281072,0.441555844,0.082491751,0.03559644,0.254074593,0.00519948,0.01629837,0.01549845,0.081791821,1,0.00569943,0.190080992,0.03679632,0.01509849,0.078492151,0.441955804,0.238176182,0.191880812,0.03769623,0.03489651,0.00319968,0.241675832,0.01539846,0.03679632,0.00729927,0.831416858]

# Array form of the list, as passed to the correction routine.
p_values_array = np.asarray(p_values)

# Benjamini-Hochberg FDR correction. multipletests returns a 4-tuple; the
# adjusted p-values are its second element.
corrected_p_values = multipletests(p_values_array, method='fdr_bh')[1]

# Show the adjusted p-values, in the same order as the input.
print(corrected_p_values)

[0.0679932  0.03731801 0.0679932  0.35875636 0.33548733 0.18616109
 0.33330303 0.33330303 0.72324831 0.33330303 0.02663734 0.33330303
 1.         0.35875636 0.03731801 0.01603173 0.72324831 0.78177807
 0.56557706 0.18616109 0.33330303 0.33548733 0.07668324 0.0679932
 0.07668324 0.10696474 0.56557706 0.1940482  0.33330303 0.02431185
 0.72324831 1.         0.33330303 0.0679932  0.35875636 0.04913109
 0.33548733 0.18616109 1.         0.56557706 0.72324831 0.01479852
 0.33330303 1.         0.0679932  0.03731801 1.         0.87793081
 0.56557706 0.56557706 0.56557706 0.01603173 0.09005057 0.00986568
 0.56557706 0.00986568 0.28417158 0.00739926 0.07437205 0.09962575
 0.18616109 0.00739926 0.56557706 1.         0.35875636 0.56557706
 0.33843572 0.00986568 0.09962575 0.18616109 0.18616109 0.18616109
 0.01603173 0.56557706 0.7241953  0.03731801 0.33330303 0.01603173
 1.         0.07826773 0.72324831 0.18616109 0.56557706 1.
 0.0679932  0.35875636 0.70971236 0.56557706 0.09962575 0.07946162
 0.9