In [1]:
import pandas as pd
import numpy as np
from scipy import stats

1. 分析bootstrap结果

In [2]:
def analyze_bootstrap_results(beta, betas):
    """
    分析bootstrap结果
    
    参数:
    beta: 原始参数估计值（一行数据框）
    betas: bootstrap重抽样结果（500行数据框）
    """
    # 计算betas的方差
    variances = betas.var()
    
    # 创建结果数据框
    results_df = pd.DataFrame({
        '参数名': betas.columns,
        '参数估计值': beta['beta'].values,  # 取第一行，因为beta只有一行
        '方差': variances
    })
    
    # 计算p值（假设检验：H0: 参数 = 0）
    # 使用z检验，因为样本量较大
    results_df['p值'] = 2 * (1 - stats.norm.cdf(
        abs(results_df['参数估计值'] / np.sqrt(results_df['方差']))
    ))
    
    # 计算相对标准误（变异系数）
    results_df['相对标准误'] = np.sqrt(results_df['方差']) / abs(betas.mean())
    
    # 按p值排序
    results_df = results_df.sort_values('p值')
    
    return results_df

In [3]:
df_names = pd.read_csv(r"D:\a马文泉\a大三下学期\数值分析\期末大作业\预处理数据与代码\test.csv")
names = df_names.columns[:-1]
beta1 = pd.read_csv(r"D:\a马文泉\a大三下学期\数值分析\期末大作业\bootstrap得到的解的数据和代码\秦孝骋参数估计.csv")
betas1 = pd.read_csv(r"D:\a马文泉\a大三下学期\数值分析\期末大作业\bootstrap得到的解的数据和代码\秦孝骋bootstrap.csv")
betas1.columns = names
print(len(beta1))
print(len(betas1.columns))
beta2 = pd.read_csv(r"D:\a马文泉\a大三下学期\数值分析\期末大作业\bootstrap得到的解的数据和代码\陈子帆参数估计.csv")
betas2 = pd.read_csv(r"D:\a马文泉\a大三下学期\数值分析\期末大作业\bootstrap得到的解的数据和代码\陈子帆bootstrap.csv")
betas2.columns = names
print(len(beta2))
print(len(betas2.columns))
betas3 = pd.read_csv(r"D:\a马文泉\a大三下学期\数值分析\期末大作业\bootstrap得到的解的数据和代码\陆泓伯bootstrap.csv")
betas3.columns = names
beta3 = pd.read_csv(r"D:\a马文泉\a大三下学期\数值分析\期末大作业\bootstrap得到的解的数据和代码\陆泓伯参数估计.csv")
print(len(beta3))
print(len(betas3.columns))
print(beta1.head())
print(betas1.head())
print(beta2.head())
print(betas2.head())
print(beta3.head())
print(betas3.head())

261
261
261
261
261
261
       beta
0  0.029039
1 -0.000004
2 -0.062183
3  0.043947
4  0.261629
        PID  MS SubClass  Lot Frontage  Lot Area  Overall Qual  Overall Cond  \
0  0.030345    -0.000003     -0.000002  0.013242      0.311503 -3.176173e-07   
1  0.038971    -0.036803     -0.003645  0.085900      0.416898 -1.134745e-02   
2  0.039396    -0.147136      0.024891  0.081652      0.398878  2.736301e-02   
3  0.020946    -0.000001     -0.065067  0.046047      0.272111  4.360247e-02   
4  0.070150    -0.000001      0.000005  0.022446      0.251249  3.715906e-02   

   Year Built  Year Remod/Add  Mas Vnr Area  BsmtFin SF 1  ...  \
0    0.062631       -0.000004      0.038404      0.067975  ...   
1    0.212735       -0.071541      0.055768      0.218156  ...   
2    0.130812       -0.005397      0.034490      0.240125  ...   
3    0.163635        0.000004      0.109513      0.000002  ...   
4    0.177308       -0.040760      0.020097      0.232295  ...   

   Garage Type_freq_rank  

In [None]:
df1 = analyze_bootstrap_results(beta1, betas1)
df2 = analyze_bootstrap_results(beta2, betas2)
df3 = analyze_bootstrap_results(beta3, betas3)
# df1.to_csv(r".\qxc假设检验.csv", index=False)
# df2.to_csv(r".\czf假设检验.csv", index=False)
# df3.to_csv(r".\lhb假设检验.csv", index=False)

In [5]:
print(df1.columns.tolist())

['参数名', '参数估计值', '方差', 'p值', '相对标准误']


2. 对相关系数矩阵进行假设检验

In [6]:
def correlation_hypothesis_test(correlation_matrix, n_samples, name_list, alpha=0.05):
    """
    对相关系数矩阵进行假设检验
    
    参数:
    correlation_matrix: 相关系数矩阵
    n_samples: 样本量（在bootstrap情况下是重复次数）
    alpha: 显著性水平，默认0.05
    
    返回:
    包含检验结果的DataFrame
    """
    n_features = correlation_matrix.shape[0]
    test_results = []
    
    for i in range(len(name_list)):
        for j in range(i+1, len(name_list)):
            r = correlation_matrix[i, j]
            # 计算t统计量
            t_stat = r * np.sqrt((n_samples - 2) / (1 - r**2))
            # 计算p值
            p_value = 2 * (1 - stats.t.cdf(abs(t_stat), n_samples - 2))
            # 计算95%置信区间
            z = np.arctanh(r)
            se = 1 / np.sqrt(n_samples - 3)
            ci_lower = np.tanh(z - 1.96 * se)
            ci_upper = np.tanh(z + 1.96 * se)
            
            test_results.append({
                '变量1': name_list[i],
                '变量2': name_list[j],
                '相关系数': r,
                't统计量': t_stat,
                'p值': p_value,
                '95%置信区间下限': ci_lower,
                '95%置信区间上限': ci_upper,
                '是否显著': p_value < alpha
            })
    test_results_df = pd.DataFrame(test_results)
    test_results_df = test_results_df.sort_values('p值', ascending=True)
    return test_results_df

In [7]:
def analyze_all_correlations(betas, alpha=0.1):
    """
    对所有变量进行相关系数分析和假设检验
    
    参数:
    betas: bootstrap得到的参数估计矩阵
    alpha: 显著性水平，默认0.05
    
    返回:
    包含检验结果的DataFrame
    """
    # 1. 计算相关系数矩阵
    correlation_matrix = np.corrcoef(betas.T)
    variable_names = betas.columns.tolist()
    
    # 2. 进行假设检验
    n_samples = betas.shape[0]  # bootstrap重复次数
    test_results = []
    
    for i in range(len(variable_names)):
        for j in range(i+1, len(variable_names)):
            r = correlation_matrix[i, j]
            # 计算t统计量
            t_stat = r * np.sqrt((n_samples - 2) / (1 - r**2))
            # 计算p值
            p_value = 2 * (1 - stats.t.cdf(abs(t_stat), n_samples - 2))
            # 计算95%置信区间
            z = np.arctanh(r)
            se = 1 / np.sqrt(n_samples - 3)
            ci_lower = np.tanh(z - 1.96 * se)
            ci_upper = np.tanh(z + 1.96 * se)
            
            test_results.append({
                '变量1': variable_names[i],
                '变量2': variable_names[j],
                '相关系数': r,
                't统计量': t_stat,
                'p值': p_value,
                '95%置信区间下限': ci_lower,
                '95%置信区间上限': ci_upper,
                '是否显著': p_value < alpha
            })
    
    # 3. 整理结果
    test_results_df = pd.DataFrame(test_results)
    test_results_df = test_results_df.sort_values('p值', ascending=True)
    
    # 4. 输出显著相关的变量对
    significant_pairs = test_results_df[test_results_df['是否显著']]
    print(f"\n发现 {len(significant_pairs)} 对显著相关的变量 (p < {alpha}):")
    print(significant_pairs[['变量1', '变量2', '相关系数', 'p值', '95%置信区间下限', '95%置信区间上限']])
    
    # 5. 输出高度相关的变量对（相关系数绝对值大于0.7）
    high_corr_pairs = test_results_df[
        (test_results_df['是否显著']) & 
        (abs(test_results_df['相关系数']) > 0.7)
    ]
    print(f"\n发现 {len(high_corr_pairs)} 对显著且高度相关的变量 (|r| > 0.7):")
    print(high_corr_pairs[['变量1', '变量2', '相关系数', 'p值', '95%置信区间下限', '95%置信区间上限']])
    
    return {
        'correlation_matrix': correlation_matrix,
        'test_results': test_results_df,
        'significant_pairs': significant_pairs,
        'high_corr_pairs': high_corr_pairs
    }


In [8]:
def analyze_bootstrap_correlations(beta, betas, alpha1=0.05, alpha2=0.05):
    """
    分析bootstrap结果并进行相关系数分析
    
    参数:
    beta: 原始参数估计值（一行数据框）
    betas: bootstrap重抽样结果（500行数据框）
    alpha1: 控制参数显著性的水平（用于筛选显著非0的变量）
    alpha2: 控制相关系数显著性的水平
    """
    # 1. 首先进行bootstrap结果分析，筛选显著非0的变量
    results_df = analyze_bootstrap_results(beta, betas)
    
    # 筛选显著非0的变量
    significant_vars = results_df[results_df['p值'] < alpha1]['参数名'].tolist()
    print(f"\n在显著性水平 {alpha1} 下，发现 {len(significant_vars)} 个显著非0的变量")
    
    # 2. 对显著变量进行相关系数分析
    betas_significant = betas[significant_vars]
    correlation_matrix = np.corrcoef(betas_significant.T)
    
    # 3. 进行相关系数假设检验
    n_samples = betas.shape[0]  # bootstrap重复次数
    test_results = []
    
    for i in range(len(significant_vars)):
        for j in range(i+1, len(significant_vars)):
            r = correlation_matrix[i, j]
            # 计算t统计量
            t_stat = r * np.sqrt((n_samples - 2) / (1 - r**2))
            # 计算p值
            p_value = 2 * (1 - stats.t.cdf(abs(t_stat), n_samples - 2))
            # 计算95%置信区间
            z = np.arctanh(r)
            se = 1 / np.sqrt(n_samples - 3)
            ci_lower = np.tanh(z - 1.96 * se)
            ci_upper = np.tanh(z + 1.96 * se)
            
            test_results.append({
                '变量1': significant_vars[i],
                '变量2': significant_vars[j],
                '相关系数': r,
                't统计量': t_stat,
                'p值': p_value,
                '95%置信区间下限': ci_lower,
                '95%置信区间上限': ci_upper,
                '是否显著': p_value < alpha2
            })
    
    # 4. 整理结果
    test_results_df = pd.DataFrame(test_results)
    test_results_df = test_results_df.sort_values('p值', ascending=True)
    
    # 5. 输出显著相关的变量对
    significant_pairs = test_results_df[test_results_df['是否显著']]
    print(f"\n在相关系数显著性水平 {alpha2} 下，发现 {len(significant_pairs)} 对显著相关的变量:")
    print(significant_pairs[['变量1', '变量2', '相关系数', 'p值', '95%置信区间下限', '95%置信区间上限']])
    
    # 6. 输出高度相关的变量对（相关系数绝对值大于0.7）
    high_corr_pairs = test_results_df[
        (test_results_df['是否显著']) & 
        (abs(test_results_df['相关系数']) > 0.7)
    ]
    print(f"\n发现 {len(high_corr_pairs)} 对显著且高度相关的变量 (|r| > 0.7):")
    print(high_corr_pairs[['变量1', '变量2', '相关系数', 'p值', '95%置信区间下限', '95%置信区间上限']])
    
    # 7. 返回所有结果
    return {
        'bootstrap_results': results_df,
        'significant_vars': significant_vars,
        'correlation_matrix': correlation_matrix,
        'test_results': test_results_df,
        'significant_pairs': significant_pairs,
        'high_corr_pairs': high_corr_pairs
    }


In [9]:
results = analyze_bootstrap_correlations(beta1, betas1, alpha1=0.05, alpha2=0.05)
results['significant_pairs']


在显著性水平 0.05 下，发现 15 个显著非0的变量

在相关系数显著性水平 0.05 下，发现 59 对显著相关的变量:
                        变量1                     变量2      相关系数            p值  \
0         Neighborhood_mean             Gr Liv Area -0.416260  0.000000e+00   
3         Neighborhood_mean  Neighborhood_mean_rank -0.860093  0.000000e+00   
15              Gr Liv Area              2nd Flr SF  0.589399  0.000000e+00   
33             Overall Qual       Overall Qual_rank -0.969911  0.000000e+00   
41               2nd Flr SF         2nd Flr SF_rank -0.628970  0.000000e+00   
78               Year Built       Overall Qual_rank -0.388451  0.000000e+00   
96                Full Bath          Full Bath_rank -0.975306  0.000000e+00   
31             Overall Qual              Year Built  0.351339  4.440892e-16   
8         Neighborhood_mean       Overall Qual_rank  0.349117  8.881784e-16   
1         Neighborhood_mean            Overall Qual -0.346660  1.332268e-15   
16              Gr Liv Area  Neighborhood_mean_rank  0.316676  4.1

Unnamed: 0,变量1,变量2,相关系数,t统计量,p值,95%置信区间下限,95%置信区间上限,是否显著
0,Neighborhood_mean,Gr Liv Area,-0.41626,-10.2164,0.0,-0.486204,-0.341016,True
3,Neighborhood_mean,Neighborhood_mean_rank,-0.860093,-37.624695,0.0,-0.881313,-0.83541,True
15,Gr Liv Area,2nd Flr SF,0.589399,16.2816,0.0,0.529051,0.643815,True
33,Overall Qual,Overall Qual_rank,-0.969911,-88.903096,0.0,-0.974701,-0.96423,True
41,2nd Flr SF,2nd Flr SF_rank,-0.62897,-18.0544,0.0,-0.6792,-0.572875,True
78,Year Built,Overall Qual_rank,-0.388451,-9.407409,0.0,-0.460458,-0.311365,True
96,Full Bath,Full Bath_rank,-0.975306,-98.546548,0.0,-0.979246,-0.970628,True
31,Overall Qual,Year Built,0.351339,8.374321,4.440892e-16,0.272028,0.425909,True
8,Neighborhood_mean,Overall Qual_rank,0.349117,8.31399,8.881784e-16,0.269681,0.423834,True
1,Neighborhood_mean,Overall Qual,-0.34666,-8.247448,1.332268e-15,-0.421538,-0.267087,True
