In [5]:
import re
import os
import numpy as np
import csv
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm
import time
import random 
import pandas as pd
import multiprocessing
from sklearn.linear_model import LogisticRegression
import glob
import os



In [6]:
def sim_summary(file_name):
    """Summarise one simulation-results CSV and append the report to a summary file.

    The settings ``N``, ``k_N``, ``m_N`` and ``p`` are parsed from ``file_name``,
    which must contain ``N=<int>_k_N=<int>_m_N=<int>_p=<int>_``. The CSV is then
    read by column position:

    * column 0 is ignored;
    * columns 1..12 are compared with the 12 entries of ``beta_true``;
    * columns 13..p are treated as coefficients whose true value is zero;
    * columns p+1..p+12 are averaged into the "SE" row;
    * columns p+13..p+24 are averaged into the "CI" row (presumably
      per-replicate coverage indicators -- confirm against the code that
      writes the result files).

    A 13-row by 15-column grid (parameters, true beta, BIAS, SD, SE, CI, TPR,
    FPR and three blank spacer rows) is appended, without header or index, to
    ``summary_p=<p>.csv`` in the current working directory.

    Parameters
    ----------
    file_name : str
        Path of the simulation-results CSV; its name encodes the settings.

    Returns
    -------
    None
        The report is written to disk only.

    Raises
    ------
    ValueError
        If ``file_name`` does not contain the expected parameter pattern, or
        if the CSV has fewer than ``p + 25`` columns.
    """
    # Extract information from file name
    pattern = (
        r"N=(?P<N>\d+)_"
        r"k_N=(?P<k_N>\d+)_"
        r"m_N=(?P<m_N>\d+)_"
        r"p=(?P<p>\d+)_"
    )
    match = re.search(pattern, file_name)
    if match is None:
        # Previously this surfaced as an AttributeError on None.groupdict().
        raise ValueError(
            f"Cannot parse N, k_N, m_N and p from file name: {file_name!r}"
        )

    params = {key: int(value) for key, value in match.groupdict().items()}
    N = params["N"]
    m_N = params["m_N"]
    k_N = params["k_N"]
    p = params["p"]

    beta_true = np.r_[np.linspace(-1, -0.5, 6), np.linspace(0.5, 1, 6)]
    n_signal = len(beta_true)   # 12 non-zero coefficients
    width = 3 + n_signal        # 3 label columns + one column per coefficient

    # Read the results
    df = pd.read_csv(file_name)

    n_required = p + 2 * n_signal + 1
    if df.shape[1] < n_required:
        raise ValueError(
            f"{file_name!r} has {df.shape[1]} columns, "
            f"but at least {n_required} are required for p={p}"
        )

    # Positional views of the result file (column 0 is skipped).
    est_signal = df.iloc[:, 1:(n_signal + 1)]
    est_null = df.iloc[:, (n_signal + 1):(p + 1)]
    se_cols = df.iloc[:, (p + 1):(p + 1 + n_signal)]
    ci_cols = df.iloc[:, (p + 1 + n_signal):(p + 1 + 2 * n_signal)]

    beta_headers = [f"lasso_true beta_{i}" for i in range(1, n_signal + 1)]
    alpha = k_N * m_N / N
    sqrt_val = float(np.sqrt(1 + 1/alpha))

    # helper: make a row of total width = 3 label cells + one cell per coefficient
    def row(c0="", c1="", c2="", vec=None):
        if vec is None:
            vec = [""] * n_signal
        else:
            vec = list(vec)
            assert len(vec) == n_signal
        return [c0, c1, c2] + vec

    # helper: percentage of non-zero entries; NaN when the block is empty
    # (e.g. p == 12 leaves no null coefficients) instead of dividing by zero.
    def percent_nonzero(block):
        if block.size == 0:
            return float("nan")
        return np.count_nonzero(block.to_numpy()) / block.size * 100

    # vectors (one entry per non-zero coefficient)
    BIAS = est_signal.mean().to_numpy() - beta_true
    SD = est_signal.std(ddof=1).to_numpy()
    SE = se_cols.mean().to_numpy()
    CI = ci_cols.mean().to_numpy()

    # scalars (a NaN is written to the CSV as an empty cell)
    TPR = percent_nonzero(est_signal)
    FPR = percent_nonzero(est_null)

    grid = []

    # Row 1: parameter labels (put them in the first 4 cells, rest blank)
    grid.append(["N=", "k_N =", "m_N =", "p="] + [""] * (width - 4))

    # Row 2: parameter values
    grid.append([N, k_N, m_N, p] + [""] * (width - 4))

    # Row 3: alpha and sqrt, then beta headers
    grid.append(row(f"alpha={alpha}", "sqrt((1+1/alpha))=", f"{sqrt_val}", beta_headers))

    # Row 4: True beta (col0 is left empty so the label is visually indented)
    grid.append(row("", "True beta", "", beta_true))

    # Rows 5-8: metrics
    grid.append(row("", "BIAS", "", BIAS))
    grid.append(row("", "SD",   "", SD))
    grid.append(row("", "SE",   "", SE))
    grid.append(row("", "CI",   "", CI))

    # Rows 9-10: TPR/FPR only once (rest blank)
    grid.append(row("", "TPR", "", [TPR] + [""] * (n_signal - 1)))
    grid.append(row("", "FPR", "", [FPR] + [""] * (n_signal - 1)))

    # Three blank rows separate this report from the next appended one.
    grid.extend([[""] * width for _ in range(3)])

    report_df = pd.DataFrame(grid)

    # write to CSV exactly as a sheet-like grid
    report_df.to_csv(
        "summary_p=" + str(p) + ".csv",
        mode="a",                          # append
        index=False,
        header=False,
        sep=","
    )


In [7]:
# Manual check on a single result file (uncomment the two lines below to run)
# file_name = "N=1000000_k_N=99999_m_N=11_p=100_.csv"
# sim_summary(file_name)

In [8]:
# Folder that holds the simulation result files
path = r'.'

# Create a list of all files ending with '.csv' in the specified path
csv_files_list = glob.glob(os.path.join(path, '*.csv'))


def parse_params(fname):
    """Return ``(N, k_N, m_N)`` parsed from a simulation file name.

    Used as the sort key so result files are processed in order of N, then
    k_N, then m_N. ``p`` is matched by the pattern but not part of the key.

    Raises
    ------
    ValueError
        If ``fname`` does not contain ``N=<int>_k_N=<int>_m_N=<int>_p=<int>``.
    """
    found = re.search(r"N=(\d+)_k_N=(\d+)_m_N=(\d+)_p=(\d+)", fname)
    if found is None:
        raise ValueError(f"Not a simulation result file name: {fname!r}")
    return tuple(int(x) for x in found.groups()[:3])


def select_sim_files(files):
    """Keep only names that follow the simulation naming scheme, sorted by (N, k_N, m_N).

    The summary files that sim_summary appends to (``summary_p=<p>.csv``) are
    written next to the result files, so a plain ``*.csv`` glob picks them up
    on every run after the first; they must be skipped, not parsed.
    """
    keyed = []
    for name in files:
        try:
            keyed.append((parse_params(name), name))
        except ValueError:
            continue  # e.g. a previously written summary file
    # Stable sort on the key only, like sorted(files, key=parse_params).
    keyed.sort(key=lambda pair: pair[0])
    return [name for _, name in keyed]


# NOTE: sim_summary opens its output in append mode, so re-running this cell
# adds the same report blocks to the existing summary files again.
for file_name in select_sim_files(csv_files_list):
    sim_summary(file_name)
