In [1]:
import pandas as pd
import numpy as np
from src.preprocess import Preprocess
from src.clustering import Clustering
from src.feature_selector import FDRFeatureSelector

from src.regression import Regression
import argparse
def parse_args(argv=None):
    """Build the configuration namespace used by the rest of the notebook.

    Args:
        argv: Optional command-line style overrides. May be
            * ``None`` (default) - use every option's declared default,
            * a sequence of tokens, e.g. ``["--num_clusters", "3"]``,
            * a single shell-style string, e.g. ``"--num_clusters 3"``.

    Returns:
        argparse.Namespace: One attribute per option declared below.

    Raises:
        SystemExit: Raised by argparse when ``argv`` contains an unknown
            option or a value that cannot be converted to the option's type.
    """
    import shlex  # local import: only needed to tokenise string-form overrides

    parser = argparse.ArgumentParser(description="Process Excel data.")
    parser.add_argument('--data_path', type=str, default='data/data2.xlsx', help='Path to the Excel file')
    parser.add_argument('--small_value', type=float, default=1e-6, help='Small value to replace zeros in the data')
    parser.add_argument('--num_clusters', type=int, default=5, help='Number of clusters for clustering algorithms')
    parser.add_argument('--num_nearest_points', type=int, default=11, help='Number of nearest points to consider for each cluster center')
    parser.add_argument('--normalization_method', type=str, default='naive', help='Normalization method to apply to the data, options: log, clr, naive')
    parser.add_argument('--fdr_threshold', type=float, default=0.1, help='FDR threshold for feature selection')
    parser.add_argument('--distance_metric', type=str, default='pearson', help='euclidean, spearman, pearson, cosine, or correlation distance metric')
    parser.add_argument('--embedding_num', type=int, default=10, help='Number of dimensions for low-dimensional representation')
    parser.add_argument('--embedding_method', type=str, default='pca', help='Method for low-dimensional representation, options: pca, laplacian')

    if argv is None:
        # Deliberately do NOT fall back to sys.argv: argparse would then try to
        # parse whatever arguments the hosting process (e.g. a notebook kernel)
        # was launched with. An empty list yields pure defaults.
        argv = []
    elif isinstance(argv, str):
        # A bare string handed to argparse is iterated character by character;
        # split it into proper tokens first.
        argv = shlex.split(argv)

    return parser.parse_args(list(argv))

In [2]:
# Build the run configuration; parse_args() is called without overrides, so
# every option takes the default declared in its definition.
args = parse_args() 
# Load data from Excel file
# (presumably the file named by args.data_path; confirm in src/preprocess.py).
preprocessor = Preprocess(args)
data = preprocessor.get_processed_data() # per the author's note, data is expected to be n_features x n_samples (TODO confirm)

# FDR-based feature selection on the processed data. The selector also receives
# preprocessor.feature_dict; its exact contents are not visible from this cell.
fdr_feature_selector = FDRFeatureSelector(args, data,preprocessor.feature_dict)
fdr_feature_selector.select_features()  # Select features based on FDR (threshold presumably args.fdr_threshold; verify)
fdr_feature_selector.display_sorted_features()  # Display features sorted by p-value, one listing per comparison (see captured output)




FDR correction complete. Call 'display_sorted_features()' to see results.

--- Comparison 1: Normal vs. Abnormal (Sorted by p-value) ---
hsa-let-7i-5p: 0.779615
hsa-miR-142-5p: 0.779615
hsa-miR-143-3p: 0.779615
hsa-miR-17-5p: 0.779615
hsa-miR-320e: 0.779615
hsa-miR-628-3p: 0.779615
hsa-miR-664a-3p: 0.779615
hsa-let-7b-3p: 0.842743
hsa-let-7d-3p: 0.842743
hsa-let-7d-5p: 0.842743
hsa-let-7f-1-3p: 0.842743
hsa-let-7g-5p: 0.842743
hsa-miR-101-3p: 0.842743
hsa-miR-103a-3p: 0.842743
hsa-miR-106a-5p: 0.842743
hsa-miR-107: 0.842743
hsa-miR-122-3p: 0.842743
hsa-miR-1228-5p: 0.842743
hsa-miR-1229-3p: 0.842743
hsa-miR-1246: 0.842743

--- Comparison 2: Normal vs. Alzheimer (Sorted by p-value) ---
hsa-miR-191-5p: 0.002783
hsa-miR-628-3p: 0.008567
hsa-let-7b-5p: 0.010632
hsa-miR-574-3p: 0.018297
hsa-miR-197-3p: 0.088071
hsa-miR-32-5p: 0.088071
hsa-miR-423-3p: 0.088071
hsa-miR-664a-3p: 0.088071
hsa-mir-temp_6_1: 0.088071
hsa-miR-126-5p: 0.091754
hsa-miR-1306-5p: 0.091754
hsa-miR-451a: 0.091754
hsa-mi

In [3]:
# Cluster the processed data. `clustering` is kept as a notebook-level name
# because the regression cell further down receives it as an argument.
clustering= Clustering(args, data)
# Perform KMeans clustering with the configured number of clusters. Judging by
# the captured output, this also prints a low-dimensional (PC1..PC10)
# representation and a table with a per-row `cluster` label; confirm in
# src/clustering.py.
clustering.kmeans(n_clusters=args.num_clusters)  # Perform KMeans clustering



Low-dimensional representation shape:           PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0   -0.015888  0.005927  0.000207 -0.001180  0.003119  0.000924 -0.001141   
1    0.293552 -0.273942 -0.085522  0.060626 -0.002597 -0.025457  0.020458   
2   -0.016447  0.005864 -0.002160 -0.001126  0.001400  0.000561 -0.001346   
3    0.527915 -0.447150 -0.160296  0.043676 -0.024463 -0.024094 -0.053283   
4   -0.005269 -0.002335 -0.005319  0.002683  0.000179  0.001636 -0.000021   
..        ...       ...       ...       ...       ...       ...       ...   
244 -0.012707  0.004306  0.004501  0.003803  0.005097  0.001065  0.000811   
245 -0.006778  0.006646  0.011600  0.007266  0.005807  0.001605  0.000420   
246 -0.017244  0.009542 -0.002433  0.001176  0.000533 -0.000417 -0.001266   
247 -0.017470  0.009000 -0.002340  0.001335  0.000955 -0.000249 -0.001341   
248 -0.018013  0.008998 -0.004723 -0.000074  0.001241 -0.000534 -0.000070   

          PC8       PC9      PC10  
0

Unnamed: 0,H1_Read_Count,H2_Read_Count,H3_Read_Count,H4_Read_Count,H5_Read_Count,H6_Read_Count,H7_Read_Count,H8_Read_Count,H9_Read_Count,H10_Read_Count,...,A2_Read_Count,A3_Read_Count,A4_Read_Count,A5_Read_Count,A6_Read_Count,A7_Read_Count,A8_Read_Count,A9_Read_Count,A10_Read_Count,cluster
0,0.000775,0.000473,0.000541,0.000419,6.580390e-05,0.001871,0.000769,3.592212e-11,1.070910e-03,5.913661e-11,...,0.006648,0.000760,8.936549e-10,1.198930e-03,0.000165,0.000709,1.979022e-03,3.913894e-03,2.453988e-10,0
1,0.087620,0.090870,0.098001,0.146736,3.323097e-02,0.086128,0.093995,7.741217e-02,8.122030e-02,4.139562e-02,...,0.032356,0.076299,8.042894e-03,3.624458e-02,0.174832,0.027502,1.682169e-02,1.956947e-02,4.441718e-02,2
2,0.001054,0.001157,0.001245,0.000585,4.386927e-05,0.001363,0.001082,3.592212e-11,1.228284e-03,2.365464e-04,...,0.002290,0.000894,8.936549e-10,2.305635e-03,0.000329,0.000315,1.187413e-03,9.784735e-04,2.453988e-10,0
3,0.202994,0.221725,0.189186,0.204968,9.541566e-02,0.171861,0.161046,1.480351e-01,1.182569e-01,8.178592e-02,...,0.056955,0.117310,1.161751e-02,2.204187e-02,0.151728,0.045863,2.988324e-02,2.707110e-02,5.349693e-02,2
4,0.004160,0.004064,0.004547,0.003704,1.316078e-03,0.004138,0.002783,3.987355e-03,3.043842e-03,1.714962e-03,...,0.002586,0.003265,1.787310e-03,2.674537e-03,0.005494,0.001655,9.895112e-04,6.523157e-04,5.153374e-03,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,0.000474,0.000322,0.000608,0.000205,2.193463e-05,0.000148,0.000159,1.077664e-04,1.765659e-04,5.913661e-11,...,0.002807,0.013238,3.574620e-03,1.106705e-03,0.002886,0.004492,8.707698e-03,1.304631e-03,4.171779e-03,0
245,0.000585,0.000723,0.001950,0.000712,1.316078e-04,0.000212,0.000124,6.106761e-04,1.151517e-04,1.182732e-04,...,0.005171,0.005926,1.697944e-02,3.227889e-03,0.004747,0.005516,1.385316e-02,3.261578e-03,8.588957e-03,0
246,0.000093,0.000013,0.000029,0.000004,2.632156e-04,0.000071,0.000045,3.232991e-04,4.606066e-05,3.548196e-04,...,0.000295,0.000112,8.936549e-10,1.567832e-03,0.000025,0.000315,1.187413e-03,3.587736e-03,4.907975e-04,0
247,0.000056,0.000092,0.000270,0.000127,2.193463e-11,0.000042,0.000015,1.077664e-04,1.535355e-05,5.913661e-05,...,0.000369,0.000447,8.936549e-04,4.611270e-04,0.000443,0.000394,1.979022e-03,9.784735e-04,2.208589e-03,0


In [4]:
# Regression stage: built from the shared configuration and the Clustering
# object created in the previous cell.
regression = Regression(args,clustering)
# Run the regression. The captured output shows an OLS summary reported per
# cluster (starting with cluster 0); presumably one model is fitted for each
# cluster. Verify in src/regression.py.
regression.do_regression()  # Run the regression analysis (original note: "Process data for regression")


>>> 클러스터 0에 대한 회귀 분석 시작...
--- 클러스터 0 분석 결과 요약 ---
                            OLS Regression Results                            
Dep. Variable:                    230   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.310
Method:                 Least Squares   F-statistic:                     2.046
Date:                Fri, 27 Jun 2025   Prob (F-statistic):             0.0906
Time:                        09:49:39   Log-Likelihood:                 186.04
No. Observations:                  29   AIC:                            -346.1
Df Residuals:                      16   BIC:                            -328.3
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------