## Linear Regression for relationship between mutation rate, CDR distance, local identity, and chromosome 

In [1]:
import pandas as pd
from pathlib import Path
import glob
import os
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load in all pairs df 

# Directory containing CSV files
pairwise_dist_dir = "/private/groups/patenlab/mira/centrolign/batch_submissions/centrolign/release2_QC_v2/all_pairs/distance_matrices/"  # this should be a folder path

# Collect all matching files
files = glob.glob(os.path.join(pairwise_dist_dir, "*_r2_QC_v2_centrolign_pairwise_distance.csv"))

all_dfs = []

for f in files:
    # Extract chr from filename
    # e.g. filename: "chr3_r2_QC_v2_centrolign_pairwise_distance.csv"
    basename = os.path.basename(f)
    chr_val = basename.split("_")[0]   # 'chr3'

    # Read CSV
    df = pd.read_csv(f, header=None,names=["sample1", "sample2", "direct_pairwise_dist","chr"])
    df["chr"] = chr_val
    all_dfs.append(df)

# Combine into one dataframe
all_pairs_dist_df = pd.concat(all_dfs, ignore_index=True)

all_pairs_dist_df["sample_pair"] = (
    all_pairs_dist_df[["sample1", "sample2"]]
    .apply(lambda x: "_".join(sorted(x)), axis=1)
)

all_pairs_dist_df.head()

Unnamed: 0,sample1,sample2,direct_pairwise_dist,chr,sample_pair
0,HG03098.1,HG03784.1,0.942968,chr6,HG03098.1_HG03784.1
1,HG03098.1,NA20752.2,0.974936,chr6,HG03098.1_NA20752.2
2,HG01978.2,NA19835.1,0.999883,chr6,HG01978.2_NA19835.1
3,HG00741.1,NA18522.1,0.999869,chr6,HG00741.1_NA18522.1
4,HG00408.1,HG01496.1,0.606856,chr6,HG00408.1_HG01496.1


In [3]:
# Get list of sample pairs with dist < 0.2 
all_pairs_dist_df["sample_pair"] = (
    all_pairs_dist_df[["sample1", "sample2"]]
    .apply(lambda x: "_".join(sorted(x)), axis=1)
)

# Subset to distances < 0.2
allowed_pairs = set(
    all_pairs_dist_df.loc[all_pairs_dist_df["direct_pairwise_dist"] < 0.2, "sample_pair"]
)

### Starting with all SNVs (unfiltered)

In [18]:
base_path = Path("/private/groups/patenlab/mira/centrolign/analysis/CDR_variant_regression/SNVs_pairwise_raw")

# BED column names as given
columns = [
    "contig",
    "start",
    "end",
    "local_identity",
    "CDR_dist",
    "SNV_count1",
    "SNV_count2",
    "windowsize",
    "SNV_rate",
]

dfs = []

for chr_dir in base_path.glob("chr*"):
    if not chr_dir.is_dir():
        continue

    chr_name = chr_dir.name

    for bed_file in chr_dir.glob("*.bed"):
        # Extract sample names
        name_part = bed_file.name.split(".SNVs.all_snvs.local_id_CDR_dist.bed")[0]
        sample1, sample2 = name_part.split("_", 1)
        sample_pair = "_".join(sorted([sample1, sample2]))

        # Skip pairs not in allowed set
        if sample_pair not in allowed_pairs:
            continue

        df = pd.read_csv(
            bed_file,
            sep="\t",
            header=None,
            names=columns,
        )

        df["chr"] = chr_name
        df["sample1"] = sample1
        df["sample2"] = sample2
        df["sample_pair"] = sample_pair
        
        df = df.drop(columns=["SNV_count2", "windowsize"])

        dfs.append(df)

# Combine only the filtered data
all_snvs_df = pd.concat(dfs, ignore_index=True)


  all_snvs_df = pd.concat(dfs, ignore_index=True)


In [19]:
all_snvs_df["CDR_dist"] = all_snvs_df["CDR_dist"].abs()

# convert chr values to numeric
chr_map = {f"chr{i}": i for i in range(1, 23)}
chr_map.update({"chrX": 23, "chrY": 24})

all_snvs_df["chr_num"] = all_snvs_df["chr"].map(chr_map)

all_snvs_df.head()

Unnamed: 0,contig,start,end,local_identity,CDR_dist,SNV_count1,SNV_rate,chr,sample1,sample2,sample_pair,chr_num
0,HG00741#1#CM087903.1,50996952,51001951,97.69,1923992,0,0.0,chr11,HG00741.1,HG00272.2,HG00272.2_HG00741.1,11
1,HG00741#1#CM087903.1,51001952,51006951,96.95,1918992,0,0.0,chr11,HG00741.1,HG00272.2,HG00272.2_HG00741.1,11
2,HG00741#1#CM087903.1,51006952,51011951,96.75,1913992,0,0.0,chr11,HG00741.1,HG00272.2,HG00272.2_HG00741.1,11
3,HG00741#1#CM087903.1,51011952,51016951,96.61,1908992,0,0.0,chr11,HG00741.1,HG00272.2,HG00272.2_HG00741.1,11
4,HG00741#1#CM087903.1,51016952,51021951,96.93,1903992,0,0.0,chr11,HG00741.1,HG00272.2,HG00272.2_HG00741.1,11


In [20]:
# Ensure numeric
numeric_cols = ["SNV_rate", "CDR_dist", "local_identity", "chr_num"]

for col in numeric_cols:
    all_snvs_df[col] = pd.to_numeric(all_snvs_df[col], errors="coerce")

all_snvs_df = all_snvs_df.dropna(subset=numeric_cols)


In [21]:
# Confirm numeric
print(all_snvs_df[["SNV_rate", "CDR_dist", "local_identity", "chr_num"]].dtypes)


SNV_rate          float64
CDR_dist            int64
local_identity    float64
chr_num             int64
dtype: object


In [22]:
# Make sure memory isn't bad 
n_rows = len(all_snvs_df)
print(f"Design matrix size ~ {n_rows} x 4")
print(f"Approx RAM: {n_rows * 4 * 8 / 1e6:.1f} MB")


Design matrix size ~ 14765787 x 4
Approx RAM: 472.5 MB


In [23]:
import numpy as np
import statsmodels.api as sm

X = np.column_stack([
    all_snvs_df["CDR_dist"].values,
    all_snvs_df["local_identity"].values,
    all_snvs_df["chr_num"].values
])

# SNV_rate ~ CDR_dist + local_identity + chr_num

X = sm.add_constant(X)
y = all_snvs_df["SNV_rate"].values

model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                 1.427e+04
Date:                Mon, 12 Jan 2026   Prob (F-statistic):               0.00
Time:                        17:52:27   Log-Likelihood:             7.3234e+07
No. Observations:            14765787   AIC:                        -1.465e+08
Df Residuals:                14765783   BIC:                        -1.465e+08
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0024   2.05e-05    116.814      0.0

Rsquared of 0.003:
Only 0.3% of the variance in SNV_rate is explained by CDR_dist, local_identity, and chr_num.

With 14 million observations, even tiny effects become “statistically significant,” which is why all p-values are zero.

Practical interpretation: coefficients are real but very small, the model explains almost nothing.

Coefficients: 
Magnitudes are tiny, especially for CDR_dist (1e-11)
Even though t-values are large (14M rows → massive statistical power), effects are essentially negligible.
So “statistically significant” ≠ “biologically meaningful.”

Condition number: 5.16e7 → large, indicates potential multicollinearity or variables on very different scales.

CDR_dist is probably very small (~0–1?)
local_identity and chr_num are larger (~0–1, 1–24)
Solution: standardize predictors so they’re comparable.
Skew and kurtosis: huge numbers → residuals are far from normal. For OLS, with 14M points, this is expected; p-values are not reliable for small effects, but coefficients are interpretable.

In [24]:
# rescaling the coefficients by mean and standard deviation:

from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

df_std = all_snvs_df.copy()

scaler = StandardScaler()
df_std[["CDR_dist", "local_identity", "chr_num"]] = scaler.fit_transform(
    df_std[["CDR_dist", "local_identity", "chr_num"]]
)

X = df_std[["CDR_dist", "local_identity", "chr_num"]]
X = sm.add_constant(X)
y = df_std["SNV_rate"]

model_std = sm.OLS(y, X).fit()
print(model_std.summary())


                            OLS Regression Results                            
Dep. Variable:               SNV_rate   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                 1.427e+04
Date:                Mon, 12 Jan 2026   Prob (F-statistic):               0.00
Time:                        17:52:46   Log-Likelihood:             7.3234e+07
No. Observations:            14765787   AIC:                        -1.465e+08
Df Residuals:                14765783   BIC:                        -1.465e+08
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.0005   4.42e-07   1238.

| Predictor      | Coef (std units  Interpretation                                                                          |
| -------------- | ---------------- | --------------------------------------------------------------------------------------- |
| CDR_dist       | -8.63e-06        | Tiny negative effect on `SNV_rate` per SD increase in CDR distance                      |
| local_identity | -3.73e-05        | Slightly larger negative effect per SD increase in local identity                       |
| chr_num        | -7.88e-05        | Largest negative effect among the three predictors per SD increase in chromosome number |

chr_num has the strongest standardized effect (in magnitude), followed by local_identity, then CDR_dist.
All effects are extremely small — consistent with the tiny R² (~0.003).
P-values are all essentially zero because your dataset is massive; statistical significance here doesn’t imply meaningful effect size.

Cond. No.: 1.15
- Much improved from 5.16e7 before standardization
- Suggests multicollinearity is no longer an issue
- Regression coefficients are numerically stable

Skew: 16.308, Kurtosis: 571.097
- Residuals are very non-normal
- With 14M points, this is expected
- p-values for OLS are not fully reliable for inference here, but coefficients are still interpretable