In [3]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
TF1_PATH = Path("../src/TF-Matrix.chr1.txt")
TF2_PATH = Path("../src/TF-Matrix.chr2.txt")

def load_df(path):
    return pd.read_csv(path, sep='\t')

TF1_df = load_df(TF1_PATH)
TF2_df = load_df(TF2_PATH)

In [5]:
TF1_df

Unnamed: 0,cCRE,CTCF,ASH2L,ATF2,ATF7,BATF,BCL11A,BCL3,BCLAF1,BHLHE40,...,ZNF143,ZNF207,ZNF217,ZNF24,ZNF384,ZNF592,ZNF687,ZSCAN29,ZZZ3,Unnamed: 128
0,EH38E0078724,1.141900,0.966854,2.188890,0.691002,6.795400,2.392920,1.625600,3.23174,1.45784,...,0.637517,0.972212,0.867020,0.830651,1.014650,1.541170,3.601490,0.846217,0.266719,
1,EH38E1070274,0.608820,1.138660,0.915075,1.700300,0.223262,2.708510,0.144813,0.78011,1.25539,...,0.421721,1.406400,0.676739,0.946100,0.756368,0.684674,1.092570,1.008030,0.232510,
2,EH38E1073973,4.955210,1.282230,3.220930,6.072330,2.783780,3.060970,3.536940,1.62246,1.47039,...,8.425570,2.239830,1.143220,3.676670,1.515720,1.158590,1.108030,0.869008,0.592706,
3,EH38E1310242,24.255200,0.384366,1.621320,1.064170,0.000000,0.998662,0.261641,1.82777,1.82395,...,5.332220,1.069310,1.462700,1.392500,3.341560,1.736810,0.991999,0.743735,0.238827,
4,EH38E1310413,44.447900,1.522690,2.197080,1.031420,1.892720,0.925301,0.407060,1.90325,7.83313,...,7.185560,0.986074,1.299610,1.206740,18.810900,1.468370,0.896588,0.827896,0.151877,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10402,EH38E2660301,0.418765,2.646010,6.130010,1.844300,2.420260,4.004380,2.040840,2.74046,9.27435,...,0.989978,2.051560,0.863267,0.998650,2.344990,0.839653,0.858820,0.825643,0.016164,
10403,EH38E2660302,1.118970,2.442840,4.768840,4.750300,3.472360,2.211920,1.392420,2.64331,9.05229,...,2.707270,2.161220,1.123220,0.823765,1.281080,0.889253,1.874600,1.399910,0.091060,
10404,EH38E2660307,3.437500,1.878020,2.654980,1.020300,0.659395,2.733300,1.610770,2.62705,2.00703,...,1.154600,1.839910,1.009360,0.946024,1.731230,0.890982,1.348500,0.898601,0.123077,
10405,EH38E2660308,0.831852,2.715510,5.801670,1.637200,1.570040,2.936900,1.758500,2.56500,5.76192,...,5.970050,1.876190,0.660934,1.028620,0.872933,0.785143,2.413380,0.797513,0.039656,


In [6]:
#Separate features from target and remove last column with NAs
X = TF1_df.iloc[:, 2:128]
y = TF1_df.iloc[:,1]

In [7]:
# Scale the features using the StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Normalize the features using the MinMaxScaler
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X)
X

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Create a linear regression object
lr = LinearRegression()

# Train the model using the training sets
lr.fit(X_train, y_train)

In [9]:
# Make predictions on the test set
y_pred = lr.predict(X_test)

In [10]:
# Compute MSE and R^2 on the test set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean squared error (MSE): {mse:.4f}")
print(f"Coefficient of determination (R^2): {r2:.4f}")

Mean squared error (MSE): 43.0143
Coefficient of determination (R^2): 0.7342


In [19]:
# Get the absolute coefficients of the model
abs_coef = np.abs(lr.coef_)

# Get the indices of the top 5 features
top_5_indices = abs_coef.argsort()[::-1][:5]

# Extract the names of the features
feature_names = TF1_df.iloc[:,0]

# Get the names of the top 5 features
top_5_features = feature_names[top_5_indices]

print("Top 5 most important features:")
print(top_5_features)

Top 5 most important features:
106    EH38E1347793
31     EH38E1345376
98     EH38E1347358
118    EH38E1348010
92     EH38E1346678
Name: cCRE, dtype: object


In [24]:
lr.coef_



array([-4.45881047e-01, -2.57145341e-02,  3.95591203e-04, -3.54869152e-02,
       -9.76741159e-02, -2.39783288e-01,  9.64346407e-02,  4.43493603e-02,
        5.20264681e-02, -9.04991668e-02,  4.40347378e-02,  4.40921559e-01,
        4.87865660e-02,  5.10170741e-02, -1.84109653e-01,  4.11984197e-02,
        5.91248921e-03,  8.56397203e-02, -2.60427849e-01, -7.10334690e-01,
        4.98378188e-02, -9.13018615e-02,  3.63116446e-02,  2.50960990e-02,
       -1.03085512e-01,  5.42275935e-02,  4.03159864e-02, -1.54404440e-01,
        1.79954684e-01, -2.17036066e-01,  5.81804893e-02,  2.49910048e+00,
        5.30874431e-01, -3.52332678e-02, -4.19637222e-01, -5.11024992e-02,
       -1.97405789e-01, -1.23844992e-01,  3.19023454e-01, -4.18572258e-01,
        4.16940393e-01, -1.42497508e-01,  2.71777708e-01, -1.80639600e-01,
       -5.09755092e-02,  1.71079350e-01, -3.05866275e-01, -4.11791333e-01,
       -5.04825554e-01, -6.16901493e-02,  7.92277859e-02, -7.36762427e-02,
       -3.96467832e-02, -