# SysGen Functional Gene Embedding Project

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import xgboost as xgb
import plotnine as p9
import scipy.stats

## Data Preparation

### Load Covariates

In [2]:
def build_control_covariates(metadata):
    """Build the per-gene control covariate matrix.

    Parameters
    ----------
    metadata : object exposing ``NPARAM``, ``NSNPS`` and ``MAC`` columns
        (accessed via ``.values``), one entry per gene.

    Returns
    -------
    numpy.ndarray of shape (n_genes, 6)
        Columns, in order: NPARAM as float ("gene size"), its log,
        NPARAM / NSNPS ("gene density"), its log, 1 / MAC, and its log.
    """
    n_param = metadata.NPARAM.values
    base_features = (
        n_param.astype(float),              # gene size
        n_param / metadata.NSNPS.values,    # gene density
        1.0 / metadata.MAC.values,          # inverse MAC
    )
    # Each base feature is immediately followed by its natural log.
    columns = []
    for feature in base_features:
        columns.extend((feature, np.log(feature)))
    return np.stack(columns, axis=1)

def munge_sigma(magma_gene_raw):
    """Parse a MAGMA ``.genes.raw`` file into per-chromosome blocks.

    The first two lines of the file are skipped. Every remaining line is split
    on single spaces and read positionally: field 0 is the gene ID, field 1 the
    chromosome, fields 4, 5 and 7 are stored as NSNPS, NPARAM and MAC, and any
    fields from index 9 onwards are correlations with the genes immediately
    preceding this one on the same chromosome.

    Parameters
    ----------
    magma_gene_raw : str or path-like
        Path to the raw MAGMA gene file.

    Returns
    -------
    sigmas : list of numpy.ndarray
        One symmetric (n_genes, n_genes) correlation matrix with unit diagonal
        per chromosome 1..22.
    gene_metadata : list of pandas.DataFrame
        One frame per chromosome with columns ``NSNPS``, ``NPARAM``, ``MAC``.
    gene_lists : list of list
        Gene IDs per chromosome, in file order.

    Raises
    ------
    ValueError
        If a chromosome in 1..22 has no genes, or a gene lists more
        correlations than it has preceding genes on its chromosome.
    """
    # Context manager so the handle is closed even if parsing fails below.
    with open(magma_gene_raw) as f:
        raw_lines = list(f)[2:]
    # Blank lines (e.g. a trailing newline) carry no gene and are ignored.
    lines = [np.asarray(line.strip('\n').split(' ')) for line in raw_lines if line.strip()]

    # Parse the chromosome column once instead of twice per chromosome.
    line_chroms = np.asarray([int(line[1]) for line in lines], dtype=int)

    sigmas = []
    gene_metadata = []
    gene_lists = []
    for chrom in range(1, 23):
        chrom_rows = np.flatnonzero(line_chroms == chrom)
        if chrom_rows.size == 0:
            raise ValueError(f"no genes found for chromosome {chrom} in {magma_gene_raw!r}")
        # Genes of one chromosome are taken as the span from the first to the
        # last matching line (the file is expected to be grouped by chromosome).
        lines_chr = lines[chrom_rows[0]:chrom_rows[-1] + 1]
        n_genes = len(lines_chr)
        sigma_chr = np.zeros([n_genes, n_genes])
        gene_NSNPs = np.zeros(n_genes)
        gene_NPARAM = np.zeros(n_genes)
        gene_MAC = np.zeros(n_genes)
        for i, line in enumerate(lines_chr):
            gene_NSNPs[i] = float(line[4])
            gene_NPARAM[i] = float(line[5])
            gene_MAC[i] = float(line[7])
            if line.shape[0] > 9:
                gene_corrs = np.asarray([float(c) for c in line[9:]])
                n_corrs = gene_corrs.shape[0]
                if n_corrs > i:
                    raise ValueError(
                        f"gene {line[0]} lists {n_corrs} correlations but has only "
                        f"{i} preceding genes on chromosome {chrom}"
                    )
                # Fill the lower triangle: correlations with the n_corrs previous genes.
                sigma_chr[i, i - n_corrs:i] = gene_corrs
        # Mirror the lower triangle and put ones on the diagonal.
        sigma_chr = sigma_chr + sigma_chr.T + np.identity(n_genes)
        sigmas.append(sigma_chr)
        gene_metadata.append(
            pd.DataFrame(data={'NSNPS': gene_NSNPs, 'NPARAM': gene_NPARAM, 'MAC': gene_MAC})
        )
        gene_lists.append([line[0] for line in lines_chr])
    return sigmas, gene_metadata, gene_lists

In [3]:
# Parse the raw MAGMA gene file into per-chromosome correlation matrices,
# gene metadata frames and gene ID lists (one entry per chromosome 1..22).
sigmas, metadata, gene_lists = munge_sigma('../data/HDL_cholesterol.genes.raw')

In [4]:
# create covariates from pops: one frame per chromosome, indexed by gene ID,
# stacked into a single table
covariate_names = ['genesize',
                   'log_genesize',
                   'genedensity',
                   'log_genedensity',
                   'inverse_mac',
                   'log_inverse_mac']
covariates = pd.concat([
    pd.DataFrame(build_control_covariates(metadata[i]),
                 index = gene_lists[i],
                 columns = covariate_names)
    for i in range(0, 22)
])

In [5]:
# Inspect the stacked covariate table (rows are gene IDs).
covariates

Unnamed: 0,genesize,log_genesize,genedensity,log_genedensity,inverse_mac,log_inverse_mac
ENSG00000187634,21.0,3.044522,0.230769,-1.466337,0.013156,-4.330878
ENSG00000188976,11.0,2.397895,0.166667,-1.791759,0.019236,-3.950951
ENSG00000187961,7.0,1.945910,0.189189,-1.665008,0.016599,-4.098390
ENSG00000187583,19.0,2.944439,0.387755,-0.947381,0.011969,-4.425457
ENSG00000187642,7.0,1.945910,0.250000,-1.386294,0.006525,-5.032071
...,...,...,...,...,...,...
ENSG00000008735,14.0,2.639057,0.212121,-1.550597,0.007717,-4.864268
ENSG00000100299,12.0,2.484907,0.333333,-1.098612,0.008163,-4.808111
ENSG00000251322,35.0,3.555348,0.207101,-1.574551,0.006002,-5.115722
ENSG00000100312,10.0,2.302585,0.277778,-1.280934,0.005423,-5.217048


### Load Embeddings

In [6]:
# Tab-separated gene embedding table, read in the next cell.
emb_path = '../data/Omics_d256.tsv'

In [7]:
# load embedding: tab-separated file, re-indexed by its "gene_id" column so
# each row is one gene's embedding vector
emb = pd.read_csv(emb_path, sep = "\t").set_index("gene_id")
emb

Unnamed: 0_level_0,FACT_EMB_0,FACT_EMB_1,FACT_EMB_2,FACT_EMB_3,FACT_EMB_4,FACT_EMB_5,FACT_EMB_6,FACT_EMB_7,FACT_EMB_8,FACT_EMB_9,...,FACT_EMB_246,FACT_EMB_247,FACT_EMB_248,FACT_EMB_249,FACT_EMB_250,FACT_EMB_251,FACT_EMB_252,FACT_EMB_253,FACT_EMB_254,FACT_EMB_255
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,-1.193790,-0.038485,0.477347,-1.355883,-1.283844,0.577406,-0.085741,-0.141601,0.929133,1.186045,...,1.202038,-0.015473,0.732149,-0.470897,0.163932,0.514095,-1.121097,0.353111,-0.375309,0.271749
ENSG00000000005,-0.678243,-0.082044,0.628727,-0.036960,-0.327301,-0.132137,-0.658378,0.382881,0.321670,0.529753,...,0.158936,1.025258,-0.944454,-0.110598,0.059320,-0.250677,-0.524779,-0.302290,0.627002,0.164144
ENSG00000000419,0.763473,0.029731,0.157848,-1.063333,-1.121445,0.855230,0.054342,0.782584,0.679390,1.211714,...,0.215326,-0.662693,0.154820,0.359877,-0.675248,-0.293355,-0.355871,0.380746,-0.834603,-0.782095
ENSG00000000457,0.500445,0.107114,0.758035,-1.097068,0.355529,0.676653,0.562473,-0.585225,0.680695,0.630335,...,-0.443879,0.226092,0.258948,0.716421,0.022804,-0.488464,-0.700852,-0.285765,0.137165,-0.220267
ENSG00000000460,0.085451,0.300662,-0.520867,-0.111104,0.116884,1.271965,0.750282,-0.769611,0.652003,0.748371,...,-0.781987,-0.538280,-1.022490,-0.261985,-0.225361,0.223187,-0.840165,1.048801,-0.219930,-0.667091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000280178,0.151255,0.066157,0.085184,-0.053143,-0.270811,0.211693,-0.401942,0.048608,0.396584,-0.216256,...,0.036106,0.029502,-0.247412,-0.039378,-0.355921,-0.143174,-0.001159,0.033741,-0.136875,-0.467921
ENSG00000280253,0.139209,0.173939,-0.025301,0.197056,-0.086932,0.248732,-0.317894,0.000547,0.264560,-0.110397,...,0.139947,-0.031823,-0.109894,0.097934,-0.243768,-0.176109,-0.002984,-0.027844,0.069011,-0.645361
ENSG00000280267,-0.799880,-0.290801,0.281187,-0.347249,-0.004650,0.019152,0.336727,0.289013,-0.390199,-0.204644,...,-0.213539,-0.293435,-0.636833,0.811885,-0.119030,0.375083,0.600217,-0.772333,0.670361,-0.510163
ENSG00000280297,0.125944,0.020628,-0.069116,-0.076246,-0.322970,0.216778,-0.389412,0.208437,0.143095,-0.213645,...,-0.069501,0.155821,-0.123458,0.270595,-0.460678,0.027504,0.098230,-0.058452,0.088184,-0.454670


### Load GWAS MAGMA Scores

In [8]:
# Load the MAGMA gene-level results (whitespace-delimited table).
# `sep=r"\s+"` replaces `delim_whitespace=True`, which is deprecated in pandas >= 2.2.
magma = pd.read_csv('../data/HDL_cholesterol.genes.out', sep=r"\s+")
magma

Unnamed: 0,GENE,CHR,START,STOP,NSNPS,NPARAM,N,ZSTAT,P
0,ENSG00000187634,1,860260,879955,91,21,422405,1.89430,0.029092
1,ENSG00000188976,1,879584,894689,66,11,422405,3.07660,0.001047
2,ENSG00000187961,1,895967,901095,37,7,422405,3.00680,0.001320
3,ENSG00000187583,1,901877,911245,49,19,422405,1.90960,0.028095
4,ENSG00000187642,1,910579,917497,28,7,422405,3.89010,0.000050
...,...,...,...,...,...,...,...,...,...
17968,ENSG00000008735,22,51039114,51052409,66,14,422405,1.51710,0.064622
17969,ENSG00000100299,22,51061182,51066607,36,12,422405,2.77490,0.002761
17970,ENSG00000251322,22,51112843,51171726,169,35,422405,0.62568,0.265760
17971,ENSG00000100312,22,51176624,51183762,36,10,422405,1.33790,0.090472


### Merge MAGMA Scores with Covariates

In [9]:
# Attach the control covariates to the MAGMA table (inner join on gene ID).
# Covariate columns left over from a previous run of this cell are dropped
# first so the cell is idempotent; without this, re-running it produces
# suffixed `_x`/`_y` duplicates that would end up in the feature matrix.
magma = (
    magma
    .drop(columns=covariates.columns, errors="ignore")
    .merge(covariates, left_on = "GENE", right_index = True)
)

In [10]:
# Inspect the MAGMA table with the covariate columns appended.
magma

Unnamed: 0,GENE,CHR,START,STOP,NSNPS,NPARAM,N,ZSTAT,P,genesize,log_genesize,genedensity,log_genedensity,inverse_mac,log_inverse_mac
0,ENSG00000187634,1,860260,879955,91,21,422405,1.89430,0.029092,21.0,3.044522,0.230769,-1.466337,0.013156,-4.330878
1,ENSG00000188976,1,879584,894689,66,11,422405,3.07660,0.001047,11.0,2.397895,0.166667,-1.791759,0.019236,-3.950951
2,ENSG00000187961,1,895967,901095,37,7,422405,3.00680,0.001320,7.0,1.945910,0.189189,-1.665008,0.016599,-4.098390
3,ENSG00000187583,1,901877,911245,49,19,422405,1.90960,0.028095,19.0,2.944439,0.387755,-0.947381,0.011969,-4.425457
4,ENSG00000187642,1,910579,917497,28,7,422405,3.89010,0.000050,7.0,1.945910,0.250000,-1.386294,0.006525,-5.032071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17968,ENSG00000008735,22,51039114,51052409,66,14,422405,1.51710,0.064622,14.0,2.639057,0.212121,-1.550597,0.007717,-4.864268
17969,ENSG00000100299,22,51061182,51066607,36,12,422405,2.77490,0.002761,12.0,2.484907,0.333333,-1.098612,0.008163,-4.808111
17970,ENSG00000251322,22,51112843,51171726,169,35,422405,0.62568,0.265760,35.0,3.555348,0.207101,-1.574551,0.006002,-5.115722
17971,ENSG00000100312,22,51176624,51183762,36,10,422405,1.33790,0.090472,10.0,2.302585,0.277778,-1.280934,0.005423,-5.217048


### Project Y to LY

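`compute_Ls` adds a ridge term to each chromosome's gene–gene correlation matrix and takes the Cholesky factor `L` of its inverse. `project_Y` then multiplies each chromosome's MAGMA Z-scores by `L` to obtain `LY`, which is the regression target used below; `project_Y_back` applies the inverse of `L` to map predictions back to the Z-score scale.

TODO: Document the statistical rationale for this projection (presumably it decorrelates the Z-scores of neighbouring genes — to be confirmed).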

In [11]:
def compute_Ls(sigmas, Y):
    """Compute one projection matrix per correlation block.

    A single ridge value is derived from all blocks and added to every
    block's diagonal: the magnitude of the most negative eigenvalue found
    (0 if none is negative), plus 0.05, plus 0.9 times the amount by which
    the variance of ``Y`` exceeds 1 (never negative).

    Parameters
    ----------
    sigmas : iterable of square numpy.ndarray
        Symmetric gene-gene correlation matrices.
    Y : array-like
        Scores whose variance sizes the ridge term.

    Returns
    -------
    list of numpy.ndarray
        For each block, the lower Cholesky factor of the inverse of the
        ridge-regularised block.
    """
    lowest_eigenvalue = 0
    for block in sigmas:
        lowest_eigenvalue = min(lowest_eigenvalue, min(np.linalg.eigvalsh(block)))

    excess_variance = max(0, np.var(Y)-1)
    ridge = abs(lowest_eigenvalue)+.05+.9*excess_variance

    def _factor(block):
        regularised = block+ridge*np.identity(block.shape[0])
        return np.linalg.cholesky(np.linalg.inv(regularised))

    return [_factor(block) for block in sigmas]

In [12]:
# One projection matrix per chromosome; the ZSTAT variance sizes the ridge term.
Ls = compute_Ls(sigmas, magma.ZSTAT)

def project_Y(Ls, magma_Z, gene_lists_by_chrom=None):
    """Project per-gene Z-scores with the per-chromosome matrices ``Ls``.

    Parameters
    ----------
    Ls : sequence of numpy.ndarray
        One square matrix per chromosome, ordered like the gene lists.
    magma_Z : pandas.DataFrame
        Must contain the columns ``GENE`` and ``ZSTAT``. (Previously this
        argument was ignored and the notebook-level ``magma`` was read.)
    gene_lists_by_chrom : sequence of sequences, optional
        Gene IDs per chromosome, in the row/column order of the matching
        matrix in ``Ls``. Defaults to the notebook-level ``gene_lists``.

    Returns
    -------
    pandas.DataFrame
        Columns ``GENE`` and ``LY`` (= ``L @ ZSTAT`` per chromosome),
        concatenated over chromosomes. A gene that is absent from
        ``magma_Z`` contributes NaN to the product.

    Raises
    ------
    ValueError
        If ``Ls`` and the gene lists differ in length.
    """
    if gene_lists_by_chrom is None:
        gene_lists_by_chrom = gene_lists
    if len(Ls) != len(gene_lists_by_chrom):
        raise ValueError(
            f"got {len(Ls)} projection matrices but {len(gene_lists_by_chrom)} gene lists"
        )

    z_by_gene = magma_Z.set_index("GENE")["ZSTAT"]
    LYs = []
    for L, genes in zip(Ls, gene_lists_by_chrom):
        # Reorder the scores to the gene order the matrix was built with.
        z = z_by_gene.reindex(genes).to_numpy(dtype=float)
        LYs.append(pd.DataFrame({"GENE": list(genes), "LY": np.matmul(L, z)}))
    return pd.concat(LYs)

def project_Y_back(Ls, res, gene_lists_by_chrom=None):
    """Map predictions on the projected scale back with the inverse of each ``L``.

    Parameters
    ----------
    Ls : sequence of numpy.ndarray
        One square matrix per chromosome, ordered like the gene lists.
    res : pandas.DataFrame
        Must contain the columns ``GENE`` and ``pred_LY``.
    gene_lists_by_chrom : sequence of sequences, optional
        Gene IDs per chromosome, in the row/column order of the matching
        matrix in ``Ls``. Defaults to the notebook-level ``gene_lists``.

    Returns
    -------
    pandas.DataFrame
        Columns ``GENE`` and ``pred`` for every gene with a non-missing
        ``pred_LY``, concatenated over chromosomes.

    Raises
    ------
    ValueError
        If ``Ls`` and the gene lists differ in length.
    """
    if gene_lists_by_chrom is None:
        gene_lists_by_chrom = gene_lists
    if len(Ls) != len(gene_lists_by_chrom):
        raise ValueError(
            f"got {len(Ls)} projection matrices but {len(gene_lists_by_chrom)} gene lists"
        )

    pred_by_gene = res.set_index("GENE")["pred_LY"]
    preds = []
    for L, genes in zip(Ls, gene_lists_by_chrom):
        L_inv = np.linalg.inv(L)
        pred_LY = pred_by_gene.reindex(genes).to_numpy(dtype=float)
        # One mask drives both the matrix sub-block and the kept rows. The
        # earlier mix of `pred_LY.isna()` and an all-column `dropna()` broke
        # with a shape mismatch whenever another column held a NaN.
        keep = ~np.isnan(pred_LY)
        preds.append(pd.DataFrame(
            {
                "GENE": np.asarray(list(genes), dtype=object)[keep],
                # Rows/columns of the full inverse for the genes that have predictions.
                "pred": np.matmul(L_inv[np.ix_(keep, keep)], pred_LY[keep]),
            },
            index=np.flatnonzero(keep),
        ))
    return pd.concat(preds)

# Append the projected scores as column `LY`. No `on=` is given, so the merge
# joins on the columns both frames share.
magma = magma.merge(project_Y(Ls, magma))

In [13]:
# Inspect the MAGMA table with the projected `LY` column appended.
magma

Unnamed: 0,GENE,CHR,START,STOP,NSNPS,NPARAM,N,ZSTAT,P,genesize,log_genesize,genedensity,log_genedensity,inverse_mac,log_inverse_mac,LY
0,ENSG00000187634,1,860260,879955,91,21,422405,1.89430,0.029092,21.0,3.044522,0.230769,-1.466337,0.013156,-4.330878,0.999534
1,ENSG00000188976,1,879584,894689,66,11,422405,3.07660,0.001047,11.0,2.397895,0.166667,-1.791759,0.019236,-3.950951,1.551349
2,ENSG00000187961,1,895967,901095,37,7,422405,3.00680,0.001320,7.0,1.945910,0.189189,-1.665008,0.016599,-4.098390,1.148646
3,ENSG00000187583,1,901877,911245,49,19,422405,1.90960,0.028095,19.0,2.944439,0.387755,-0.947381,0.011969,-4.425457,0.708156
4,ENSG00000187642,1,910579,917497,28,7,422405,3.89010,0.000050,7.0,1.945910,0.250000,-1.386294,0.006525,-5.032071,1.854531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17968,ENSG00000008735,22,51039114,51052409,66,14,422405,1.51710,0.064622,14.0,2.639057,0.212121,-1.550597,0.007717,-4.864268,0.512377
17969,ENSG00000100299,22,51061182,51066607,36,12,422405,2.77490,0.002761,12.0,2.484907,0.333333,-1.098612,0.008163,-4.808111,1.098611
17970,ENSG00000251322,22,51112843,51171726,169,35,422405,0.62568,0.265760,35.0,3.555348,0.207101,-1.574551,0.006002,-5.115722,0.082490
17971,ENSG00000100312,22,51176624,51183762,36,10,422405,1.33790,0.090472,10.0,2.302585,0.277778,-1.280934,0.005423,-5.217048,0.569190


### Merge Data with Embeddings

In [14]:
# merge with embedding (default inner join: only genes present in both frames are kept)
# NOTE(review): `emb` was indexed by "gene_id" via set_index, so `right_on` resolves
# to an index level name here — confirm the merged frame carries no stray non-numeric
# key column, since every column not explicitly dropped later is used as a feature.
dt = magma.merge(emb, left_on = "GENE", right_on = "gene_id")

## Regression

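Leave-one-chromosome-out cross-validation: for each chromosome 1–22, the model is trained on the genes of all other chromosomes and used to predict `LY` for the held-out chromosome.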

In [15]:
# Columns carried along as gene metadata / targets. Every other column of `dt`
# (control covariates and embedding dimensions) is used as a model feature.
meta_cols = ["GENE", "CHR", "START", "STOP", "NSNPS", "NPARAM", "N", "ZSTAT", "P", "LY"]

chrom_frames = []
for chrom in range(1,23):
    # Hold out one chromosome; train on the remaining ones.
    is_held_out = dt["CHR"] == chrom
    dt_train = dt[~is_held_out]
    dt_test = dt[is_held_out]

    reg = xgb.XGBRegressor(tree_method="hist", reg_lambda = 1000, reg_alpha = 100)
    mod = reg.fit(dt_train.drop(columns=meta_cols), dt_train['LY'])
    pred = mod.predict(dt_test.drop(columns=meta_cols))

    # `.assign` returns a new frame, avoiding assignment into a slice of `dt`
    # (SettingWithCopyWarning).
    df_chrom = dt_test[meta_cols].assign(pred_LY=pred)

    chrom_frames.append(df_chrom)
    # Reported "R2" is the squared Pearson correlation on the projected (LY) scale.
    print(f"Chrom: {chrom}: R2: {scipy.stats.pearsonr(df_chrom.LY, df_chrom.pred_LY)[0]**2}")


df = pd.concat(chrom_frames)
# Map predictions back to the Z-score scale and attach them as column `pred`.
df = df.merge(project_Y_back(Ls, df))

print(f"Overall R2: {scipy.stats.pearsonr(df.ZSTAT, df.pred)[0]**2}")
print()
print("Per chrom R2:")

for i in range(1, 23):
    df_tmp = df.query("CHR == @i")

    print(scipy.stats.pearsonr(df_tmp.ZSTAT, df_tmp.pred)[0]**2)

# df.to_csv(snakemake.output.pred, sep = '\t')

Chrom: 1: R2: 0.07525334525537461
Chrom: 2: R2: 0.11041946486781463
Chrom: 3: R2: 0.07101397097248709
Chrom: 4: R2: 0.06367080705018728
Chrom: 5: R2: 0.07676319218327589
Chrom: 6: R2: 0.03075019478768777
Chrom: 7: R2: 0.053877252939014124
Chrom: 8: R2: 0.07305649129367954
Chrom: 9: R2: 0.06310958093218305
Chrom: 10: R2: 0.07321509777592132
Chrom: 11: R2: 0.05258632445117058
Chrom: 12: R2: 0.05650955600295825
Chrom: 13: R2: 0.09195255493413862
Chrom: 14: R2: 0.06392773534850946
Chrom: 15: R2: 0.06425214710222564
Chrom: 16: R2: 0.0547694050821071
Chrom: 17: R2: 0.0498257195413476
Chrom: 18: R2: 0.041166781884540996
Chrom: 19: R2: 0.05471255920661859
Chrom: 20: R2: 0.05431657493686166
Chrom: 21: R2: 0.2429877283843677
Chrom: 22: R2: 0.010847405385980464
Overall R2: 0.09524026030438075

Per chrom R2:
0.0804065433444994
0.06725134335099976
0.18181764973308354
0.07327736017959016
0.03059141850620522
0.21989611128886694
0.05037218461677569
0.0696986472443637
0.0721257922657933
0.0595016692461

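## Old code

Note: the cells below are kept for reference only. They depend on a `merged_data` frame that is not defined in this notebook, so they fail on a fresh kernel.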

In [16]:
# NOTE(review): `merged_data` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — presumably it was a MAGMA/embedding merge
# from an earlier version; confirm and rebuild it before running this section.
X = merged_data.iloc[:, 1:257]  # Columns 1 to 256 are the embedding features
y = merged_data['ZSTAT']
# Random 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'merged_data' is not defined

In [None]:
# Fit a linear regression baseline on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Predictions of the linear model on the held-out test split
predictions = model.predict(X_test)
predictions

In [None]:
# Evaluate Model: mean squared error and R^2 on the test split
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')

In [None]:
# Perform PCA on the full feature matrix X (fit and transform in one step,
# i.e. before any train/test split)
pca = PCA(n_components=10)  # Set the desired number of principal components
principal_components = pca.fit_transform(X)

In [None]:
# Create DataFrame with Principal Components, columns named PC1..PCk
# (the frame gets a fresh default index, not the index of X)
pc_columns = [f'PC{i}' for i in range(1, pca.n_components_ + 1)]
pc_df = pd.DataFrame(data=principal_components, columns=pc_columns)
pc_df

In [None]:
# Concatenate PCA components with the target.
# `pc_df` has a default 0..n-1 index, so the target is re-indexed positionally;
# concatenating on mismatched indexes would misalign rows and introduce NaNs.
merged_data_pca = pd.concat([pc_df, y.reset_index(drop=True)], axis=1)

# Split Data
# Features are every column except the ZSTAT target. The frame has no GENE
# column, so slicing with `iloc[:, :-2]` would also drop the last principal component.
X_pca = merged_data_pca.drop(columns='ZSTAT')
y_pca = merged_data_pca['ZSTAT']
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y_pca, test_size=0.2, random_state=42)

In [None]:
# Train Linear Regression Model on PCA components (training split only)
model_pca = LinearRegression()
model_pca.fit(X_train_pca, y_train_pca)

In [None]:
# Predictions using PCA components on the held-out test split
predictions_pca = model_pca.predict(X_test_pca)
predictions_pca

In [None]:
# Evaluate Model with PCA: mean squared error and R^2 on the test split
# (note: `mse_pca` is assigned again in the autoencoder comparison cell)
mse_pca = mean_squared_error(y_test_pca, predictions_pca)
r2_pca = r2_score(y_test_pca, predictions_pca)

print(f'Mean Squared Error with PCA: {mse_pca}')
print(f'R2 Score with PCA: {r2_pca}')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# Split Data first, then standardize using statistics of the training rows
# only, so the held-out rows do not leak into the scaler.
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept for any cell that uses the standardized full matrix.
X_standardized = scaler.transform(X)

In [None]:
# PyTorch Autoencoder
class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder.

    ``encoder`` maps ``input_size`` features to ``encoding_dim`` units and
    ``decoder`` maps them back to ``input_size``; both are linear layers.
    """

    def __init__(self, input_size, encoding_dim):
        super().__init__()
        self.encoder = nn.Linear(input_size, encoding_dim)
        self.decoder = nn.Linear(encoding_dim, input_size)

    def forward(self, x):
        """Return the reconstruction: linear encode, ReLU, linear decode."""
        hidden = self.encoder(x).relu()
        return self.decoder(hidden)

# Train Autoencoder
def train_autoencoder(model, criterion, optimizer, data, num_epochs=10):
    """Train ``model`` to reconstruct ``data`` with full-batch updates.

    Each epoch runs one forward pass over all of ``data``, scores the output
    against ``data`` itself with ``criterion``, and takes one optimizer step.
    The loss is printed on every 100th epoch, starting with epoch 0.
    Returns nothing; ``model`` is updated in place.
    """
    epoch = 0
    while epoch < num_epochs:
        model.train()
        optimizer.zero_grad()
        reconstruction = model(data)
        loss = criterion(reconstruction, data)
        loss.backward()
        optimizer.step()

        if not epoch % 100:
            print(f'Epoch {epoch}/{num_epochs}, Loss: {loss.item()}')
        epoch += 1

# Convert the train/test arrays to float32 tensors (no DataLoader is used;
# training runs on the full batch). `torch.as_tensor` with an explicit dtype
# replaces the legacy `torch.Tensor(ndarray)` constructor.
train_data = torch.as_tensor(X_train, dtype=torch.float32)
test_data = torch.as_tensor(X_test, dtype=torch.float32)

# Parameters
input_size = X_train.shape[1]
encoding_dim = 64

# Initialize Autoencoder
# Seed first so the random weight initialisation is reproducible across runs
# (42 matches the random_state used for the train/test split).
torch.manual_seed(42)
autoencoder = Autoencoder(input_size, encoding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

In [None]:
# Train Autoencoder for 500 full-batch epochs on the training tensor
train_autoencoder(autoencoder, criterion, optimizer, train_data, num_epochs=500)

In [None]:
# Evaluate Autoencoder: switch to eval mode and encode the test tensor
# NOTE(review): this takes the raw output of the `encoder` layer, whereas
# `Autoencoder.forward` applies ReLU to it before decoding — confirm which
# representation is intended.
autoencoder.eval()
encoded_data = autoencoder.encoder(test_data).detach().numpy()

In [None]:
# PCA with as many components as the autoencoder's encoding dimension,
# fitted on the training rows and applied to the test rows
# (note: this rebinds `pca`, which held the 10-component PCA above)
pca = PCA(n_components=encoding_dim)
pca.fit(X_train)
pca_data = pca.transform(X_test)

# Compare Autoencoder and PCA by reconstruction error on the test rows
mse_autoencoder = mean_squared_error(test_data.numpy(), autoencoder(test_data).detach().numpy())
mse_pca = mean_squared_error(X_test, pca.inverse_transform(pca_data))

print(f'Mean Squared Error for Autoencoder: {mse_autoencoder}')
print(f'Mean Squared Error for PCA: {mse_pca}')