In [9]:
library(biomaRt)
library (EDASeq)

source(here::here('scripts/plier_util.R'))

counts_matrix=here::here('data/GSE151282/GSE151282_Raw_gene_counts_matrix.txt')

## TPM Normalization Process

TPM (Transcripts Per Kilobase Million) is a method for normalizing RNA sequencing data. It helps in comparing gene expression levels across different samples. The steps to compute TPM are as follows:

1. **Load the Data**: Import your gene count data into R.
2. **Calculate Gene Lengths**: Obtain or calculate the length of each gene in kilobases. This data is necessary for the normalization process.
3. **Compute Scaled Reads**: Divide each gene's read count by its length in kilobases to account for gene length and get scaled reads.
4. **Sum Scaled Reads**: Calculate the sum of all the scaled reads across all genes in a sample. This sum will be used to normalize the read counts so that they are comparable across samples.
5. **Calculate TPM**: Normalize each gene's scaled reads by the sum of scaled reads across all genes and multiply by 1,000,000. This final step adjusts for the total amount of transcript in each sample, allowing for comparisons across samples.


In [10]:
# Read the data using read_delim from the readr package
gene_counts <- read.table(counts_matrix, header = TRUE, sep = "\t", check.names = FALSE)

# Display the first few rows of the data frame
head(gene_counts)

Unnamed: 0_level_0,Geneid,GeneSymbol,A2_T21,A1_T21,B1_N,B2_N,A3_T21,A4_T21,B4_N,B3_N
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,ENSG00000223972,DDX11L1,0,1,3,0,0,1,3,1
2,ENSG00000227232,WASH7P,12,56,21,6,1,1,11,3
3,ENSG00000278267,MIR6859-1,0,2,3,1,0,0,2,1
4,ENSG00000243485,MIR1302-2HG,0,0,0,0,0,0,0,0
5,ENSG00000284332,MIR1302-2,0,0,0,0,0,0,0,0
6,ENSG00000237613,FAM138A,0,0,0,1,0,0,0,0


In [11]:
library(biomaRt)
ensembl_list <- gene_counts$Geneid
human <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
gene_coords=getBM(attributes=c("hgnc_symbol","ensembl_gene_id", "start_position","end_position"), filters="ensembl_gene_id", values=ensembl_list, mart=human)
gene_coords$size=gene_coords$end_position - gene_coords$start_position

In [12]:
gene_coords=gene_coords[c('ensembl_gene_id', 'size')]
colnames(gene_coords) = c('Geneid', 'size')
head(gene_coords)

Unnamed: 0_level_0,Geneid,size
Unnamed: 0_level_1,<chr>,<int>
1,ENSG00000000457,44636
2,ENSG00000000460,192073
3,ENSG00000000938,23121
4,ENSG00000000971,100722
5,ENSG00000001460,59935
6,ENSG00000001461,57174


In [13]:
m_gene_counts = dplyr::left_join(gene_counts, gene_coords)
m_gene_counts <- m_gene_counts[!duplicated(m_gene_counts["GeneSymbol"]),]
rownames(m_gene_counts) = m_gene_counts$GeneSymbol
m_gene_counts = na.omit(m_gene_counts)
head(m_gene_counts)

[1m[22mJoining with `by = join_by(Geneid)`


Unnamed: 0_level_0,Geneid,GeneSymbol,A2_T21,A1_T21,B1_N,B2_N,A3_T21,A4_T21,B4_N,B3_N,size
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
DDX11L1,ENSG00000223972,DDX11L1,0,1,3,0,0,1,3,1,1660
WASH7P,ENSG00000227232,WASH7P,12,56,21,6,1,1,11,3,10190
MIR6859-1,ENSG00000278267,MIR6859-1,0,2,3,1,0,0,2,1,67
MIR1302-2HG,ENSG00000243485,MIR1302-2HG,0,0,0,0,0,0,0,0,1555
MIR1302-2,ENSG00000284332,MIR1302-2,0,0,0,0,0,0,0,0,137
FAM138A,ENSG00000237613,FAM138A,0,0,0,1,0,0,0,0,1527


In [14]:
m_gene_counts_sub = subset(m_gene_counts, select = -c(Geneid, GeneSymbol, size))
head(m_gene_counts_sub)

Unnamed: 0_level_0,A2_T21,A1_T21,B1_N,B2_N,A3_T21,A4_T21,B4_N,B3_N
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
DDX11L1,0,1,3,0,0,1,3,1
WASH7P,12,56,21,6,1,1,11,3
MIR6859-1,0,2,3,1,0,0,2,1
MIR1302-2HG,0,0,0,0,0,0,0,0
MIR1302-2,0,0,0,0,0,0,0,0
FAM138A,0,0,0,1,0,0,0,0


In [15]:
tpm <- function(counts,len) {
  x <- counts/len
  return(t(t(x)*1e6/colSums(x)))
}

gene_length = m_gene_counts$size
tpm_gene_counts = tpm(m_gene_counts_sub, gene_length)
tpm_gene_counts = data.frame(tpm_gene_counts)
head(tpm_gene_counts)

Unnamed: 0_level_0,A2_T21,A1_T21,B1_N,B2_N,A3_T21,A4_T21,B4_N,B3_N
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
DDX11L1,0.0,0.1562536,0.4171173,0.0,0.0,0.08917092,0.2520165,0.09060845
WASH7P,0.3952496,1.4254501,0.4756529,0.1417166,0.0147832,0.01452637,0.1505339,0.04428165
MIR6859-1,0.0,7.7427165,10.3345477,3.5922695,0.0,0.0,4.1626608,2.24492569
MIR1302-2HG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIR1302-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM138A,0.0,0.0,0.0,0.1576176,0.0,0.0,0.0,0.0


In [16]:
tpm_gene_counts = as.matrix(tpm_gene_counts)

In [36]:
multiplier_z = readRDS('/home/msubirana/Documents/pivlab/plier_recount3/data/multiplier/multiplier_model_z.rds')
multiplier_summary = readRDS('/home/msubirana/Documents/pivlab/plier_recount3/data/multiplier/multiplier_model_summary.rds')
multiplier_b = readRDS('/home/msubirana/Documents/pivlab/plier_recount3/data/multiplier/multiplier_model_b.rds')

multiplier_model = list('Z'=multiplier_z, 'L2'=multiplier_summary$`LV index`, 'B'=multiplier_b)

In [37]:
str(multiplier_summary)

'data.frame':	2157 obs. of  5 variables:
 $ pathway : chr  "KEGG_LYSINE_DEGRADATION" "REACTOME_MRNA_SPLICING" "MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX" "KEGG_DNA_REPLICATION" ...
 $ LV index: chr  "1" "1" "1" "1" ...
 $ AUC     : num  0.388 0.733 0.681 0.549 0.639 ...
 $ p-value : num  8.66e-01 4.77e-05 1.63e-03 3.12e-01 2.17e-02 ...
 $ FDR     : num  0.956005 0.000582 0.011366 0.539951 0.083739 ...


In [38]:
ordered_tpm_gene_counts = GetOrderedRowNorm(tpm_gene_counts, multiplier_model)
head(ordered_tpm_gene_counts)

Unnamed: 0,A2_T21,A1_T21,B1_N,B2_N,A3_T21,A4_T21,B4_N,B3_N
GAS6,0.38525105,2.335843,-0.2671489,-0.3486633,-0.7588974,-0.5696715,-0.4023682,-0.3743446
MMP14,0.02997315,2.304308,0.3856067,-0.3739623,-0.6623636,-0.6667964,-0.5238539,-0.4929113
DSP,-0.74662704,2.328754,-0.3691698,-0.5498219,-0.6850377,-0.2624724,0.1085772,0.1757974
MARCKSL1,0.44470145,2.178579,0.3538959,-0.254194,-0.7716732,-0.6781772,-0.6235204,-0.6496117
SPARC,0.48548818,1.550855,0.4871936,0.9012518,-1.1533147,-1.1079444,-0.3881671,-0.7753627
CTSD,0.79732434,1.986567,0.179082,0.1025432,-0.8844978,-0.7575774,-0.6574565,-0.7659845


In [39]:
head(multiplier_z)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
GAS6,0.0,0,0.03943774,0,0.05047625,0.0,0.0,0.0,0.5909494,0.0,⋯,0.0501251,0.0,0.033407371,0.0,0.0,0.0059633917,0.34736209,0,0.0,0.0
MMP14,0.0,0,0.0,0,0.07007159,0.0,0.0,0.004904131,1.7201788,2.42359463,⋯,0.0,0.0,0.001007286,0.0,0.03574724,0.0,0.0,0,0.01497801,0.0
DSP,0.0,0,0.0,0,0.0,0.04169683,0.0,0.005718149,0.0,0.0,⋯,0.02085321,0.0,0.0,0.0,0.0,0.0057744399,0.0,0,0.0,0.41640455
MARCKSL1,0.3052117,0,0.0,0,0.0,0.0,0.0,0.0,0.1618435,0.14947148,⋯,0.02713418,0.05271997,0.0,0.03018947,0.06088351,0.0,0.0,0,0.0,0.44847996
SPARC,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.01401441,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.06777859,0,0.12241734,0.0626649
CTSD,0.2778532,0,0.06459781,0,0.0,0.0,0.4694809,0.099949232,0.9717295,0.43334911,⋯,0.0,0.01907577,0.008904815,0.0,0.0,0.0007222884,0.0,0,0.0,0.06193245


In [40]:
exprs.mat=ordered_tpm_gene_counts
plier.model=multiplier_model
require(PLIER)

In [41]:
ord.rownorm <- GetOrderedRowNorm(exprs.mat, plier.model) 
head(ord.rownorm)

Unnamed: 0,A2_T21,A1_T21,B1_N,B2_N,A3_T21,A4_T21,B4_N,B3_N
GAS6,0.38525105,2.335843,-0.2671489,-0.3486633,-0.7588974,-0.5696715,-0.4023682,-0.3743446
MMP14,0.02997315,2.304308,0.3856067,-0.3739623,-0.6623636,-0.6667964,-0.5238539,-0.4929113
DSP,-0.74662704,2.328754,-0.3691698,-0.5498219,-0.6850377,-0.2624724,0.1085772,0.1757974
MARCKSL1,0.44470145,2.178579,0.3538959,-0.254194,-0.7716732,-0.6781772,-0.6235204,-0.6496117
SPARC,0.48548818,1.550855,0.4871936,0.9012518,-1.1533147,-1.1079444,-0.3881671,-0.7753627
CTSD,0.79732434,1.986567,0.179082,0.1025432,-0.8844978,-0.7575774,-0.6574565,-0.7659845


In [42]:
z.mat <- plier.model$Z
head(z.mat)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
GAS6,0.0,0,0.03943774,0,0.05047625,0.0,0.0,0.0,0.5909494,0.0,⋯,0.0501251,0.0,0.033407371,0.0,0.0,0.0059633917,0.34736209,0,0.0,0.0
MMP14,0.0,0,0.0,0,0.07007159,0.0,0.0,0.004904131,1.7201788,2.42359463,⋯,0.0,0.0,0.001007286,0.0,0.03574724,0.0,0.0,0,0.01497801,0.0
DSP,0.0,0,0.0,0,0.0,0.04169683,0.0,0.005718149,0.0,0.0,⋯,0.02085321,0.0,0.0,0.0,0.0,0.0057744399,0.0,0,0.0,0.41640455
MARCKSL1,0.3052117,0,0.0,0,0.0,0.0,0.0,0.0,0.1618435,0.14947148,⋯,0.02713418,0.05271997,0.0,0.03018947,0.06088351,0.0,0.0,0,0.0,0.44847996
SPARC,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.01401441,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.06777859,0,0.12241734,0.0626649
CTSD,0.2778532,0,0.06459781,0,0.0,0.0,0.4694809,0.099949232,0.9717295,0.43334911,⋯,0.0,0.01907577,0.008904815,0.0,0.0,0.0007222884,0.0,0,0.0,0.06193245


In [52]:
head(plier.model$L2)

In [54]:
6 * diag(ncol(z.mat)) 

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
6,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,6,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,6,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,6,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,6,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,6,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,6,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,6,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,6,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,6,⋯,0,0,0,0,0,0,0,0,0,0


In [43]:
exprs.new.b <-
solve(t(z.mat) %*% z.mat + plier.model$L2 * diag(ncol(z.mat))) %*%
t(z.mat) %*% ord.rownorm

head(exprs.new.b)

ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'a' in selecting a method for function 'solve': non-numeric argument to binary operator


In [None]:
rownames(exprs.new.b) <- rownames(plier.model$B)
head(exprs.new.b)

In [28]:
class(ordered_tpm_gene_counts)

In [35]:
GetNewDataB(ordered_tpm_gene_counts, multiplier_model)

ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'a' in selecting a method for function 'solve': non-numeric argument to binary operator


In [126]:
head(ordered_tpm)
dim(ordered_tpm)

Unnamed: 0_level_0,A2_T21,A1_T21,B1_N,B2_N,A3_T21,A4_T21,B4_N,B3_N
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GAS6,0.06168728,0.1668543,0.02651284,0.02211796,0.0,0.0102022,0.01922243,0.02073334
MMP14,0.13445749,0.4572055,0.18492501,0.0771355,0.036208833,0.03557978,0.05586457,0.0602556
DSP,0.0,0.1657715,0.02034599,0.01060834,0.003319835,0.02609727,0.04609787,0.04972123
MARCKSL1,19.76784572,45.6137852,18.41425891,9.3498043,1.636032263,3.02972558,3.84446239,3.45553516
SPARC,12.38569146,19.3741441,12.39687818,15.11296209,1.635691856,1.93330587,6.65480438,4.11493084
CTSD,14.50878547,23.9391484,9.60629478,8.99936301,1.172401248,2.17884527,2.97277575,2.11217863


In [128]:
multiplier_z=subset(tmp_multiplier_z, select = -c(rowname))
head(multiplier_z)
dim(multiplier_z)

ERROR: Error in eval(substitute(select), nl, parent.frame()): object 'rowname' not found


## Projections
https://github.com/pivlab/phenoplier/blob/main/nbs/05_projections/015_05-multiplier-emerge.ipynb

In [113]:
l2 = 45.5

In [114]:
zcov <- t(as.matrix(multiplier_z)) %*% as.matrix(multiplier_z)
dim(zcov)

In [115]:
l2mat <- l2 * diag(ncol(as.matrix(multiplier_z)))
dim(l2mat)

In [116]:
zinv <- solve(zcov + l2mat)

In [117]:
head(ordered_tpm)

Unnamed: 0_level_0,A2_T21,A1_T21,B1_N,B2_N,A3_T21,A4_T21,B4_N,B3_N
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GAS6,0.06168728,0.1668543,0.02651284,0.02211796,0.0,0.0102022,0.01922243,0.02073334
MMP14,0.13445749,0.4572055,0.18492501,0.0771355,0.036208833,0.03557978,0.05586457,0.0602556
DSP,0.0,0.1657715,0.02034599,0.01060834,0.003319835,0.02609727,0.04609787,0.04972123
MARCKSL1,19.76784572,45.6137852,18.41425891,9.3498043,1.636032263,3.02972558,3.84446239,3.45553516
SPARC,12.38569146,19.3741441,12.39687818,15.11296209,1.635691856,1.93330587,6.65480438,4.11493084
CTSD,14.50878547,23.9391484,9.60629478,8.99936301,1.172401248,2.17884527,2.97277575,2.11217863


In [120]:
newb <- zinv %*% t(as.matrix(multiplier_z)) %*% as.matrix(ordered_tpm)

In [121]:
head(newb)

Unnamed: 0,A2_T21,A1_T21,B1_N,B2_N,A3_T21,A4_T21,B4_N,B3_N
X1,14.020302,17.5954569,14.90392,10.6691668,15.297784,9.101102,11.445383,8.570735
X2,27.320242,34.9233072,20.6594,13.5925847,23.186054,13.405826,16.619541,10.035201
X3,1.081143,0.9214704,1.36611,0.9131165,2.253863,1.094393,1.462922,1.263021
X4,9.177443,10.6876404,10.04862,8.2134004,11.491121,6.335041,7.970505,4.471863
X5,-11.044831,-12.4719673,-13.52302,-10.5260271,-18.766835,-9.698129,-11.870208,-8.364079
X6,-15.030297,-12.7201054,-13.4136,-9.016606,-21.21063,-10.512147,-13.949271,-5.23102


In [105]:
dim(newb)

In [106]:
dim(ordered_tpm)

In [1]:
from multiplier import MultiplierProjection

ERROR: Error in parse(text = x, srcfile = src): <text>:1:6: unexpected symbol
1: from multiplier
         ^


In [None]:
mproj = MultiplierProjection()
smultixcan_into_multiplier = mproj.transform(smultixcan_results)