# First step

making count and TPM matrix using RSEM files.

you need to keep all the RSEM files in folder near to this script called "rsemFiles" and then run this script

In [1]:
import os
import pandas as pd
import collections
import numpy as np
import argparse
from pandas.api.types import CategoricalDtype
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from bioinfokit.visuz import cluster

In [2]:
# get RSEM files to prepare RNA matrixs(TPM and count)
def prepare_data_RNA_seq(RSEM_directory):
    RSEM_filenames = os.listdir(RSEM_directory)
    geneID = None
    exp = []
    count = []

    for RSEM_file in RSEM_filenames:
        print(RSEM_file)
        metadata_RSEM = RSEM_file.split("_")

        file = pd.read_csv(RSEM_directory + '/' + RSEM_file, sep='\t').values
        if geneID is None:
            geneID = list(file[:, 0])
        elif collections.Counter(geneID) != collections.Counter(file[:, 0]):
            print("order of gene ID is not match!")

        exp.append(list(file[:, 5]))
        count.append(list(file[:, 4]))

    expData = pd.DataFrame(np.array(exp).transpose(), columns=RSEM_filenames, index=geneID)
    countData = pd.DataFrame(np.array(count).transpose(), columns=RSEM_filenames, index=geneID)

    expData.to_csv("expressionList.csv")
    countData.to_csv("countMatrix.csv")

In [3]:
RSEM_directory = "rsemFiles"
prepare_data_RNA_seq(RSEM_directory)

hipp_M_5XFADHEMI_12mo_B2_12802_S40_rsem.genes.results
hipp_M_5xFADWT_12mo_B2_12435_S35_rsem.genes.results
hipp_F_Bin1_HO_12mo_B1_13033_S21_rsem.genes.results
hipp_F_5xFADHEMI_Bin1_HO_4mo_B2_13019_S32_rsem.genes.results
hipp_F_5xFADWT_12mo_B1_12441_S14_rsem.genes.results
hipp_M_5xFADHEMI_Bin1_HO_4mo_B2_13044_S37_rsem.genes.results
hipp_M_5xFADWT_12mo_B1_12419_S3_rsem.genes.results
hipp_F_5xFADWT_12mo_B2_12426_S46_rsem.genes.results
hipp_F_5xFADHEMI_4mo_B1_11616_S17_rsem.genes.results
hipp_M_5xFADWT_4mo_B1_11624_S30_rsem.genes.results
hipp_M_5xFADHEMI_4mo_B1_11631_S24_rsem.genes.results
hipp_M_5XFADHEMI_12mo_B1_12424_S17_rsem.genes.results
hipp_F_5xFADHEMI_Bin1_HO_12mo_B2_12981_S28_rsem.genes.results
hipp_M_5xFADHEMI_Bin1_HO_4mo_B1_13034_S3_rsem.genes.results
hipp_M_5xFADHEMI_4mo_B1_11622_S23_rsem.genes.results
hipp_M_5xFADWT_4mo_B1_11636_S31_rsem.genes.results
hipp_F_Bin1_HO_12mo_B2_13033_S37_rsem.genes.results
hipp_M_5xFADHEMI_Bin1_HO_12mo_B1_12977_S8_rsem.genes.results
hipp_M_Bin1_HO_

In [4]:
expressionList = pd.read_csv('expressionList.csv', index_col=0)
countMatrix = pd.read_csv("countMatrix.csv", index_col=0)

expressionList.columns = expressionList.columns.str.replace('BIN1HO', 'Bin1HO')
expressionList.columns = expressionList.columns.str.replace('Bin1_HO', 'Bin1HO')
expressionList.columns = expressionList.columns.str.replace('M_Bin1HO', 'M_5xFADWT:Bin1HO')
expressionList.columns = expressionList.columns.str.replace('F_Bin1HO', 'F_5xFADWT:Bin1HO')
expressionList.columns = expressionList.columns.str.replace('5XFADHEMI', '5xFADHEMI')
expressionList.columns = expressionList.columns.str.replace('5xFADHEMI_Bin1HO', '5xFADHEMI:Bin1HO')

countMatrix.columns = expressionList.columns

expressionList.to_csv("expressionList.csv")
countMatrix.to_csv("countMatrix.csv")

## make dataTrait

In [5]:
## create metadata
expressionList = pd.read_csv('expressionList.csv', index_col=0)

metadata = []

for RSEM_file in expressionList.columns:
    metadata_RSEM = RSEM_file.split("_")
    metadata.append([RSEM_file, metadata_RSEM[3], metadata_RSEM[0], metadata_RSEM[1], metadata_RSEM[2]])

RNA_metadata = pd.DataFrame(metadata, columns=['file name', 'Time point', 'Tissue', 'Sex', 'Group'])
RNA_metadata.to_csv("experimentList.csv")
RNA_metadata

Unnamed: 0,file name,Time point,Tissue,Sex,Group
0,hipp_M_5xFADHEMI_12mo_B2_12802_S40_rsem.genes....,12mo,hipp,M,5xFADHEMI
1,hipp_M_5xFADWT_12mo_B2_12435_S35_rsem.genes.re...,12mo,hipp,M,5xFADWT
2,hipp_F_5xFADWT:Bin1HO_12mo_B1_13033_S21_rsem.g...,12mo,hipp,F,5xFADWT:Bin1HO
3,hipp_F_5xFADHEMI:Bin1HO_4mo_B2_13019_S32_rsem....,4mo,hipp,F,5xFADHEMI:Bin1HO
4,hipp_F_5xFADWT_12mo_B1_12441_S14_rsem.genes.re...,12mo,hipp,F,5xFADWT
...,...,...,...,...,...
80,hipp_F_5xFADWT:Bin1HO_12mo_B1_13030_S13_rsem.g...,12mo,hipp,F,5xFADWT:Bin1HO
81,hipp_F_5xFADHEMI_12mo_B2_12452_S42_rsem.genes....,12mo,hipp,F,5xFADHEMI
82,hipp_F_5xFADHEMI:Bin1HO_12mo_B2_13035_S43_rsem...,12mo,hipp,F,5xFADHEMI:Bin1HO
83,hipp_M_5xFADWT:Bin1HO_4mo_B1_13037_S13_rsem.ge...,4mo,hipp,M,5xFADWT:Bin1HO


## Sort TPM and count matrix

In [6]:
RNA_metadata = pd.read_csv("experimentList.csv", index_col=0)

cat_time_order = CategoricalDtype(['4mo', '12mo'], ordered=True)
RNA_metadata['Time point'] = RNA_metadata['Time point'].astype(cat_time_order)

RNA_metadata.sort_values(['Tissue', 'Time point', 'Sex', 'Group'], inplace=True)
RNA_metadata.reset_index(drop=True, inplace=True)
RNA_metadata.to_csv("experimentList_sorted.csv")

expressionList = pd.read_csv('expressionList.csv', index_col=0)
countMatrix = pd.read_csv("countMatrix.csv", index_col=0)

expressionList = expressionList[RNA_metadata['file name'].tolist()]
countMatrix = countMatrix[RNA_metadata['file name'].tolist()]

expressionList.to_csv("expressionList_sorted.csv")
countMatrix.to_csv("countMatrix_sorted.csv")

## Add gene name and gene biotype

in this section, we will add gene name and gene biotype to our count and TPM matrix

In [7]:
geneList = pd.read_csv('../genelist.vM21.annotation.tsv', sep='\t', index_col=0)

expressionList = pd.read_csv('expressionList_sorted.csv', index_col=0)
expressionList = pd.concat([expressionList, geneList], axis=1)
cols = expressionList.columns.tolist()
cols = cols[-2:] + cols[:-2]
expressionList = expressionList[cols]
expressionList.to_csv("expressionList_sorted.csv")

countMatrix = pd.read_csv('countMatrix_sorted.csv', index_col=0)
countMatrix = pd.concat([countMatrix, geneList], axis=1)
cols = countMatrix.columns.tolist()
cols = cols[-2:] + cols[:-2]
countMatrix = countMatrix[cols]
countMatrix.to_csv("countMatrix_sorted.csv")

## subset ployA genes
subseting genes to be only protein_coding or lincRNA.

In [8]:
expressionList = pd.read_csv('expressionList_sorted.csv', index_col=0)
expressionList = expressionList[np.logical_or(expressionList.gene_type == "protein_coding",
                                              expressionList.gene_type == "lincRNA")]
expressionList.to_csv("expressionList_sorted_polyA.csv")

countMatrix = pd.read_csv('countMatrix_sorted.csv', index_col=0)
countMatrix = countMatrix[np.logical_or(countMatrix.gene_type == "protein_coding",
                                        countMatrix.gene_type == "lincRNA")]
countMatrix.to_csv("countMatrix_sorted_polyA.csv")