In [1]:
import os
import pandas as pd
import numpy as np

from pyensembl import EnsemblRelease
# Ensembl annotation handle used by get_gene_name() for gene-ID -> gene-name lookups.
# NOTE(review): no release number is passed, so the annotation version is not pinned
# here — TODO confirm which release is intended and pin it for reproducibility.
ensembl_data = EnsemblRelease(species='mouse')

# Probe -> Ensembl gene-ID mapping table; its `probe_id` and `ensembl_id` columns are
# used by the merge in a later cell. The path is absolute and machine-specific.
probe_ensembl_id = pd.read_csv("C:/Users/petar/Documents/USC/Research/Rong_Lab/scRNAseq/Pre-processing_clustering_annotation/GEXC_pipe/mouse4302ENSEMBL2PROBE.csv")

In [2]:
def readRaw(
    raw_file_path: str,
    raw_type: str,
):
    """
    Read one raw GEXC CSV file and reshape it to long format.

    The file must contain ``Probeset_ID`` and ``Gene Symbol`` columns, which
    are used as the row index. The header of the first remaining column is
    taken as the cell type and that column is relabelled ``Sample_average``;
    all other columns keep their own headers.

    Parameters
    ----------
    raw_file_path : str
        Path of the CSV file to read.
    raw_type : str
        Label written to the ``data_type`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per (probe, data column) with the columns ``Probeset_ID``,
        ``Gene Symbol``, ``cell_type``, ``Sample``, ``value`` and
        ``data_type``.

    Raises
    ------
    IndexError
        If the file has no columns besides the two index columns.
    """
    wide = pd.read_csv(raw_file_path, index_col=['Probeset_ID', 'Gene Symbol'])
    # The header of the first data column doubles as the cell-type label.
    cell_type = wide.columns[0]
    long_df = (
        wide
        .rename(columns={cell_type: 'Sample_average'})
        # carry the cell type along as an extra index level so it survives stack()
        .assign(cell_type=cell_type)
        .set_index('cell_type', append=True)
        # name the column axis up front so the stacked level is called 'Sample'
        # instead of relying on the auto-generated 'level_3' label
        .rename_axis(columns='Sample')
        .stack()
        .rename('value')
        .reset_index()
    )
    long_df['data_type'] = raw_type
    return long_df

def readAllRaw(
    raw_path: str,
    raw_types=('HSPC_simple_young',),
):
    """
    Read every ``.csv`` file of the requested raw data sets into one frame.

    Parameters
    ----------
    raw_path : str
        Directory that contains one sub-directory per raw data set.
    raw_types : str or iterable of str, optional
        Names of the sub-directories to read. Each name is also passed to
        ``readRaw`` as the ``raw_type`` label. Defaults to the single data
        set ``'HSPC_simple_young'``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the frames returned by ``readRaw`` for
        every ``.csv`` file found. The per-file row index is kept as is.

    Raises
    ------
    ValueError
        If no ``.csv`` file is found in any of the requested sub-directories.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(raw_types, str):
        raw_types = [raw_types]

    dfs = []
    searched = []
    for raw_type in raw_types:
        raw_type_path = os.path.join(raw_path, raw_type)
        searched.append(raw_type_path)
        # os.listdir gives no ordering guarantee; sort so row order is reproducible.
        for fil in sorted(os.listdir(raw_type_path)):
            if not fil.endswith('.csv'):
                continue
            dfs.append(readRaw(os.path.join(raw_type_path, fil), raw_type))

    if not dfs:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError(f"No .csv files found in: {', '.join(searched)}")
    return pd.concat(dfs, axis=0)

def get_gene_name(
    gene_id: str,
):
    """
    Return the gene name for an Ensembl gene ID, or NaN when it is unavailable.

    Parameters
    ----------
    gene_id : str
        Ensembl gene ID to look up via ``ensembl_data``. Missing values
        (``None``, NaN) and empty strings are accepted and yield NaN.

    Returns
    -------
    str or float
        The gene name reported by ``ensembl_data.gene_name_of_gene_id``, or
        ``numpy.nan`` if the ID is missing or the lookup raises.
    """
    # Missing IDs cannot be looked up; answer directly instead of relying on
    # the lookup to raise for them.
    if not isinstance(gene_id, str) or not gene_id:
        return np.nan
    try:
        return ensembl_data.gene_name_of_gene_id(gene_id)
    except Exception:
        # Best-effort lookup: a failed lookup becomes NaN, but unlike a bare
        # `except:` this lets KeyboardInterrupt / SystemExit propagate.
        return np.nan

In [3]:
# Load every raw GEXC table found under the raw/ directory into one long-format frame.
df = readAllRaw(
    raw_path="C:/Users/petar/Documents/USC/Research/Rong_Lab/scRNAseq/Pre-processing_clustering_annotation/GEXC_pipe/raw/",
)

In [4]:
# Attach the Ensembl gene ID of each probe; how='left' keeps probes without a mapping.
# NOTE: this cell reassigns `df`, so re-running it without first re-running the
# loading cell merges the mapping table a second time.
df = df.merge(right=probe_ensembl_id, left_on='Probeset_ID', right_on='probe_id', how='left')

# Look up each distinct Ensembl ID once rather than once per row, then broadcast the
# result with map(); rows whose ensembl_id is NaN are not in the lookup and stay NaN.
gene_name_by_id = {
    ensembl_id: get_gene_name(ensembl_id)
    for ensembl_id in df['ensembl_id'].dropna().unique()
}
df['gene_name'] = df['ensembl_id'].map(gene_name_by_id)

In [5]:
# Display the annotated long-format table as the cell output.
df

Unnamed: 0,Probeset_ID,Gene Symbol,cell_type,Sample,value,data_type,probe_id,ensembl_id,gene_name
0,1415670_at,Copg,CLP,Sample_average,8.95,HSPC_simple_young,1415670_at,ENSMUSG00000030058,Copg1
1,1415670_at,Copg,CLP,Sample_,9.00,HSPC_simple_young,1415670_at,ENSMUSG00000030058,Copg1
2,1415670_at,Copg,CLP,Sample_.1,8.81,HSPC_simple_young,1415670_at,ENSMUSG00000030058,Copg1
3,1415670_at,Copg,CLP,Sample_.2,9.03,HSPC_simple_young,1415670_at,ENSMUSG00000030058,Copg1
4,1415671_at,Atp6v0d1,CLP,Sample_average,10.45,HSPC_simple_young,1415671_at,ENSMUSG00000013160,Atp6v0d1
...,...,...,...,...,...,...,...,...,...
1415827,AFFX-TrpnX-5_at,---,MEP,Sample_.2,3.97,HSPC_simple_young,,,
1415828,AFFX-TrpnX-M_at,---,MEP,Sample_average,3.70,HSPC_simple_young,,,
1415829,AFFX-TrpnX-M_at,---,MEP,Sample_,3.75,HSPC_simple_young,,,
1415830,AFFX-TrpnX-M_at,---,MEP,Sample_.1,3.62,HSPC_simple_young,,,


In [6]:
# List the distinct cell types present in the combined table.
df['cell_type'].unique()

array(['CLP', 'CMP', 'Flk2n_MPP', 'Flk2p_MPP', 'GMP', 'HSC', 'MEP'],
      dtype=object)

In [7]:
# Write the annotated table to disk without the row index.
df.to_csv(
    path_or_buf="C:/Users/petar/Documents/USC/Research/Rong_Lab/scRNAseq/Pre-processing_clustering_annotation/GEXC_pipe/HSPC_simple_young_all_preprocessed.csv",
    index=False,
)