In [6]:
# Standard library imports
import sys
import asyncio
import aiohttp
import aiofiles

# Related third party imports
from tqdm import tqdm
import pandas as pd

# Local application/library specific imports.
# paradoxcell
sys.path.append('/10Gdata/sjt')
import paradoxcell as pc

In [2]:
# data download from gui
path_ea_directroy = '/10Gdata/sjt/data/database/expression_atlas/condensed_plant_baseline_107'

data_ea_condensed_sdrf_where_plant_baseline = pc.DataFile(
    path_ea_directroy,
    'condensed-sdrf.tsv',
    'iso-8859-1',
    '\t'
    )

df_ea_condensed_sdrf_where_plant_baseline = data_ea_condensed_sdrf_where_plant_baseline.get_df_open()

df_ea_condensed_sdrf_where_plant_baseline.replace('organism [part','organism part',inplace=True)
df_ea_condensed_sdrf_where_plant_baseline.replace('wild type genotype','wild type',inplace=True)
df_ea_condensed_sdrf_where_plant_baseline.drop_duplicates(inplace=True)
df_ea_condensed_sdrf_where_plant_baseline.drop([3],axis=1,inplace=True)

df_ea_condensed_sdrf_where_plant_baseline.columns = ['study','run','characteristic_key','characteristic_value','etc']
df_ea_condensed_sdrf_where_plant_baseline

Unnamed: 0,study,run,characteristic_key,characteristic_value,etc
0,E-CURD-1,ERR274309,age,not available,
1,E-CURD-1,ERR274309,developmental stage,LP.04 four leaves visible stage,http://purl.obolibrary.org/obo/PO_0007115
2,E-CURD-1,ERR274309,ecotype,Col-0,http://www.ebi.ac.uk/efo/EFO_0005148
3,E-CURD-1,ERR274309,genotype,wild type,
4,E-CURD-1,ERR274309,growth condition,10/14 h light/dark period and a light intensit...,
...,...,...,...,...,...
24009,E-MTAB-964,ERR163136,cultivar,Zheng58 inbred line,
24010,E-MTAB-964,ERR163136,developmental stage,flowering stage,http://purl.obolibrary.org/obo/PO_0007616
24011,E-MTAB-964,ERR163136,organism,Zea mays,http://purl.obolibrary.org/obo/NCBITaxon_4577
24012,E-MTAB-964,ERR163136,organism part,immature style,


In [21]:
columns_df_ea_condensed_sdrf_where_run_characteristics = list(set(df_ea_condensed_sdrf_where_plant_baseline['characteristic_key']))
columns_df_ea_condensed_sdrf_where_run_characteristics.append('run')
columns_df_ea_condensed_sdrf_where_run_characteristics = sorted(columns_df_ea_condensed_sdrf_where_run_characteristics)

list_run_characteristic = list()
for i in tqdm(list(set(df_ea_condensed_sdrf_where_plant_baseline['run']))):
    tmp_dict_run_characteristic = dict()
    tmp_dict_run_characteristic['run'] = i
    for index,row in df_ea_condensed_sdrf_where_plant_baseline.set_index('run').loc[i][['characteristic_key','characteristic_value']].iterrows():
        tmp_dict_run_characteristic[row['characteristic_key']] = row['characteristic_value']
    list_run_characteristic.append(tmp_dict_run_characteristic)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3405/3405 [00:19<00:00, 171.79it/s]


In [23]:
df_ea_condensed_sdrf_where_run_characteristics = pd.DataFrame(columns=columns_df_ea_condensed_sdrf_where_run_characteristics).set_index('run')

for i in tqdm(list_run_characteristic):
    df = pd.DataFrame(i,index=[i['run']])
    df_ea_condensed_sdrf_where_run_characteristics = pd.concat([df_ea_condensed_sdrf_where_run_characteristics,df])

df_ea_condensed_sdrf_where_run_characteristics

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3405/3405 [00:08<00:00, 409.06it/s]


In [None]:
df_stringid_uniprotid_ensemblgenomeid_og_for_eggnog33090 = pd.read_csv('/10Gdata/sjt/data/dataframe/df_stringid_uniprotid_ensemblgenomeid_og_for_eggnog33090.tsv',sep='\t')
dic_ensemblgenomeid_og = dict()

for key,value in zip(df_stringid_uniprotid_ensemblgenomeid_og_for_eggnog33090['ensemblgenomeid'],df_stringid_uniprotid_ensemblgenomeid_og_for_eggnog33090['og']):
    dic_ensemblgenomeid_og[key] = value

def get_df_median(df):
    col = df.columns
    for i in df.columns[2:]:
        df[i] = df[i].apply(lambda x : np.median(np.array([float(v) for v in x.split(',')])))
    return df

def get_og_column(x):
    try:
        return dic_ensemblgenomeid_og[x]
    except:
        pass
    
def get_df_dataset_tpm(configurationfile,tpmsfile):
    tree = ET.parse(configurationfile)
    root = tree.getroot()
    dic_id_assays = dict()
    for assay_group in root[0][0].findall('assay_group'):
        id = assay_group.get('id')
        assays = list()
        for assay in assay_group:
            assays.append(assay.text)
        dic_id_assays[id] = ",".join(assays)

    df = pd.read_csv(tpmsfile,sep='\t')
    df = get_df_median(df)
    df.rename(columns=dic_id_assays,inplace=True)
    df['og'] = df['GeneID'].apply(lambda x : get_og_column(x))
    df = df.dropna(subset=['og']).set_index(['og']).groupby(level=0).sum()
    return df.sort_index()

In [None]:
ea_plant_baseline_configurationfiles = pc.DataFile(
    '/10Gdata/sjt/data/database/expression_atlas/condensed_plant_baseline_107',
    'configuration.xml',
    'iso-8859-1',
    '\t'
)

ea_plant_baseline_tpmfiles = pc.DataFile(
    '/10Gdata/sjt/data/database/expression_atlas/condensed_plant_baseline_107',
    'tpms.tsv',
    'iso-8859-1',
    '\t'
)

configurationfiles = sorted(ea_plant_baseline_configurationfiles.get_path())
tpmfiles = sorted(ea_plant_baseline_tpmfiles.get_path())

In [None]:
df_dataset_tpm = pd.DataFrame()

for configurationfile,tpmfile in tqdm(zip(configurationfiles,tpmfiles)):
    df = get_df_dataset_tpm(configurationfile,tpmfile)
    df_dataset_tpm = pd.merge(df_dataset_tpm,df,left_index=True,right_index=True,how='outer')

df_dataset_tpm.drop([i for i in df_dataset_tpm.columns if i.startswith('Gene Name')],axis=1,inplace=True)

df_dataset_tpm