## Notebook to prep info for Pools and Samples

In [1]:
# shell out to `date` so the run's timestamp is captured in the cell output
!date

Sat Jul 17 18:35:19 EDT 2021


#### import libraries and set notebook variables

In [2]:
import pandas as pd

In [3]:
# cohort label; it names both the working directory and the output file
cohort = 'aging'

# constants: full tissue bank name -> short abbreviation
bank_abbrs = {
    'NIMH Human Brain Collection Core': 'NHBCC',
    'U. Pittsburgh Brain Tissue Donation Program': 'UPBTDP',
    'U. Maryland Brain & Tissue Bank': 'UMBTB',
}

# directory layout: <home_dir>/<cohort>/sample_info
home_dir = '/labshare/raph/datasets/adrd_neuro'
wrk_dir = f'{home_dir}/{cohort}'
info_dir = f'{wrk_dir}/sample_info'

# inputs: pool membership table and per-sample patient information
pool_file = f'{info_dir}/pool_info.csv'
sample_file = f'{info_dir}/patient_sample_info.csv'

# output: the merged pool + patient sample table
info_file = f'{info_dir}/{cohort}.pool_patient_sample_info.csv'

#### read the input data files

In [4]:
# load the pool table, then report its (rows, columns) and preview the first rows
pool_df = pd.read_csv(filepath_or_buffer=pool_file)
print(pool_df.shape)
display(pool_df.head(n=5))

(48, 3)


Unnamed: 0,Pool_no,Sample_no,Sequence_type
0,POOL01,Aging111,RNA
1,POOL01,Aging095,RNA
2,POOL01,Aging080,RNA
3,POOL01,Aging088,RNA
4,POOL01,Aging099,RNA


In [5]:
# load the per-sample patient information and report its (rows, columns)
sample_df = pd.read_csv(filepath_or_buffer=sample_file)
print(sample_df.shape)

(118, 7)


#### for each pool, create a pool name from the pool id

In [6]:
def _format_pool_name(x):
    """Format an integer pool number as 'P' plus the number zero-padded to a minimum width of 3."""
    return f'P{x:03d}'

# strip the 'POOL' prefix and read what remains as an integer pool number
pool_numbers = pool_df['Pool_no'].str.replace('POOL','').astype('int32')
pool_df['pool_name'] = pool_numbers.map(_format_pool_name)
print(pool_df.shape)

(48, 4)


#### for the sample info data frame, create a donor id from the tissue bank abbreviation and the tissue bank's source id

In [7]:
# translate each tissue bank's full name into its abbreviation
bank_codes = sample_df['Tissue_source'].map(bank_abbrs)

# fail loudly when a tissue bank has no entry in bank_abbrs; without this check
# the lookup yields a missing value and the donor_id comes out missing instead
# of raising an error
unknown_banks = sample_df.loc[bank_codes.isna(), 'Tissue_source'].unique()
if len(unknown_banks) > 0:
    raise ValueError(f'no abbreviation in bank_abbrs for tissue source(s): {unknown_banks.tolist()}')

# donor id is '<bank abbreviation>-<source id>'
sample_df['donor_id'] = bank_codes + '-' + sample_df['Source_id'].astype('string')
print(sample_df.shape)
display(sample_df.sample(10))

(118, 8)


Unnamed: 0,Sample_id,Source_id,Tissue_source,Brain_region,Clinical_diagnosis,Age,Sex,donor_id
60,Aging061,2521,NIMH Human Brain Collection Core,Subgenual anterior cingulate cortex,Healthy,51.1,Male,NHBCC-2521
6,Aging007,2972,NIMH Human Brain Collection Core,Locus coeruleus,Healthy,49.9,Male,NHBCC-2972
94,Aging095,1604,NIMH Human Brain Collection Core,Middle temporal gyrus,Healthy,27.2,Male,NHBCC-1604
24,Aging025,5079,U. Maryland Brain & Tissue Bank,Frontal cortex,Healthy,33.2,Male,UMBTB-5079
17,Aging018,2936,NIMH Human Brain Collection Core,Raphe nucleus,Healthy,66.1,Female,NHBCC-2936
16,Aging017,2936,NIMH Human Brain Collection Core,Locus coeruleus,Healthy,66.1,Female,NHBCC-2936
71,Aging072,831,NIMH Human Brain Collection Core,Subventricular zone,Healthy,85.2,Male,NHBCC-831
36,Aging037,13042,U. Pittsburgh Brain Tissue Donation Program,Brodmann area 4,Healthy,46.0,Female,UPBTDP-13042
49,Aging050,13390,U. Pittsburgh Brain Tissue Donation Program,Brodmann area 8,Healthy,33.0,Male,UPBTDP-13390
52,Aging053,1281,NIMH Human Brain Collection Core,Subgenual anterior cingulate cortex,Healthy,24.9,Male,NHBCC-1281


#### merge into info data frame using Aging Sample_id

In [8]:
# attach the sample/donor details to every pooled sample; validate makes the
# merge raise if a Sample_id occurs more than once in sample_df, which would
# otherwise silently duplicate pool rows
info_df = pool_df.merge(sample_df, how='inner', left_on='Sample_no',
                        right_on='Sample_id', validate='many_to_one')

# the inner join drops pooled samples that have no sample record, so report any
unmatched = pool_df.loc[~pool_df['Sample_no'].isin(sample_df['Sample_id']), 'Sample_no']
if not unmatched.empty:
    print(f'WARNING: {len(unmatched)} pooled sample(s) not found in sample info: '
          f'{unmatched.unique().tolist()}')
print(info_df.shape)
display(info_df.sample(10))

(48, 12)


Unnamed: 0,Pool_no,Sample_no,Sequence_type,pool_name,Sample_id,Source_id,Tissue_source,Brain_region,Clinical_diagnosis,Age,Sex,donor_id
30,POOL04,Aging104,RNA,P004,Aging104,1615,NIMH Human Brain Collection Core,Entorhinal cortex,Healthy,71.8,Female,NHBCC-1615
19,POOL03,Aging090,RNA,P003,Aging090,1340,NIMH Human Brain Collection Core,Putamen,Healthy,26.2,Female,NHBCC-1340
45,POOL06,Aging102,RNA,P006,Aging102,1613,NIMH Human Brain Collection Core,Subventricular zone,Healthy,78.0,Male,NHBCC-1613
37,POOL05,Aging072,RNA,P005,Aging072,831,NIMH Human Brain Collection Core,Subventricular zone,Healthy,85.2,Male,NHBCC-831
2,POOL01,Aging080,RNA,P001,Aging080,1187,NIMH Human Brain Collection Core,Middle temporal gyrus,Healthy,21.5,Female,NHBCC-1187
7,POOL01,Aging073,RNA,P001,Aging073,1119,NIMH Human Brain Collection Core,Middle temporal gyrus,Healthy,64.3,Female,NHBCC-1119
3,POOL01,Aging088,RNA,P001,Aging088,1340,NIMH Human Brain Collection Core,Middle temporal gyrus,Healthy,26.2,Female,NHBCC-1340
27,POOL04,Aging089,RNA,P004,Aging089,1340,NIMH Human Brain Collection Core,Entorhinal cortex,Healthy,26.2,Female,NHBCC-1340
15,POOL02,Aging105,RNA,P002,Aging105,1615,NIMH Human Brain Collection Core,Putamen,Healthy,71.8,Female,NHBCC-1615
9,POOL02,Aging076,RNA,P002,Aging076,1137,NIMH Human Brain Collection Core,Middle temporal gyrus,Healthy,68.6,Male,NHBCC-1137


#### save the merged info file

In [9]:
# write the merged table as CSV, leaving out the data frame's row index
info_df.to_csv(path_or_buf=info_file, index=False)