## Notebook to prep info for Pools and Samples

In [1]:
# print the current system date/time via the shell
!date

Sat Jul 17 18:35:19 EDT 2021


#### import libraries and set notebook variables

In [2]:
import pandas as pd

In [3]:
# cohort label; used in the working directory and the output file name
cohort = 'aging'

# directory layout: <home_dir>/<cohort>/sample_info
home_dir = '/labshare/raph/datasets/adrd_neuro'
wrk_dir = f'{home_dir}/{cohort}'
info_dir = f'{wrk_dir}/sample_info'

# input tables
pool_file = f'{info_dir}/pool_info.csv'
sample_file = f'{info_dir}/patient_sample_info.csv'

# output table
info_file = f'{info_dir}/{cohort}.pool_patient_sample_info.csv'

# constants: tissue bank full name -> abbreviation used in donor ids
bank_abbrs = {
    'NIMH Human Brain Collection Core': 'NHBCC',
    'U. Pittsburgh Brain Tissue Donation Program': 'UPBTDP',
    'U. Maryland Brain & Tissue Bank': 'UMBTB',
}

#### read the input data files

In [4]:
# load the pool table (pool id, sample id and sequence type per row),
# then show its dimensions and the first few rows as a sanity check
pool_df = pd.read_csv(filepath_or_buffer=pool_file)
print(pool_df.shape)
display(pool_df.head(n=5))

(48, 3)


Unnamed: 0,Pool_no,Sample_no,Sequence_type
0,POOL01,Aging111,RNA
1,POOL01,Aging095,RNA
2,POOL01,Aging080,RNA
3,POOL01,Aging088,RNA
4,POOL01,Aging099,RNA


In [5]:
# load the patient sample table and show its dimensions
sample_df = pd.read_csv(filepath_or_buffer=sample_file)
print(sample_df.shape)
# optional preview, left disabled:
# display(sample_df.head())

(118, 7)


#### for each pool, create a pool name from the pool id

In [6]:
# strip the 'POOL' text from the pool id and parse what is left as an integer,
# then re-format it as 'P' + zero-padded three digits, e.g. POOL01 -> P001
pool_numbers = pool_df['Pool_no'].str.replace('POOL', '').astype('int32')
pool_df['pool_name'] = pool_numbers.apply(lambda pool_num: f'P{pool_num:03d}')
print(pool_df.shape)
# optional preview, left disabled:
# display(pool_df.sample(10))

(48, 4)


#### for the sample id df create a donor id from the tissue bank and tissue bank source id

In [None]:
# translate each tissue bank's full name to its abbreviation; names that are
# not keys of bank_abbrs come back as missing values
bank_codes = sample_df['Tissue_source'].map(bank_abbrs)

# a missing abbreviation propagates into a missing donor_id, so report any
# unmapped tissue banks instead of letting them pass unnoticed
unmapped_banks = sample_df.loc[bank_codes.isna(), 'Tissue_source'].unique()
if len(unmapped_banks) > 0:
    print(f'WARNING: no abbreviation for tissue source(s): {list(unmapped_banks)}')

# donor id is '<bank abbreviation>-<source id>'
sample_df['donor_id'] = bank_codes + '-' + sample_df['Source_id'].astype('string')
print(sample_df.shape)
# optional preview, left disabled:
# display(sample_df.sample(10))

#### merge into info data frame using Aging Sample_id

In [None]:
# the inner join below drops any pooled sample whose Sample_no has no matching
# Sample_id; list those samples first so the row loss is visible, not silent
has_sample_info = pool_df['Sample_no'].isin(sample_df['Sample_id'])
unmatched_samples = pool_df.loc[~has_sample_info, 'Sample_no'].unique()
if len(unmatched_samples) > 0:
    print(f'WARNING: {len(unmatched_samples)} pooled sample(s) have no sample info: '
          f'{list(unmatched_samples)}')

# join pool rows to their sample info on the sample identifier
info_df = pool_df.merge(sample_df, how='inner', left_on='Sample_no', right_on='Sample_id')
print(info_df.shape)
# optional preview, left disabled:
# display(info_df.sample(10))

#### save the merged info file

In [9]:
# write the merged table to the output csv, omitting the row index
info_df.to_csv(path_or_buf=info_file, index=False)