In [1]:
import numpy as np
import os
import pandas as pd

In [2]:
def clean_and_split(file, out_dir='pheno'):
    """Split one tab-separated phenotype table into one file per phenotype.

    The table is read from ``file`` and rows with a missing ``FID`` are
    discarded. For every column from the third one onward, a three-column
    table (``FID``, ``IID``, that column) is written after dropping rows in
    which any of the three values is missing.

    Each output is tab-separated, has a header row and no index column, and
    is named ``<prefix><column>.pheno``, where ``<prefix>`` is the part of
    the input's base name that precedes the first ``'BCD'``.

    Parameters
    ----------
    file : str
        Path of the input table. Only its base name is used to build the
        output names, so the input may live in any directory.
    out_dir : str, default 'pheno'
        Existing directory that receives the ``.pheno`` files.

    Raises
    ------
    ValueError
        If the base name of ``file`` does not contain ``'BCD'``.
    """
    df = pd.read_csv(file, sep='\t').dropna(subset=['FID'])

    # Build the prefix from the base name rather than a fixed character
    # offset, so the result does not depend on the length of the directory.
    base_name = os.path.basename(file)
    marker = base_name.find('BCD')
    if marker == -1:
        raise ValueError("'BCD' not found in file name: " + base_name)
    file_pref = base_name[:marker]

    # NOTE(review): assumes the first two columns are the ID columns, since
    # the loop starts at the third column -- confirm against the input files.
    for pheno in df.columns[2:]:
        subdf = df[['FID', 'IID', pheno]].dropna()
        # The phenotype is always stored as an integer; IID is converted only
        # when pandas parsed it as float64.
        casts = {pheno: int}
        if subdf['IID'].dtype == np.float64:
            casts['IID'] = int
        subdf = subdf.astype(casts)
        out_path = os.path.join(out_dir, file_pref + pheno + '.pheno')
        subdf.to_csv(out_path, sep='\t', index=False)

In [3]:
# Run the splitter on every entry of pheno/ whose name does not end in
# 'pheno'; each processed name is printed first. Names ending in 'pheno' are
# left alone so the '.pheno' files written into this same folder are not
# picked up as inputs.
for file in os.listdir('pheno'):
    if not file.endswith('pheno'):
        print(file)
        clean_and_split(f'pheno/{file}')

11_SATU_C_CAPS4_BCD.txt
12_DEFE_C_CAPS4_BCD.txt
13_NHRV_L_GFK_PCL-5_BCDE.txt
13_NHRV_L_VAKN_PCL4_BCD.txt
17_KSUD_C_PCL-5_BCDE.txt
1_MRSC_C_MAX_CAPS4_BCD.txt
2_ONGA_L_MAX_PCL4_BCD.txt
33_RING_C_T1_PCL4_BCD.txt
43_WACH_C_PCL-5_BCDE.txt
47_GTPC_C_CAPS-5_BCDE.txt
48_BETR_C_CAPS4_BCD.txt
49_SEEP_L_CAPS4_BCD.txt
55_GALL_C_CAPS-5_BCDE.txt
57_BAKE_C_CAPS4_BCD.txt
58_VRIS_C_CAPS-5_BCDE.txt
5_NHS2_L_PCL4_BCD.txt
64_BRLS_C_CAPS-5_BCDE.txt
70_RCSS_C_BL_PCL-5_BCDE.txt
71_DELB_C_PCL-5_BCDE.txt
73_AURO_C_PCL-5_BCDE.txt
76_EHVP_C_CAPS-5_BCDE.txt
87_ONGB_L_MAX_PCL4_BCD.txt


In [4]:
# For every file in pheno/ whose name contains '_b_' or '_d_', count the
# subject rows it holds, then put the two counts side by side per study and
# save the result to PHENO_counts.csv.
b_counts = {'study': [], 'count_PHENO_B': []}
d_counts = {'study': [], 'count_PHENO_D': []}
# sorted() keeps the row order of the CSV independent of the order in which
# the filesystem happens to list the directory.
for file in sorted(os.listdir('pheno')):
    if '_b_' in file or '_d_' in file:
        # everything before CAPS/PCL as study id
        caps_ind = file.upper().find('CAPS')
        pcl_ind = file.upper().find('PCL')
        id_ind = caps_ind if caps_ind != -1 else pcl_ind
        if id_ind < 2:
            # No CAPS/PCL tag, or no room for a study id in front of it:
            # slicing with id_ind-1 would yield a meaningless study id.
            print('skipping ' + file + ': no study id before CAPS/PCL')
            continue
        study = file[:id_ind-1]
        # number of subjects: every line except the header row, which
        # to_csv writes as the first line of each file
        with open('pheno/' + file, 'r') as f:
            subjs = max(sum(1 for _ in f) - 1, 0)
        if '_b_' in file:
            b_counts['study'].append(study)
            b_counts['count_PHENO_B'].append(subjs)
        elif '_d_' in file:
            d_counts['study'].append(study)
            d_counts['count_PHENO_D'].append(subjs)
b_counts = pd.DataFrame(b_counts)
d_counts = pd.DataFrame(d_counts)

# Default (inner) merge: only studies that have both a PHENO_B and a PHENO_D
# count appear in the output.
merged = b_counts.merge(d_counts, on='study')
merged.to_csv('PHENO_counts.csv', index=False)