In [1]:
import pandas as pd

In [11]:
# Location of the pedigree file (child / father / mother / sex per sample).
pedigree_info_url = "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/1kGP.3202_samples.pedigree_info.txt"
# Original (misspelled) name, kept as an alias so existing references keep working.
pidegree_info_url = pedigree_info_url


# Create DataFrame.
# The file is space-delimited. header=0 consumes the file's own header row and
# replaces it with the names given here; only the first four fields are read.
# The ID columns are read as strings explicitly: a later cell compares
# father/mother against the string '0', which must not depend on dtype inference.
df = pd.read_csv(
    pedigree_info_url,
    sep=' ',
    header=0,
    names=['child', 'father', 'mother', 'sex'],
    usecols=[0, 1, 2, 3],
    dtype={'child': str, 'father': str, 'mother': str},
)


In [8]:
# Inspect the pedigree table (columns: child, father, mother, sex).
df

Unnamed: 0,child,father,mother,sex
0,HG00096,0,0,1
1,HG00097,0,0,2
2,HG00099,0,0,2
3,HG00100,0,0,2
4,HG00101,0,0,1
...,...,...,...,...
3197,NA21137,0,0,2
3198,NA21141,0,0,2
3199,NA21142,0,0,2
3200,NA21143,0,0,2


In [13]:
# Keep only samples for which both parents are recorded; the string '0' in the
# father/mother columns marks an unrecorded parent.
df = df[(df['father'] != '0') & (df['mother'] != '0')]

# Identify quads: parent pairs with exactly two children in the table.
# One groupby pass replaces re-filtering the whole frame once per parent pair.
# sort=False yields families in order of first appearance, and rows inside a
# group keep their original order, so child1/child2 are assigned by table order.
quad_columns = ['child1', 'child2', 'father', 'mother', 'sex1', 'sex2']
quad_list = []
for (father, mother), children in df.groupby(['father', 'mother'], sort=False):
    if len(children) != 2:
        # Only-children and families with three or more children are not quads;
        # their children stay in the trio table below.
        continue
    first, second = children.iloc[0], children.iloc[1]
    quad_list.append({
        'child1': first['child'],
        'child2': second['child'],
        'father': father,
        'mother': mother,
        # Sex code 1 is labelled 'M'; any other code is labelled 'F'.
        'sex1': 'M' if first['sex'] == 1 else 'F',
        'sex2': 'M' if second['sex'] == 1 else 'F'
    })

# Create DataFrame for quads. The columns are passed explicitly so the frame has
# the expected schema even when no quad was found (otherwise the column lookup
# below would raise KeyError on an empty, column-less frame).
quad_df = pd.DataFrame(quad_list, columns=quad_columns)

# Exclude quad children from the filtered table; every remaining child forms a trio.
quad_children = quad_df[['child1', 'child2']].to_numpy().ravel()
trios = df[~df['child'].isin(quad_children)]

# Create DataFrame for trios, with the numeric sex code relabelled (1 -> 'M', 2 -> 'F').
# The original row index of each child is preserved.
trio_df = trios[['child', 'father', 'mother']].assign(
    sex=trios['sex'].replace({1: 'M', 2: 'F'})
)

In [14]:
# Inspect the trio table (one row per child, with father, mother and sex label).
trio_df

Unnamed: 0,child,father,mother,sex
187,HG00405,HG00403,HG00404,F
190,HG00408,HG00406,HG00407,F
195,HG00420,HG00418,HG00419,M
198,HG00423,HG00421,HG00422,F
201,HG00429,HG00427,HG00428,M
...,...,...,...,...
2953,NA19983,NA19982,NA19713,F
2957,NA20128,NA20126,NA20127,F
2958,NA20129,NA19920,NA19921,F
2962,NA20279,NA20278,NA20282,M


In [15]:
# Inspect the quad table (one row per parent pair with exactly two children).
quad_df

Unnamed: 0,child1,child2,father,mother,sex1,sex2
0,HG00658,HG00702,HG00656,HG00657,M,F
1,HG03992,HG04036,HG03943,HG03944,M,F
2,HG04204,HG04215,HG03679,HG03642,M,M
3,NA19662,NA19685,NA19661,NA19660,F,M
4,NA19675,NA19680,NA19679,NA19678,F,F


In [17]:
# Write both family tables as tab-separated files, leaving out the index column.
for out_path, table in (('trios.tsv', trio_df), ('quads.tsv', quad_df)):
    table.to_csv(out_path, sep='\t', index=False)