In [2]:
import pandas as pd
import numpy as np

In [4]:
# Load the annotated CpG-level methylation matrix (one row per CpG position,
# one column per sample plus annotation columns).
df = pd.read_parquet("data/GSE49828_methylation_matrix_annotated.parquet")

# Chromosome labels in karyotype order: autosomes chr1..chr22, then chrX, chrY.
ordered_chrs = [f"chr{label}" for label in [*range(1, 23), "X", "Y"]]

# Store `chr` as an ordered categorical so sorting follows the order above
# instead of lexicographic string order (chr1, chr10, chr11, ...).
# NOTE: pd.Categorical maps any label that is not listed in `categories` to NaN
# (e.g. chrM or unplaced contigs, if the file contains any -- not checked here).
df["chr"] = pd.Categorical(values=df["chr"], categories=ordered_chrs, ordered=True)
df

Unnamed: 0,chr,cpg_pos,1st_PB1,1st_PB2,1st_PB3,2-cell1,2-cell2,2nd_PB1,2nd_PB2,4-cell1,...,Sperm4,TE1,TE2,TE3,Zygote1,Zygote2,region_id,region_type,gene,cpg_context
0,chr1,10609,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea
1,chr1,10617,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea
2,chr1,10620,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea
3,chr1,10631,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea
4,chr1,10867,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5002167,chrY,59357712,,,,,,,,,...,1.000000,,,0.333333,,,gene_body:WASH6P,gene_body,WASH6P,open_sea
5002168,chrY,59357736,,,,,,,,,...,0.666667,,,0.666667,,,gene_body:WASH6P,gene_body,WASH6P,open_sea
5002169,chrY,59357765,,,,,,,,,...,,,,,,,gene_body:AJ271736.10,gene_body,AJ271736.10,open_sea
5002170,chrY,59357786,,,,,,,,,...,,,,,,,gene_body:WASH6P,gene_body,WASH6P,open_sea


In [5]:
# Structural overview of the loaded frame: dimensions, column names, and a
# preview of the first rows (rendered as the cell's rich output).
print("DataFrame shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nFirst few rows:")
df.head(n=5)

DataFrame shape: (5002172, 38)

Columns: ['chr', 'cpg_pos', '1st_PB1', '1st_PB2', '1st_PB3', '2-cell1', '2-cell2', '2nd_PB1', '2nd_PB2', '4-cell1', '4-cell2', '8-cell1', '8-cell2', '8-cell3', 'ICM1', 'ICM2', 'ICM3', 'MII_Oocyte1', 'MII_Oocyte2', 'Morula1', 'Morula2', 'Morula3', 'Postimplantation_embryo1', 'Postimplantation_embryo2', 'Postimplantation_embryo3', 'Sperm1', 'Sperm2', 'Sperm3', 'Sperm4', 'TE1', 'TE2', 'TE3', 'Zygote1', 'Zygote2', 'region_id', 'region_type', 'gene', 'cpg_context']

First few rows:


Unnamed: 0,chr,cpg_pos,1st_PB1,1st_PB2,1st_PB3,2-cell1,2-cell2,2nd_PB1,2nd_PB2,4-cell1,...,Sperm4,TE1,TE2,TE3,Zygote1,Zygote2,region_id,region_type,gene,cpg_context
0,chr1,10609,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea
1,chr1,10617,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea
2,chr1,10620,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea
3,chr1,10631,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea
4,chr1,10867,,,,,,,,,...,,,,,,,promoter:DDX11L1,promoter,DDX11L1,open_sea


In [7]:
# Per-region mean methylation for every sample.
#
# Every column that is not a coordinate/annotation column is treated as a
# sample column, so the sample list follows the frame's own column order.
_annotation_columns = {'chr', 'cpg_pos', 'region_id', 'region_type', 'gene', 'cpg_context'}
sample_columns = [name for name in df.columns if name not in _annotation_columns]

print(f"Sample columns: {sample_columns}")
print(f"Number of sample columns: {len(sample_columns)}")

# GroupBy.mean() skips NaN values inside each group; a region whose CpGs are
# all NaN for a given sample therefore gets NaN for that sample.
# reset_index() turns the region_id group key back into a regular column.
_per_region = df.groupby('region_id')[sample_columns]
region_methylation_summary = _per_region.mean().reset_index()

print(f"\nRegion methylation summary shape: {region_methylation_summary.shape}")
region_methylation_summary.head()

Sample columns: ['1st_PB1', '1st_PB2', '1st_PB3', '2-cell1', '2-cell2', '2nd_PB1', '2nd_PB2', '4-cell1', '4-cell2', '8-cell1', '8-cell2', '8-cell3', 'ICM1', 'ICM2', 'ICM3', 'MII_Oocyte1', 'MII_Oocyte2', 'Morula1', 'Morula2', 'Morula3', 'Postimplantation_embryo1', 'Postimplantation_embryo2', 'Postimplantation_embryo3', 'Sperm1', 'Sperm2', 'Sperm3', 'Sperm4', 'TE1', 'TE2', 'TE3', 'Zygote1', 'Zygote2']
Number of sample columns: 32

Region methylation summary shape: (77932, 33)


Unnamed: 0,region_id,1st_PB1,1st_PB2,1st_PB3,2-cell1,2-cell2,2nd_PB1,2nd_PB2,4-cell1,4-cell2,...,Postimplantation_embryo3,Sperm1,Sperm2,Sperm3,Sperm4,TE1,TE2,TE3,Zygote1,Zygote2
0,gene_body:5S_rRNA,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,0.972222,...,,,1.0,1.0,1.0,,,1.0,1.0,0.5
1,gene_body:7SK,0.4,0.941919,0.641667,,0.469841,,0.270769,0.314286,0.447619,...,1.0,,0.857143,1.0,1.0,0.642857,,0.894,0.894737,
2,gene_body:A1BG,1.0,0.262069,1.0,0.0,0.02381,0.990654,1.0,0.73029,0.266332,...,0.333754,0.0,0.0,0.04902,0.014286,0.12381,0.186275,0.378641,0.31746,0.716981
3,gene_body:A1BG-AS1,0.979699,0.665644,0.963183,0.322511,0.115048,0.989716,1.0,0.622072,0.533105,...,0.452411,0.321039,0.354694,0.386745,0.230126,0.378087,0.312595,0.463619,0.568925,0.744633
4,gene_body:A1CF,,0.75,,,,,,0.395503,,...,0.93523,0.977273,0.925,0.944444,0.987013,,0.771429,,,


In [8]:
# Verify the per-region summary against the CpG-level frame and report
# how much of the summary is missing.
n_unique_regions = df['region_id'].nunique()

print("Original DataFrame:")
print(f"  - Total rows: {len(df):,}")
print(f"  - Unique region_ids: {n_unique_regions:,}")

print("\nGrouped DataFrame (region_methylation_summary):")
print(f"  - Total rows: {len(region_methylation_summary):,}")
print(f"  - Columns: {region_methylation_summary.shape[1]}")

# The summary must contain exactly one row per distinct region_id; fail loudly
# instead of relying on the reader to compare the two printed numbers.
assert len(region_methylation_summary) == n_unique_regions, (
    "region_methylation_summary should have one row per unique region_id"
)

# NaN in the summary is expected, not an error: the group mean skips NaN, so a
# cell is NaN exactly when none of the region's CpGs has a value for that
# sample. The counts below are therefore "regions without data", per sample.
na_counts = region_methylation_summary.isnull().sum()
print("\nNA counts in summary (regions with no non-NA CpG value, per sample column):")
print(na_counts[na_counts > 0])  # Only show columns with NAs

# Leave the preview as the cell's last expression so the notebook renders it
# as a table rather than as wrapped plain text from print().
print("\nFirst few rows of the summary:")
region_methylation_summary.head()

Original DataFrame:
  - Total rows: 5,002,172
  - Unique region_ids: 77,932

Grouped DataFrame (region_methylation_summary):
  - Total rows: 77,932
  - Columns: 33

NA counts in summary (should be mostly 0):
1st_PB1                     16653
1st_PB2                     16904
1st_PB3                     18315
2-cell1                     21285
2-cell2                     17159
2nd_PB1                     22870
2nd_PB2                     18603
4-cell1                     13088
4-cell2                     16354
8-cell1                     15414
8-cell2                     15181
8-cell3                     13790
ICM1                        11416
ICM2                        16650
ICM3                        16229
MII_Oocyte1                 18369
MII_Oocyte2                 18556
Morula1                     15065
Morula2                     16351
Morula3                     17741
Postimplantation_embryo1     7337
Postimplantation_embryo2     7014
Postimplantation_embryo3     7379
Sperm1    

In [10]:
# Show only the regions that have a value in every column (complete cases):
# drop each row (axis=0) that contains at least one NaN (how="any").
# The result is only displayed, not assigned; region_methylation_summary
# itself is left unchanged.
region_methylation_summary.dropna(axis=0, how="any")

Unnamed: 0,region_id,1st_PB1,1st_PB2,1st_PB3,2-cell1,2-cell2,2nd_PB1,2nd_PB2,4-cell1,4-cell2,...,Postimplantation_embryo3,Sperm1,Sperm2,Sperm3,Sperm4,TE1,TE2,TE3,Zygote1,Zygote2
2,gene_body:A1BG,1.000000,0.262069,1.000000,0.000000,0.023810,0.990654,1.000000,0.730290,0.266332,...,0.333754,0.000000,0.000000,0.049020,0.014286,0.123810,0.186275,0.378641,0.317460,0.716981
3,gene_body:A1BG-AS1,0.979699,0.665644,0.963183,0.322511,0.115048,0.989716,1.000000,0.622072,0.533105,...,0.452411,0.321039,0.354694,0.386745,0.230126,0.378087,0.312595,0.463619,0.568925,0.744633
7,gene_body:A2ML1,0.889796,0.931319,0.808739,0.638109,0.562743,0.819634,0.839001,0.549611,0.514520,...,0.933643,0.731364,0.792187,0.678917,0.754684,0.451085,0.568752,0.612398,0.809396,0.642192
8,gene_body:A2ML1-AS1,0.797539,1.000000,0.677237,0.250045,0.666667,0.904762,0.867021,0.593148,0.550346,...,0.913589,0.871813,0.867954,0.849887,0.811925,0.641275,0.678083,0.556845,0.731094,0.558554
9,gene_body:A2MP1,0.095382,0.656283,0.124928,0.207510,0.139519,0.100000,0.319298,0.072720,0.005540,...,0.955919,0.017094,0.240077,0.068504,0.094895,0.015457,0.085784,0.110850,0.017857,0.048436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77919,promoter:hsa-mir-6080,0.158939,0.169672,0.188374,0.018804,0.053523,0.112508,0.200000,0.103717,0.041618,...,0.007155,0.000000,0.004399,0.001983,0.010840,0.012943,0.025098,0.114768,0.099763,0.046512
77923,promoter:snoMe28S-Am2634,0.309492,0.393351,0.044444,0.541667,0.421846,0.218674,0.047619,0.189343,0.250000,...,0.836285,0.954545,0.958137,0.958888,0.982431,0.396871,0.000000,0.094895,0.502630,0.222222
77925,promoter:snoU109,0.514815,0.476190,0.636364,0.658292,0.368278,1.000000,0.737700,0.399259,0.491399,...,0.980652,1.000000,0.996296,0.988235,0.979976,0.336835,0.477621,0.435184,0.780097,0.870985
77926,promoter:snoU13,0.527274,0.581168,0.541298,0.315408,0.377159,0.575488,0.524881,0.409227,0.366356,...,0.746535,0.678177,0.650981,0.658876,0.646550,0.311454,0.351593,0.373007,0.497522,0.444024
