Exploration notebook for the PP data

In [1]:
import polars as pl
from pathlib import Path
from pycomfort import files
from pycomfort.files import tprint
from eliot import start_action

In [2]:
# Pick the base directory: when the kernel's working directory has "notebooks"
# in its name, step up one level; otherwise use the working directory itself.
_cwd = Path.cwd()
base = _cwd if "notebooks" not in _cwd.name else _cwd.parent
base

PosixPath('/home/antonkulaga/sources/pdb-mcp')

In [3]:
# Data directories under `base`.
# NOTE: `input` shadows the Python builtin of the same name for the rest of
# the notebook; the name is kept because other cells refer to it.
data = base.joinpath("data")
input, output = data / "input", data / "output"

In [4]:
# Print the directory tree found under the input data folder.
tprint(input)

input
	uniprot
		idmapping_selected.tab
		.gitignore
	anage
		anage_data.txt
	pdb
		uniprot_segments_observed.tsv.gz
		pdb_chain_scop2b_sf_uniprot.tsv.gz
		pdb_chain_scop_uniprot.tsv.gz
		pdb_chain_pfam.tsv.gz
		pdb_chain_enzyme.tsv.gz
		pdb_chain_taxonomy.tsv.gz
		pdb_chain_cath_uniprot.tsv.gz
		pdb_chain_hmmer.tsv.gz
		pdb_chain_uniprot.tsv.gz
		pdb_chain_scop2_uniprot.tsv.gz
		.gitignore
		uniprot_pdb.tsv.gz
		pdb_chain_go.tsv.gz
		pdb_chain_interpro.tsv.gz
		pdb_pfam_mapping.tsv.gz
		pdb_chain_ensembl.tsv.gz
		pdb_pubmed.tsv.gz


In [5]:
# Print the directory tree found under the output data folder.
tprint(output)

output
	PP.csv
	PRNA.jsonl.gz
	.~lock.PP.csv#
	Ppeptide.csv
	PP.jsonl.gz
	PP_embeddings.parquet
	Ppeptide.jsonl.gz
	PP_extended.parquet
	PRNA.csv
	PP_advanced.parquet
	.gitignore
	idmapping_selected.parquet
	protein_names.parquet
	PP_graph_embeddings.parquet


In [6]:
# Scan the PP table (CSV) lazily; only the previewed rows are materialised.
pp_path = output.joinpath("PP.csv")
pp = pl.scan_csv(source=pp_path)
pp.head(5).collect()

entry_id,pdb_id,chain_id,protein_name,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids
str,str,str,str,str,str,i64,str,f64,str,str,str
"""2kn7_1_A_B""","""2kn7""","""A""","""N/A (from TSV)""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""Q92889"""
"""8bb7_2_A_B""","""8bb7""","""A""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""Q8CJH3"""
"""8bb7_2_A_B""","""8bb7""","""B""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""Q8CJH3"""
"""5y4t_1_A_B""","""5y4t""","""A""","""N/A (from TSV)""","""Baker's yeast""","""Baker's yeast""",307796,"""Saccharomycetes""",0.04,"""Fungi""","""Ascomycota""","""A6ZY62"""
"""5bow_2_A_B""","""5bow""","""A""","""N/A (from TSV)""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""Q8WWZ1"""


In [7]:
# Scan the id-mapping table (parquet) lazily and preview its first rows.
mapping_parquet = output.joinpath("idmapping_selected.parquet")
mapping = pl.scan_parquet(source=mapping_parquet)
mapping.head(5).collect()


UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed
str,str,str,str,list[str],str,list[str],str,str,str,str,str,i64,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""Q6GZX4""","""001R_FRG3G""",,"""YP_031579.1""","[""81941549"", "" 49237298""]",,"[""GO:0046782""]","""UniRef100_Q6GZX4""","""UniRef90_Q6GZX4""","""UniRef50_Q6GZX4""","""UPI00003B0FD4""",,654924,,,"[""15165820""]","[""AY548484""]","[""AAT09660.1""]",,,,
"""Q6GZX3""","""002L_FRG3G""",,"""YP_031580.1""","[""49237299"", "" 81941548""]",,"[""GO:0033644"", "" GO:0016020""]","""UniRef100_Q6GZX3""","""UniRef90_Q6GZX3""","""UniRef50_Q6GZX3""","""UPI00003B0FD5""",,654924,,,"[""15165820""]","[""AY548484""]","[""AAT09661.1""]",,,,
"""Q197F8""","""002R_IIV3""",,"""YP_654574.1""","[""109287880"", "" 123808694"", "" 106073503""]",,,"""UniRef100_Q197F8""","""UniRef90_Q197F8""","""UniRef50_Q197F8""","""UPI0000D83464""",,345201,,,"[""16912294""]","[""DQ643392""]","[""ABF82032.1""]",,,,
"""Q197F7""","""003L_IIV3""",,"""YP_654575.1""","[""106073504"", "" 109287881"", "" 123808693""]",,,"""UniRef100_Q197F7""","""UniRef90_Q197F7""","""UniRef50_Q197F7""","""UPI0000D83465""",,345201,,,"[""16912294""]","[""DQ643392""]","[""ABF82033.1""]",,,,
"""Q6GZX2""","""003R_FRG3G""",,"""YP_031581.1""","[""81941547"", "" 49237300""]",,,"""UniRef100_Q6GZX2""","""UniRef90_Q6GZX2""","""UniRef50_Q6GZX2""","""UPI00003B0FD6""",,654924,,,"[""15165820""]","[""AY548484""]","[""AAT09662.1""]",,,,


In [8]:
# Attach the id-mapping record to every PP row. The mapping frame gets a copy
# of `UniProtKB_AC` named `uniprot_ids` so both sides share the join key, and
# both columns remain in the result. The join is an inner join (the polars
# default), so PP rows whose `uniprot_ids` value has no exact match in the
# mapping are dropped.
pp_extended = pp.join(
    mapping.with_columns(uniprot_ids=pl.col("UniProtKB_AC")),
    on="uniprot_ids",
    how="inner",
)

In [9]:
# Execute the lazy PP x id-mapping join and write the result to
# PP_extended.parquet in the output folder.
pp_extended.sink_parquet(output / "PP_extended.parquet")

In [13]:
# Re-open the stored extended table lazily. `entry_id` is exposed as `id`,
# which is the key name the embeddings table is joined on further down.
pp_load = pl.scan_parquet(source=output.joinpath("PP_extended.parquet")).rename(
    {"entry_id": "id"}
)
pp_load.head(5).collect()


id,pdb_id,chain_id,protein_name,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids,UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed
str,str,str,str,str,str,i64,str,f64,str,str,str,str,str,str,str,list[str],str,list[str],str,str,str,str,str,i64,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""7um4_1_A_B""","""7um4""","""B""","""N/A (from TSV)""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""Q9V2J8""","""Q9V2J8""","""Q9V2J8_PYRAB""",,"""WP_010867201.1""","[""83754399"", "" 283806864"", … "" 83754400""]","""2BFW:A; 2BIS:A; 2BIS:B; 2BIS:C…","[""GO:0004373""]","""UniRef100_Q9V2J8""","""UniRef90_Q9V2J8""","""UniRef50_C5A4D7""","""UPI00000346E2""","""A75194""",272844,,,"[""10736225"", "" 11381026"", … "" 39695122""]","[""AJ248283"", "" HE613800""]","[""CAB49000.1"", "" CCE69451.1""]",,,,
"""1gl2_1_A_C""","""1gl2""","""C""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""O88384""","""O88384""","""VTI1B_MOUSE""",,,"[""13124606"", "" 18655560"", … "" 3213229""]","""1GL2:C; 2QYW:A""","[""GO:0031901"", "" GO:0005794"", … "" GO:0016192""]","""UniRef100_O88384""","""UniRef90_O88384""","""UniRef50_O88384""","""UPI0000029675""",,10090,,,"[""9553086"", "" 15363411"", … "" 11786915""]","[""AF035208""]","[""AAC23483.1""]",,,,"[""12861006"", "" 15640147"", … "" 23258225""]"
"""1gl2_1_C_B""","""1gl2""","""C""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""O88384""","""O88384""","""VTI1B_MOUSE""",,,"[""13124606"", "" 18655560"", … "" 3213229""]","""1GL2:C; 2QYW:A""","[""GO:0031901"", "" GO:0005794"", … "" GO:0016192""]","""UniRef100_O88384""","""UniRef90_O88384""","""UniRef50_O88384""","""UPI0000029675""",,10090,,,"[""9553086"", "" 15363411"", … "" 11786915""]","[""AF035208""]","[""AAC23483.1""]",,,,"[""12861006"", "" 15640147"", … "" 23258225""]"
"""1gl2_1_D_C""","""1gl2""","""C""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""O88384""","""O88384""","""VTI1B_MOUSE""",,,"[""13124606"", "" 18655560"", … "" 3213229""]","""1GL2:C; 2QYW:A""","[""GO:0031901"", "" GO:0005794"", … "" GO:0016192""]","""UniRef100_O88384""","""UniRef90_O88384""","""UniRef50_O88384""","""UPI0000029675""",,10090,,,"[""9553086"", "" 15363411"", … "" 11786915""]","[""AF035208""]","[""AAC23483.1""]",,,,"[""12861006"", "" 15640147"", … "" 23258225""]"
"""7xrb_1_A_B""","""7xrb""","""A""","""N/A (from TSV)""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""P49842""","""P49842""","""WHR1_HUMAN""","""8859""","""NP_004188.1; NP_115830.1""","[""2347132"", "" 4759180"", … "" 825661""]","""7XRB:A; 7XRB:B; 8YCM:A; 8YCM:B…","[""GO:0005737"", "" GO:0016607"", … "" GO:1903518""]","""UniRef100_P49842""","""UniRef90_P49842""","""UniRef50_P49842""","""UPI000002B442""","""B53439""",9606,"""604977""",,"[""8012361"", "" 8132574"", … "" 17344846""]","[""X77386"", "" X77474"", … "" U24578""]","[""CAA54565.1"", "" CAA54622.1"", … "" AAA99716.1""]","[""ENSG00000204344.16"", "" ENSG00000206342.12"", … "" ENSG00000226033.11""]","[""ENST00000375331.8"", "" ENST00000375333.4"", … "" ENST00000685781.1""]","[""ENSP00000364480.4"", "" ENSP00000364482.4"", … "" ENSP00000509445.1""]","[""19423540"", "" 19851445"", … "" 32531245""]"


In [14]:
# Scan the embeddings file lazily, keeping only the key and the embedding
# vector, and show a single row as a sanity check.
embeddings = pl.scan_parquet(source=output.joinpath("PP_embeddings.parquet")).select(
    "id", "graph_embedding"
)
embeddings.head(1).collect()

id,graph_embedding
str,list[f64]
"""2kn7_1_A_B""","[0.000052, 0.000035, … 0.000145]"


In [15]:
# Add the graph embedding to each extended PP row, matching on `id`.
# This is an inner join, so rows without an embedding are dropped (the counts
# recorded further down go from 59917 rows to 59385).
graph_embeddings = pp_load.join(embeddings, on="id", how="inner")
graph_embeddings.head(5).collect()

id,pdb_id,chain_id,protein_name,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids,UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed,graph_embedding
str,str,str,str,str,str,i64,str,f64,str,str,str,str,str,str,str,list[str],str,list[str],str,str,str,str,str,i64,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[f64]
"""7um4_1_A_B""","""7um4""","""B""","""N/A (from TSV)""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""Q9V2J8""","""Q9V2J8""","""Q9V2J8_PYRAB""",,"""WP_010867201.1""","[""83754399"", "" 283806864"", … "" 83754400""]","""2BFW:A; 2BIS:A; 2BIS:B; 2BIS:C…","[""GO:0004373""]","""UniRef100_Q9V2J8""","""UniRef90_Q9V2J8""","""UniRef50_C5A4D7""","""UPI00000346E2""","""A75194""",272844,,,"[""10736225"", "" 11381026"", … "" 39695122""]","[""AJ248283"", "" HE613800""]","[""CAB49000.1"", "" CCE69451.1""]",,,,,"[-0.000011, -0.000071, … -0.000207]"
"""1gl2_1_A_C""","""1gl2""","""C""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""O88384""","""O88384""","""VTI1B_MOUSE""",,,"[""13124606"", "" 18655560"", … "" 3213229""]","""1GL2:C; 2QYW:A""","[""GO:0031901"", "" GO:0005794"", … "" GO:0016192""]","""UniRef100_O88384""","""UniRef90_O88384""","""UniRef50_O88384""","""UPI0000029675""",,10090,,,"[""9553086"", "" 15363411"", … "" 11786915""]","[""AF035208""]","[""AAC23483.1""]",,,,"[""12861006"", "" 15640147"", … "" 23258225""]","[-0.000017, -0.00007, … -0.000194]"
"""1gl2_1_C_B""","""1gl2""","""C""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""O88384""","""O88384""","""VTI1B_MOUSE""",,,"[""13124606"", "" 18655560"", … "" 3213229""]","""1GL2:C; 2QYW:A""","[""GO:0031901"", "" GO:0005794"", … "" GO:0016192""]","""UniRef100_O88384""","""UniRef90_O88384""","""UniRef50_O88384""","""UPI0000029675""",,10090,,,"[""9553086"", "" 15363411"", … "" 11786915""]","[""AF035208""]","[""AAC23483.1""]",,,,"[""12861006"", "" 15640147"", … "" 23258225""]","[0.000041, 0.000043, … 0.000209]"
"""1gl2_1_D_C""","""1gl2""","""C""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""O88384""","""O88384""","""VTI1B_MOUSE""",,,"[""13124606"", "" 18655560"", … "" 3213229""]","""1GL2:C; 2QYW:A""","[""GO:0031901"", "" GO:0005794"", … "" GO:0016192""]","""UniRef100_O88384""","""UniRef90_O88384""","""UniRef50_O88384""","""UPI0000029675""",,10090,,,"[""9553086"", "" 15363411"", … "" 11786915""]","[""AF035208""]","[""AAC23483.1""]",,,,"[""12861006"", "" 15640147"", … "" 23258225""]","[0.000034, 0.000033, … 0.000201]"
"""7xrb_1_A_B""","""7xrb""","""A""","""N/A (from TSV)""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""P49842""","""P49842""","""WHR1_HUMAN""","""8859""","""NP_004188.1; NP_115830.1""","[""2347132"", "" 4759180"", … "" 825661""]","""7XRB:A; 7XRB:B; 8YCM:A; 8YCM:B…","[""GO:0005737"", "" GO:0016607"", … "" GO:1903518""]","""UniRef100_P49842""","""UniRef90_P49842""","""UniRef50_P49842""","""UPI000002B442""","""B53439""",9606,"""604977""",,"[""8012361"", "" 8132574"", … "" 17344846""]","[""X77386"", "" X77474"", … "" U24578""]","[""CAA54565.1"", "" CAA54622.1"", … "" AAA99716.1""]","[""ENSG00000204344.16"", "" ENSG00000206342.12"", … "" ENSG00000226033.11""]","[""ENST00000375331.8"", "" ENST00000375333.4"", … "" ENST00000685781.1""]","[""ENSP00000364480.4"", "" ENSP00000364482.4"", … "" ENSP00000509445.1""]","[""19423540"", "" 19851445"", … "" 32531245""]","[0.000053, 0.000033, … 0.000146]"


In [16]:
# Execute the lazy join with the embeddings and write the result to
# PP_graph_embeddings.parquet in the output folder.
graph_embeddings.sink_parquet(output / "PP_graph_embeddings.parquet")

In [17]:
# Per-column value counts for the extended table; columns with missing values
# report fewer entries than the row total.
pp_load.count().collect()

id,pdb_id,chain_id,protein_name,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids,UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
59917,59917,59917,59917,59917,59917,59917,59917,59779,59917,59917,59917,59917,59917,55936,57248,59812,59808,59611,59917,59917,59917,59917,39714,59917,31595,0,59859,59200,59200,37695,37695,37695,57275


In [18]:
# Per-column value counts after joining the embeddings; compare with the
# counts of `pp_load` to see how many rows the join removed.
graph_embeddings.count().collect()

id,pdb_id,chain_id,protein_name,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids,UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed,graph_embedding
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
59385,59385,59385,59385,59385,59385,59385,59385,59247,59385,59385,59385,59385,59385,55440,56734,59288,59276,59091,59385,59385,59385,59385,39365,59385,31344,0,59327,58676,58676,37404,37404,37404,56783,59385


In [30]:
# Write PP_graph_embeddings.parquet again, this time without the
# `protein_name` column; this replaces the file written by the earlier sink
# cell. The column is dropped by name because `drop` is documented to take
# column names or selectors, not `pl.col(...)` expressions.
graph_embeddings.drop("protein_name").sink_parquet(output / "PP_graph_embeddings.parquet")

In [6]:
# Load the advanced table eagerly (a DataFrame, not a lazy query) and preview
# it. PP_advanced.parquet is not written by any cell visible in this notebook.
pp_advanced = pl.read_parquet(source=output.joinpath("PP_advanced.parquet"))
pp_advanced.head(5)

id,pdb_id,chain_id,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids,UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed,graph_embedding,uniprot_protein_names,uniprot_gene_names,uniprot_organism_name,primary_gene_name,ensembl_id
str,str,str,str,str,i64,str,f64,str,str,str,str,str,str,str,list[str],str,list[str],str,str,str,str,str,i64,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[f64],str,str,str,str,str
"""2vhh_1_G_D""","""2vhh""","""D""","""Drosophila melangaster""","""Fruit fly""",7227,"""Insecta""",0.3,"""Animalia""","""Arthropoda""","""Q9VI04""","""Q9VI04""","""Q9VI04_DROME""","""40916""","""NP_649732.1""","[""21358471"", "" 170785054"", … "" 170785060""]","""2VHH:A; 2VHH:B; 2VHH:C; 2VHH:D…","[""GO:0005737"", "" GO:0003837"", … "" GO:0006208""]","""UniRef100_Q9VI04""","""UniRef90_Q9VI04""","""UniRef50_Q8H183""","""UPI000007A458""",,7227,,,"[""10731132"", "" 11454750"", … "" 25589440""]","[""AE014297"", "" AF333187""]","[""AAF54141.2"", "" AAK60520.1""]",,,,"[""14630943"", "" 16361227"", … "" 28562601""]","[0.000061, 0.000042, … 0.000181]",,,,"""pyd3""",
"""2vhh_1_D_C""","""2vhh""","""D""","""Drosophila melangaster""","""Fruit fly""",7227,"""Insecta""",0.3,"""Animalia""","""Arthropoda""","""Q9VI04""","""Q9VI04""","""Q9VI04_DROME""","""40916""","""NP_649732.1""","[""21358471"", "" 170785054"", … "" 170785060""]","""2VHH:A; 2VHH:B; 2VHH:C; 2VHH:D…","[""GO:0005737"", "" GO:0003837"", … "" GO:0006208""]","""UniRef100_Q9VI04""","""UniRef90_Q9VI04""","""UniRef50_Q8H183""","""UPI000007A458""",,7227,,,"[""10731132"", "" 11454750"", … "" 25589440""]","[""AE014297"", "" AF333187""]","[""AAF54141.2"", "" AAK60520.1""]",,,,"[""14630943"", "" 16361227"", … "" 28562601""]","[0.000055, 0.000057, … 0.000212]",,,,"""pyd3""",
"""2vhh_1_D_C""","""2vhh""","""C""","""Drosophila melangaster""","""Fruit fly""",7227,"""Insecta""",0.3,"""Animalia""","""Arthropoda""","""Q9VI04""","""Q9VI04""","""Q9VI04_DROME""","""40916""","""NP_649732.1""","[""21358471"", "" 170785054"", … "" 170785060""]","""2VHH:A; 2VHH:B; 2VHH:C; 2VHH:D…","[""GO:0005737"", "" GO:0003837"", … "" GO:0006208""]","""UniRef100_Q9VI04""","""UniRef90_Q9VI04""","""UniRef50_Q8H183""","""UPI000007A458""",,7227,,,"[""10731132"", "" 11454750"", … "" 25589440""]","[""AE014297"", "" AF333187""]","[""AAF54141.2"", "" AAK60520.1""]",,,,"[""14630943"", "" 16361227"", … "" 28562601""]","[0.000055, 0.000057, … 0.000212]",,,,"""pyd3""",
"""5noo_1_A_B""","""5noo""","""A""","""Caenorhabditis elegans""","""Roundworm""",6239,"""Chromadorea""",0.16,"""Animalia""","""Nematoda""","""Q9Y052""","""Q9Y052""","""Q9Y052_CAEEL""",,,"[""757819114"", "" 576865043"", … "" 4039150""]","""4IQB:A; 4IQB:B; 4IRR:A; 4IRR:B…","[""GO:0004799"", "" GO:0006231"", … "" GO:0032259""]","""UniRef100_Q9Y052""","""UniRef90_Q9Y052""","""UniRef50_A0A3B5LXK2""","""UPI0000081F72""","""T52178""",6239,,,"[""24321279"", "" 28826032""]","[""AF099673""]","[""AAC97508.1""]",,,,,"[0.000055, 0.000052, … 0.000191]",,,,,
"""5noo_1_A_B""","""5noo""","""B""","""Caenorhabditis elegans""","""Roundworm""",6239,"""Chromadorea""",0.16,"""Animalia""","""Nematoda""","""Q9Y052""","""Q9Y052""","""Q9Y052_CAEEL""",,,"[""757819114"", "" 576865043"", … "" 4039150""]","""4IQB:A; 4IQB:B; 4IRR:A; 4IRR:B…","[""GO:0004799"", "" GO:0006231"", … "" GO:0032259""]","""UniRef100_Q9Y052""","""UniRef90_Q9Y052""","""UniRef50_A0A3B5LXK2""","""UPI0000081F72""","""T52178""",6239,,,"[""24321279"", "" 28826032""]","[""AF099673""]","[""AAC97508.1""]",,,,,"[0.000055, 0.000052, … 0.000191]",,,,,


In [10]:
# Keep only the rows that carry a UniProt protein-name annotation.
adv = pp_advanced.filter(~pl.col("uniprot_protein_names").is_null())
adv

id,pdb_id,chain_id,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids,UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed,graph_embedding,uniprot_protein_names,uniprot_gene_names,uniprot_organism_name,primary_gene_name,ensembl_id
str,str,str,str,str,i64,str,f64,str,str,str,str,str,str,str,list[str],str,list[str],str,str,str,str,str,i64,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[f64],str,str,str,str,str
"""6v9w_1_A_B""","""6v9w""","""A""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""O75762""","""O75762""","""TRPA1_HUMAN""","""8989""","""NP_015628.2; XP_011515927.1; X…","[""806393917"", "" 806393916"", … "" 3287188""]","""3J9P:A; 3J9P:B; 3J9P:C; 3J9P:D…","[""GO:0016324"", "" GO:0030424"", … "" GO:0014832""]","""UniRef100_O75762""","""UniRef90_O75762""","""UniRef50_O75762""","""UPI000021081A""",,9606,"""604775; 615040""",,"[""10066796"", "" 16421571"", … "" 20547126""]","[""Y10601"", "" AC022867""]","[""CAA71610.1"", "" -""]","[""ENSG00000104321.11""]","[""ENST00000262209.5""]","[""ENSP00000262209.4""]","[""14712238"", "" 16500080"", … "" 20716668""]","[0.000044, 0.000054, … 0.000227]","""Transient receptor potential c…","""TRPA1 ANKTM1""","""Homo sapiens (Human)""","""TRPA1""","""ENSG00000104321.11"""
"""6v9w_1_A_B""","""6v9w""","""B""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""O75762""","""O75762""","""TRPA1_HUMAN""","""8989""","""NP_015628.2; XP_011515927.1; X…","[""806393917"", "" 806393916"", … "" 3287188""]","""3J9P:A; 3J9P:B; 3J9P:C; 3J9P:D…","[""GO:0016324"", "" GO:0030424"", … "" GO:0014832""]","""UniRef100_O75762""","""UniRef90_O75762""","""UniRef50_O75762""","""UPI000021081A""",,9606,"""604775; 615040""",,"[""10066796"", "" 16421571"", … "" 20547126""]","[""Y10601"", "" AC022867""]","[""CAA71610.1"", "" -""]","[""ENSG00000104321.11""]","[""ENST00000262209.5""]","[""ENSP00000262209.4""]","[""14712238"", "" 16500080"", … "" 20716668""]","[0.000044, 0.000054, … 0.000227]","""Transient receptor potential c…","""TRPA1 ANKTM1""","""Homo sapiens (Human)""","""TRPA1""","""ENSG00000104321.11"""
"""6zfx_1_G_H""","""6zfx""","""G""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""Q6SZW1""","""Q6SZW1""","""SARM1_HUMAN""","""23098""","""NP_055892.2""","[""83288284"", "" 7711002"", … "" 154090976""]","""6O0Q:A; 6O0Q:B; 6O0R:A; 6O0R:B…","[""GO:0030424"", "" GO:0009986"", … "" GO:0007165""]","""UniRef100_Q6SZW1""","""UniRef90_Q6SZW1""","""UniRef50_Q6SZW1""","""UPI000021FB54""",,9606,"""607732""",,"[""11386760"", "" 9628581"", … "" 33053563""]","[""AJ290445"", "" AY444166"", "" AB011096""]","[""CAB90355.1"", "" AAR17520.1"", "" BAA25450.1""]","[""ENSG00000004139.14""]","[""ENST00000585482.6""]","[""ENSP00000468032.2""]","[""15893701"", "" 17804407"", … "" 16306936""]","[0.000058, 0.000053, … 0.0002]","""NAD(+) hydrolase SARM1 (NADase…","""SARM1 KIAA0524 SAMD2 SARM""","""Homo sapiens (Human)""","""SARM1""","""ENSG00000004139.14"""
"""6zfx_1_G_H""","""6zfx""","""H""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""Q6SZW1""","""Q6SZW1""","""SARM1_HUMAN""","""23098""","""NP_055892.2""","[""83288284"", "" 7711002"", … "" 154090976""]","""6O0Q:A; 6O0Q:B; 6O0R:A; 6O0R:B…","[""GO:0030424"", "" GO:0009986"", … "" GO:0007165""]","""UniRef100_Q6SZW1""","""UniRef90_Q6SZW1""","""UniRef50_Q6SZW1""","""UPI000021FB54""",,9606,"""607732""",,"[""11386760"", "" 9628581"", … "" 33053563""]","[""AJ290445"", "" AY444166"", "" AB011096""]","[""CAB90355.1"", "" AAR17520.1"", "" BAA25450.1""]","[""ENSG00000004139.14""]","[""ENST00000585482.6""]","[""ENSP00000468032.2""]","[""15893701"", "" 17804407"", … "" 16306936""]","[0.000058, 0.000053, … 0.0002]","""NAD(+) hydrolase SARM1 (NADase…","""SARM1 KIAA0524 SAMD2 SARM""","""Homo sapiens (Human)""","""SARM1""","""ENSG00000004139.14"""
"""6o0t_1_E_F""","""6o0t""","""E""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""Q6SZW1""","""Q6SZW1""","""SARM1_HUMAN""","""23098""","""NP_055892.2""","[""83288284"", "" 7711002"", … "" 154090976""]","""6O0Q:A; 6O0Q:B; 6O0R:A; 6O0R:B…","[""GO:0030424"", "" GO:0009986"", … "" GO:0007165""]","""UniRef100_Q6SZW1""","""UniRef90_Q6SZW1""","""UniRef50_Q6SZW1""","""UPI000021FB54""",,9606,"""607732""",,"[""11386760"", "" 9628581"", … "" 33053563""]","[""AJ290445"", "" AY444166"", "" AB011096""]","[""CAB90355.1"", "" AAR17520.1"", "" BAA25450.1""]","[""ENSG00000004139.14""]","[""ENST00000585482.6""]","[""ENSP00000468032.2""]","[""15893701"", "" 17804407"", … "" 16306936""]","[0.000053, 0.000037, … 0.000169]","""NAD(+) hydrolase SARM1 (NADase…","""SARM1 KIAA0524 SAMD2 SARM""","""Homo sapiens (Human)""","""SARM1""","""ENSG00000004139.14"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""6ud0_1_C_B""","""6ud0""","""B""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""P0CG47""","""P0CG47""","""UBB_HUMAN""","""7314""","""NP_001268645.1; NP_001268646.1…","[""931139757"", "" 955264916"", … "" 757818933""]","""2KHW:B; 2MBB:B; 2MRO:A; 2MSG:A…","[""GO:0005737"", "" GO:0005829"", … "" GO:0072520""]","""UniRef100_P0CG47""","""UniRef90_P0CG47""","""UniRef50_P0CG47""","""UPI0000002146""","""A26437""",9606,"""191339""",,"[""3029682"", "" 14745543"", … "" 26416742""]","[""X04803"", "" AB089617"", … "" BC046123""]","[""CAA28495.1"", "" BAC56955.1"", … "" AAH46123.1""]","[""ENSG00000170315.14""]","[""ENST00000302182.8"", "" ENST00000395837.1"", … "" ENST00000614404.1""]","[""ENSP00000304697.3"", "" ENSP00000379178.1"", … "" ENSP00000478771.1""]","[""12055595"", "" 12871580"", … "" 37724023""]","[-6.9931e-8, -0.000059, … -0.000153]","""Polyubiquitin-B [Cleaved into:…","""UBB""","""Homo sapiens (Human)""","""UBB""","""ENSG00000170315.14"""
"""7ojx_1_A_C""","""7ojx""","""C""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""P0CG47""","""P0CG47""","""UBB_HUMAN""","""7314""","""NP_001268645.1; NP_001268646.1…","[""931139757"", "" 955264916"", … "" 757818933""]","""2KHW:B; 2MBB:B; 2MRO:A; 2MSG:A…","[""GO:0005737"", "" GO:0005829"", … "" GO:0072520""]","""UniRef100_P0CG47""","""UniRef90_P0CG47""","""UniRef50_P0CG47""","""UPI0000002146""","""A26437""",9606,"""191339""",,"[""3029682"", "" 14745543"", … "" 26416742""]","[""X04803"", "" AB089617"", … "" BC046123""]","[""CAA28495.1"", "" BAC56955.1"", … "" AAH46123.1""]","[""ENSG00000170315.14""]","[""ENST00000302182.8"", "" ENST00000395837.1"", … "" ENST00000614404.1""]","[""ENSP00000304697.3"", "" ENSP00000379178.1"", … "" ENSP00000478771.1""]","[""12055595"", "" 12871580"", … "" 37724023""]","[-8.5373e-7, -0.000051, … -0.000071]","""Polyubiquitin-B [Cleaved into:…","""UBB""","""Homo sapiens (Human)""","""UBB""","""ENSG00000170315.14"""
"""5jtv_1_A_B""","""5jtv""","""B""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""P0CG47""","""P0CG47""","""UBB_HUMAN""","""7314""","""NP_001268645.1; NP_001268646.1…","[""931139757"", "" 955264916"", … "" 757818933""]","""2KHW:B; 2MBB:B; 2MRO:A; 2MSG:A…","[""GO:0005737"", "" GO:0005829"", … "" GO:0072520""]","""UniRef100_P0CG47""","""UniRef90_P0CG47""","""UniRef50_P0CG47""","""UPI0000002146""","""A26437""",9606,"""191339""",,"[""3029682"", "" 14745543"", … "" 26416742""]","[""X04803"", "" AB089617"", … "" BC046123""]","[""CAA28495.1"", "" BAC56955.1"", … "" AAH46123.1""]","[""ENSG00000170315.14""]","[""ENST00000302182.8"", "" ENST00000395837.1"", … "" ENST00000614404.1""]","[""ENSP00000304697.3"", "" ENSP00000379178.1"", … "" ENSP00000478771.1""]","[""12055595"", "" 12871580"", … "" 37724023""]","[0.000056, 0.000052, … 0.00022]","""Polyubiquitin-B [Cleaved into:…","""UBB""","""Homo sapiens (Human)""","""UBB""","""ENSG00000170315.14"""
"""8blo_1_A_C""","""8blo""","""A""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""Q15849""","""Q15849""","""UT2_HUMAN""","""8170""","""NP_001229621.1; NP_001358248.1…","[""2499093"", "" 158258276"", … "" 1034604811""]","""8BLO:A; 8BLO:B; 8BLO:C; 8XD7:A…","[""GO:0016324"", "" GO:0016020"", … "" GO:0015840""]","""UniRef100_Q15849""","""UniRef90_Q15849""","""UniRef50_Q15849""","""UPI000013CE99""","""S71339""",9606,"""601611""",,"[""8647271"", "" 11502588"", … "" 17702749""]","[""X96969"", "" AF349446"", … "" BC110446""]","[""CAA65657.1"", "" AAL08485.1"", … "" AAI10447.1""]","[""ENSG00000132874.15""]","[""ENST00000255226.11"", "" ENST00000586448.5""]","[""ENSP00000255226.5"", "" ENSP00000465953.1""]","[""11590132"", "" 17344938"", … "" 7989337""]","[0.000044, 0.000044, … 0.0002]","""Urea transporter 2 (Solute car…","""SLC14A2 HUT2 UT2""","""Homo sapiens (Human)""","""SLC14A2""","""ENSG00000132874.15"""


In [12]:
# Rows whose UniProt protein-name text contains the pattern "IGF1".
# NOTE(review): the recorded output is empty. Gene symbols appear to live in
# `uniprot_gene_names` / `primary_gene_name` rather than in this column, so
# confirm which column was meant to be searched.
adv.filter(pl.col("uniprot_protein_names").str.contains("IGF1"))

id,pdb_id,chain_id,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids,UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed,graph_embedding,uniprot_protein_names,uniprot_gene_names,uniprot_organism_name,primary_gene_name,ensembl_id
str,str,str,str,str,i64,str,f64,str,str,str,str,str,str,str,list[str],str,list[str],str,str,str,str,str,i64,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[f64],str,str,str,str,str


In [35]:
# Per-column value counts for the PP x id-mapping join.
# NOTE(review): `pp_extended` is still a lazy query, so collecting here
# re-executes the whole join; the recorded output is empty, so confirm this
# cell actually completes.
pp_extended.count().collect()

: 

In [34]:
# Per-column value counts for the advanced table.
# `pp_advanced` is loaded with `pl.read_parquet`, i.e. as an eager DataFrame,
# which has no `.collect()`. `.lazy()` is available on both eager and lazy
# frames, so the cell runs whichever way the table was loaded.
pp_advanced.lazy().count().collect()

id,pdb_id,chain_id,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids,UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed,graph_embedding,uniprot_protein_names,uniprot_gene_names,uniprot_organism_name,primary_gene_name,ensembl_id
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
113974,113974,113974,113974,113974,113974,113974,113832,113974,113974,113974,113974,113974,109794,111092,113877,113857,113680,113974,113974,113974,113974,91991,113974,80003,0,113910,113215,113215,90218,90218,90218,111023,113974,62359,62323,62359,113172,90218


In [32]:
# Sample of distinct primary gene names in the advanced table.
# `pp_advanced` is loaded with `pl.read_parquet`, i.e. as an eager DataFrame,
# which has no `.collect()`. `.lazy()` is available on both eager and lazy
# frames, so the cell runs whichever way the table was loaded.
pp_advanced.lazy().select("primary_gene_name").unique().head().collect()

primary_gene_name
str
"""UBE2W"""
"""MLST8"""
"""ompC"""
"""Ctnna2"""
"""ETFB"""


In [28]:
# Preview the first rows of the advanced table.
# `pp_advanced` is loaded with `pl.read_parquet`, i.e. as an eager DataFrame,
# which has no `.collect()`. `.lazy()` is available on both eager and lazy
# frames, so the cell runs whichever way the table was loaded.
pp_advanced.lazy().head().collect()

entry_id,pdb_id,chain_id,protein_name,organism,common_name,taxonomy_id,classification,max_longevity_yrs,kingdom,phylum,uniprot_ids,UniProtKB_AC,UniProtKB_ID,GeneID_EntrezGene,RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,UniParc,PIR,NCBI_taxon,MIM,UniGene,PubMed,EMBL,EMBL_CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional_PubMed,uniprot_gene_names,uniprot_organism_name,ensembl_id,primary_gene_name
str,str,str,str,str,str,i64,str,f64,str,str,str,str,str,str,str,list[str],str,list[str],str,str,str,str,str,i64,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,str,str,str
"""7um4_1_A_B""","""7um4""","""B""","""N/A (from TSV)""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""Q9V2J8""","""Q9V2J8""","""Q9V2J8_PYRAB""",,"""WP_010867201.1""","[""83754399"", "" 283806864"", … "" 83754400""]","""2BFW:A; 2BIS:A; 2BIS:B; 2BIS:C…","[""GO:0004373""]","""UniRef100_Q9V2J8""","""UniRef90_Q9V2J8""","""UniRef50_C5A4D7""","""UPI00000346E2""","""A75194""",272844,,,"[""10736225"", "" 11381026"", … "" 39695122""]","[""AJ248283"", "" HE613800""]","[""CAB49000.1"", "" CCE69451.1""]",,,,,,,,
"""1gl2_1_A_C""","""1gl2""","""C""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""O88384""","""O88384""","""VTI1B_MOUSE""",,,"[""13124606"", "" 18655560"", … "" 3213229""]","""1GL2:C; 2QYW:A""","[""GO:0031901"", "" GO:0005794"", … "" GO:0016192""]","""UniRef100_O88384""","""UniRef90_O88384""","""UniRef50_O88384""","""UPI0000029675""",,10090,,,"[""9553086"", "" 15363411"", … "" 11786915""]","[""AF035208""]","[""AAC23483.1""]",,,,"[""12861006"", "" 15640147"", … "" 23258225""]",,,,
"""1gl2_1_C_B""","""1gl2""","""C""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""O88384""","""O88384""","""VTI1B_MOUSE""",,,"[""13124606"", "" 18655560"", … "" 3213229""]","""1GL2:C; 2QYW:A""","[""GO:0031901"", "" GO:0005794"", … "" GO:0016192""]","""UniRef100_O88384""","""UniRef90_O88384""","""UniRef50_O88384""","""UPI0000029675""",,10090,,,"[""9553086"", "" 15363411"", … "" 11786915""]","[""AF035208""]","[""AAC23483.1""]",,,,"[""12861006"", "" 15640147"", … "" 23258225""]",,,,
"""1gl2_1_D_C""","""1gl2""","""C""","""N/A (from TSV)""","""Balb/c mouse""","""House mouse""",10090,"""Mammalia""",4.0,"""Animalia""","""Chordata""","""O88384""","""O88384""","""VTI1B_MOUSE""",,,"[""13124606"", "" 18655560"", … "" 3213229""]","""1GL2:C; 2QYW:A""","[""GO:0031901"", "" GO:0005794"", … "" GO:0016192""]","""UniRef100_O88384""","""UniRef90_O88384""","""UniRef50_O88384""","""UPI0000029675""",,10090,,,"[""9553086"", "" 15363411"", … "" 11786915""]","[""AF035208""]","[""AAC23483.1""]",,,,"[""12861006"", "" 15640147"", … "" 23258225""]",,,,
"""7xrb_1_A_B""","""7xrb""","""A""","""N/A (from TSV)""","""Home sapiens""","""Human""",9606,"""Mammalia""",122.5,"""Animalia""","""Chordata""","""P49842""","""P49842""","""WHR1_HUMAN""","""8859""","""NP_004188.1; NP_115830.1""","[""2347132"", "" 4759180"", … "" 825661""]","""7XRB:A; 7XRB:B; 8YCM:A; 8YCM:B…","[""GO:0005737"", "" GO:0016607"", … "" GO:1903518""]","""UniRef100_P49842""","""UniRef90_P49842""","""UniRef50_P49842""","""UPI000002B442""","""B53439""",9606,"""604977""",,"[""8012361"", "" 8132574"", … "" 17344846""]","[""X77386"", "" X77474"", … "" U24578""]","[""CAA54565.1"", "" CAA54622.1"", … "" AAA99716.1""]","[""ENSG00000204344.16"", "" ENSG00000206342.12"", … "" ENSG00000226033.11""]","[""ENST00000375331.8"", "" ENST00000375333.4"", … "" ENST00000685781.1""]","[""ENSP00000364480.4"", "" ENSP00000364482.4"", … "" ENSP00000509445.1""]","[""19423540"", "" 19851445"", … "" 32531245""]",,,"""ENSG00000204344.16""","""ENSG00000204344"""


## MAPPING TSV TO PARQUET


In [None]:
# Locate the raw UniProt id-mapping TSV and check that it is present on disk.
mapping_path = input.joinpath("uniprot", "idmapping_selected.tab")
mapping_path.exists()

True

In [None]:
# Inspect the id-mapping parquet without materialising the whole table.
mapping_parquet = output.joinpath("idmapping_selected.parquet")
mapping_lazy = pl.scan_parquet(source=mapping_parquet)

# Resolve only the schema, then report it together with the column count.
print("Parquet schema:")
schema = mapping_lazy.collect_schema()
print(schema, f"\nTotal columns: {len(schema)}", sep="\n")

# Materialise just the first five rows for a preview.
print("\nFirst few rows:")
print(mapping_lazy.head().collect())

