# Imports and data loading

In [7]:
import sourmash
import screed
import os
import re
import pandas as pd

import glob
from tqdm import tqdm
from joblib import Parallel, delayed

from kmer_utils import get_encoded_kmer_hashvals

pd.options.display.max_columns = 50

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
outdir = '/Users/olgabot/botryllus/adhoc-analysis/2022-apr--gather-botryllus-in-human-mouse-with-kmers/'
# ! mkdir $outdir

In [3]:
%%time

gather_results = pd.read_parquet(
    os.path.join(outdir, "botryllus_gather_mouse_human_results_with_e_values.parquet")
)
print(gather_results.shape)
gather_results.head()

(19342484, 25)
CPU times: user 50.7 s, sys: 1min 8s, total: 1min 59s
Wall time: 1min 50s


Unnamed: 0,i_query,kmer_query,kmer_hp,hashval,name_query,i_found,kmer_found,name_found,n_kmers,intersect_bp,...,genomic_coord,all_homologs_found,transcript_id,n_hashes_query,n_hashes_found,bitscore,extreme_value_distribution,containment_scaled,pseudo_e_value,pseudo_e_value_log10
0,150,YSFTLKDDTGEVVLDQWNKASLVP,hphphpppphphhhpphpphphhh,3018107519398277786,g25414.t1 frame:1,1238,PKISLRRSSLKYLGCRYSEIKPYG,ENST00000361833.7 gene_id=ENSG00000152092.16;t...,3,15,...,Chr1:176855118-177164904(-),False,ENST00000361833.7,35,259,2.0,0.62786,1e-06,8960.605767,3.952337
1,151,SFTLKDDTGEVVLDQWNKASLVPG,phphpppphphhhpphpphphhhh,1234479950528962468,g25414.t1 frame:1,1239,KISLRRSSLKYLGCRYSEIKPYGL,ENST00000361833.7 gene_id=ENSG00000152092.16;t...,3,15,...,Chr1:176855118-177164904(-),False,ENST00000361833.7,35,259,2.0,0.62786,1e-06,8960.605767,3.952337
2,152,FTLKDDTGEVVLDQWNKASLVPGK,hphpppphphhhpphpphphhhhp,2563348012698350512,g25414.t1 frame:1,1240,ISLRRSSLKYLGCRYSEIKPYGLD,ENST00000361833.7 gene_id=ENSG00000152092.16;t...,3,15,...,Chr1:176855118-177164904(-),False,ENST00000361833.7,35,259,2.0,0.62786,1e-06,8960.605767,3.952337
3,15,NFHVGVKFVSSPKRGGVSQYLKGI,phphhhphhpphpphhhpphhphh,1046426320533182525,g25414.t1 frame:1,576,SVQPIVKLVSTATTAPPSTAPSGP,ENST00000453269.7 gene_id=ENSG00000106290.15;t...,2,10,...,Chr7:100107070-100119858(-),False,ENST00000453269.7,35,133,2.0,0.626589,3e-06,4585.523688,3.661389
4,16,FHVGVKFVSSPKRGGVSQYLKGIF,hphhhphhpphpphhhpphhphhh,2154375574397474662,g25414.t1 frame:1,577,VQPIVKLVSTATTAPPSTAPSGPG,ENST00000453269.7 gene_id=ENSG00000106290.15;t...,2,10,...,Chr7:100107070-100119858(-),False,ENST00000453269.7,35,133,2.0,0.626589,3e-06,4585.523688,3.661389


### What does BHF match to?

In [4]:
bhf_matches = gather_results.query('name_query == "BHF"')
bhf_matches.head()

Unnamed: 0,i_query,kmer_query,kmer_hp,hashval,name_query,i_found,kmer_found,name_found,n_kmers,intersect_bp,...,genomic_coord,all_homologs_found,transcript_id,n_hashes_query,n_hashes_found,bitscore,extreme_value_distribution,containment_scaled,pseudo_e_value,pseudo_e_value_log10
0,165,DDRFGERLIDRAQNKYAPLDEKQR,ppphhpphhpphppphhhhppppp,13592530723998561865,BHF,154,KSEFLSTAPRSLRKRLIVPRSHSD,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,6,30,...,Chr2:200908981-200963703(-),False,ENST00000234296.7,40,126,2.0,0.626281,3e-06,4960.631575,3.695537
1,166,DRFGERLIDRAQNKYAPLDEKQRS,pphhpphhpphppphhhhpppppp,18007806196568601670,BHF,155,SEFLSTAPRSLRKRLIVPRSHSDS,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,6,30,...,Chr2:200908981-200963703(-),False,ENST00000234296.7,40,126,2.0,0.626281,3e-06,4960.631575,3.695537
2,167,RFGERLIDRAQNKYAPLDEKQRSE,phhpphhpphppphhhhppppppp,4680963950811137194,BHF,156,EFLSTAPRSLRKRLIVPRSHSDSE,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,6,30,...,Chr2:200908981-200963703(-),False,ENST00000234296.7,40,126,2.0,0.626281,3e-06,4960.631575,3.695537
3,168,FGERLIDRAQNKYAPLDEKQRSES,hhpphhpphppphhhhpppppppp,343616811934702161,BHF,157,FLSTAPRSLRKRLIVPRSHSDSES,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,6,30,...,Chr2:200908981-200963703(-),False,ENST00000234296.7,40,126,2.0,0.626281,3e-06,4960.631575,3.695537
4,169,GERLIDRAQNKYAPLDEKQRSESH,hpphhpphppphhhhppppppppp,3434887395817678525,BHF,158,LSTAPRSLRKRLIVPRSHSDSESE,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,6,30,...,Chr2:200908981-200963703(-),False,ENST00000234296.7,40,126,2.0,0.626281,3e-06,4960.631575,3.695537


In [14]:
bhf_matches.to_clipboard()

In [11]:
bhf_matches_filtered = bhf_matches.query('pseudo_e_value_log10 > 4 or all_homologs_found == True')
print(bhf_matches_filtered.shape)
bhf_matches_filtered

(40, 25)


Unnamed: 0,i_query,kmer_query,kmer_hp,hashval,name_query,i_found,kmer_found,name_found,n_kmers,intersect_bp,containment,symbol,species,found_i,homolog_group,genomic_coord,all_homologs_found,transcript_id,n_hashes_query,n_hashes_found,bitscore,extreme_value_distribution,containment_scaled,pseudo_e_value,pseudo_e_value_log10
10,37,IFCRILTALHLKKRRTEHDHQKLL,hhpphhphhphppppppppppphh,10240013254256923235,BHF,604,PLRRLGRPPKITTTNENQKTNTVA,ENST00000369577.8 gene_id=ENSG00000188994.13;t...,6,30,0.003591,ZNF292,human,2,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000369577.8,40,557,2.0,0.6308,1.611609e-07,22200.143455,4.346356
11,38,FCRILTALHLKKRRTEHDHQKLLS,hpphhphhphppppppppppphhp,8047764348242702868,BHF,605,LRRLGRPPKITTTNENQKTNTVAK,ENST00000369577.8 gene_id=ENSG00000188994.13;t...,6,30,0.003591,ZNF292,human,2,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000369577.8,40,557,2.0,0.6308,1.611609e-07,22200.143455,4.346356
12,39,CRILTALHLKKRRTEHDHQKLLSE,pphhphhphppppppppppphhpp,1452226584929511062,BHF,606,RRLGRPPKITTTNENQKTNTVAKQ,ENST00000369577.8 gene_id=ENSG00000188994.13;t...,6,30,0.003591,ZNF292,human,2,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000369577.8,40,557,2.0,0.6308,1.611609e-07,22200.143455,4.346356
13,40,RILTALHLKKRRTEHDHQKLLSES,phhphhphppppppppppphhppp,16483354457599739342,BHF,607,RLGRPPKITTTNENQKTNTVAKQE,ENST00000369577.8 gene_id=ENSG00000188994.13;t...,6,30,0.003591,ZNF292,human,2,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000369577.8,40,557,2.0,0.6308,1.611609e-07,22200.143455,4.346356
14,41,ILTALHLKKRRTEHDHQKLLSESQ,hhphhphppppppppppphhpppp,14855361501953140786,BHF,608,LGRPPKITTTNENQKTNTVAKQEQ,ENST00000369577.8 gene_id=ENSG00000188994.13;t...,6,30,0.003591,ZNF292,human,2,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000369577.8,40,557,2.0,0.6308,1.611609e-07,22200.143455,4.346356
15,42,LTALHLKKRRTEHDHQKLLSESQE,hphhphppppppppppphhppppp,791636867468234754,BHF,609,GRPPKITTTNENQKTNTVAKQEQR,ENST00000369577.8 gene_id=ENSG00000188994.13;t...,6,30,0.003591,ZNF292,human,2,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000369577.8,40,557,2.0,0.6308,1.611609e-07,22200.143455,4.346356
16,37,IFCRILTALHLKKRRTEHDHQKLL,hhpphhphhphppppppppppphh,10240013254256923235,BHF,599,PLRRLGRPPKITTTNENQKTNTVA,ENST00000339907.8 gene_id=ENSG00000188994.13;t...,6,30,0.003584,ZNF292,human,3,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000339907.8,40,558,2.0,0.630802,1.605838e-07,22240.143198,4.347138
17,38,FCRILTALHLKKRRTEHDHQKLLS,hpphhphhphppppppppppphhp,8047764348242702868,BHF,600,LRRLGRPPKITTTNENQKTNTVAK,ENST00000339907.8 gene_id=ENSG00000188994.13;t...,6,30,0.003584,ZNF292,human,3,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000339907.8,40,558,2.0,0.630802,1.605838e-07,22240.143198,4.347138
18,39,CRILTALHLKKRRTEHDHQKLLSE,pphhphhphppppppppppphhpp,1452226584929511062,BHF,601,RRLGRPPKITTTNENQKTNTVAKQ,ENST00000339907.8 gene_id=ENSG00000188994.13;t...,6,30,0.003584,ZNF292,human,3,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000339907.8,40,558,2.0,0.630802,1.605838e-07,22240.143198,4.347138
19,40,RILTALHLKKRRTEHDHQKLLSES,phhphhphppppppppppphhppp,16483354457599739342,BHF,602,RLGRPPKITTTNENQKTNTVAKQE,ENST00000339907.8 gene_id=ENSG00000188994.13;t...,6,30,0.003584,ZNF292,human,3,ZNF292---Zfp292,Chr6:87155551-87264172(+),False,ENST00000339907.8,40,558,2.0,0.630802,1.605838e-07,22240.143198,4.347138


In [12]:
bhf_matches_filtered.to_clipboard()

In [5]:
print(bhf_matches.shape)

(151, 25)


In [5]:
bhf_matches[
    ["symbol", "species", "homolog_group", "all_homologs_found"]
].drop_duplicates().to_clipboard()

In [6]:
len(set(gather_results.name_query))

31972

## Read Human MHC gene ids

In [7]:
human_gencode_dir = '/Users/olgabot/botryllus-data/data/gencode/v38'

mhc_id_csv = os.path.join(human_gencode_dir, 'mhc_ids.csv')
mhc_ids = pd.read_csv(mhc_id_csv)
mhc_ids.head()

Unnamed: 0,transcript_id,gene_id,gene_name
0,ENST00000474923.1,ENSG00000198704.9,GPX6
1,ENST00000361902.5,ENSG00000198704.9,GPX6
2,ENST00000612264.4,ENSG00000198704.9,GPX6
3,ENST00000412168.7,ENSG00000224586.7,GPX5
4,ENST00000469384.1,ENSG00000224586.7,GPX5


In [8]:

mhc_gene_names = set(mhc_ids.gene_name)

In [9]:
# mhc_transcript_ids = set(mhc_ids.transcript_id)

# Filter gather results

## Use botryllus genes with a match to human chr6 or mouse chromosome 17

Human MHC region is on chr6, mouse MHC region is on chr17, but the region may be too small - use full chromosomes for matching

In [10]:
gather_results.shape

(19342484, 25)

In [11]:
%%time

gather_results_mhc = gather_results.groupby("name_query").filter(
    lambda x: x.query('species == "human"').genomic_coord.str.startswith("Chr6:").any()
    or x.query('species == "mouse"').genomic_coord.str.startswith("Chr17:").any()
    # or x.transcript_id.isin(mhc_transcript_ids).any()
)
print(gather_results_mhc.shape)
gather_results_mhc.head()

(17150858, 25)
CPU times: user 1min 52s, sys: 13.9 s, total: 2min 6s
Wall time: 2min 14s


Unnamed: 0,i_query,kmer_query,kmer_hp,hashval,name_query,i_found,kmer_found,name_found,n_kmers,intersect_bp,...,genomic_coord,all_homologs_found,transcript_id,n_hashes_query,n_hashes_found,bitscore,extreme_value_distribution,containment_scaled,pseudo_e_value,pseudo_e_value_log10
0,623,DVNLRDQQGKSPIFYAEEQNNLDV,phphpppphpphhhhhppppphph,9053272763448756255,g22187.t1 frame:1,30,RGHLCRTRPTDLVFVVDSSRSVRP,ENST00000373765.5 gene_id=ENSG00000162510.6;tr...,4,20,...,Chr1:30711277-30723585(-),True,ENST00000373765.5,202,85,2.0,0.623465,1.370379e-06,16770.715881,4.224552
1,624,VNLRDQQGKSPIFYAEEQNNLDVM,hphpppphpphhhhhppppphphh,1654375679729437405,g22187.t1 frame:1,31,GHLCRTRPTDLVFVVDSSRSVRPV,ENST00000373765.5 gene_id=ENSG00000162510.6;tr...,4,20,...,Chr1:30711277-30723585(-),True,ENST00000373765.5,202,85,2.0,0.623465,1.370379e-06,16770.715881,4.224552
2,625,NLRDQQGKSPIFYAEEQNNLDVMK,phpppphpphhhhhppppphphhp,2361844746549016494,g22187.t1 frame:1,32,HLCRTRPTDLVFVVDSSRSVRPVE,ENST00000373765.5 gene_id=ENSG00000162510.6;tr...,4,20,...,Chr1:30711277-30723585(-),True,ENST00000373765.5,202,85,2.0,0.623465,1.370379e-06,16770.715881,4.224552
3,626,LRDQQGKSPIFYAEEQNNLDVMKY,hpppphpphhhhhppppphphhph,8601332297558688433,g22187.t1 frame:1,33,LCRTRPTDLVFVVDSSRSVRPVEF,ENST00000373765.5 gene_id=ENSG00000162510.6;tr...,4,20,...,Chr1:30711277-30723585(-),True,ENST00000373765.5,202,85,2.0,0.623465,1.370379e-06,16770.715881,4.224552
4,308,GKEKGTKKSPEIADDVSDNDKADA,hppphpppphphhpphppppphph,4814295539498208365,g22187.t1 frame:1,265,INKRLSKSSATLWNSPSRNRSLQL,ENST00000474796.2 gene_id=ENSG00000116871.16;t...,7,35,...,Chr1:36155965-36180849(+),True,ENST00000474796.2,202,196,2.0,0.628367,2.577309e-07,39190.054231,4.593176


In [12]:
len(set(gather_results_mhc.name_query))

9906

In [13]:
%%time

mhc_parquet =  os.path.join(outdir, "botryllus_gather_mouse_human_results_with_e_values_mhc_chromosomes.parquet")
gather_results_mhc.to_parquet(mhc_parquet)

CPU times: user 32.5 s, sys: 26.8 s, total: 59.3 s
Wall time: 1min 16s


In [14]:
! ls -lha $mhc_parquet

-rw-r--r--  1 olgabot  staff   638M May 21 11:55 /Users/olgabot/botryllus/adhoc-analysis/2022-apr--gather-botryllus-in-human-mouse-with-kmers/botryllus_gather_mouse_human_results_with_e_values_mhc_chromosomes.parquet
