# Mapping analysis for datasets in the Mouse-Human mapping effort

More information can be found here: https://github.com/obophenotype/mp_hp_mapping

In [69]:
import datetime
import pandas as pd
import numpy as np
import pathlib

print(f"Last run: {datetime.datetime.now()}")

Last run: 2020-12-10 16:21:04.986420


In [46]:
human_phenotype = "human_phenotype"
mammalian_phenotype = "mammalian_phenotype"
obo_iri_prefix = "http://purl.obolibrary.org/obo/"
mp_iri_prefix = "http://purl.obolibrary.org/obo/MP_"
hp_iri_prefix = "http://purl.obolibrary.org/obo/HP_"

In [40]:
use_case_dir = "use_cases"
sources_dir = "sources"

use_cases = {
    "kids_first": pathlib.Path.cwd().joinpath(use_case_dir, "April2020_KF_Data_Phenotypes_HPO.csv")
}
raw_mapping_data = {
    "upheno_logical": pathlib.Path.cwd().joinpath(sources_dir,"upheno", "upheno_mapping_logical.csv"),
    "upheno_lexical": pathlib.Path.cwd().joinpath(sources_dir,"upheno", "upheno_mapping_lexical.csv")
}
                                 

# Load data
## Load use case data
### Load Kids First data

In [41]:
df_kids_first = pd.read_csv(use_cases['kids_first'])
df_kids_first[human_phenotype]=df_kids_first['hpo_id_phenotype']
df_kids_first = df_kids_first[human_phenotype]
df_kids_first

0      HP:0030319
1      HP:0040106
2      HP:0040064
3      HP:0001252
4      HP:0001999
          ...    
782    HP:0000807
783    HP:0011805
784    HP:0010741
785    HP:0002992
786    HP:0009736
Name: human_phenotype, Length: 787, dtype: object

## Load mapping data
### Load upheno mappings

In [77]:
def extract_upheno_mappings(df_upheno):
    df_upheno = df_upheno[["p1", "p2"]]
    df_upheno = df_upheno[(df_upheno.p1.str.startswith(hp_iri_prefix)) & (df_upheno.p2.str.startswith(mp_iri_prefix))]
    df_upheno["p1"] = df_upheno.p1.str.replace(obo_iri_prefix,"")
    df_upheno["p1"] = df_upheno.p1.str.replace("_",":")
    df_upheno["p2"] = df_upheno.p2.str.replace("_",":")
    df_upheno["p2"] = df_upheno.p2.str.replace(obo_iri_prefix,"")
    df_upheno.columns = [human_phenotype,mammalian_phenotype]
    return df_upheno

dfm_upheno_logical = extract_upheno_mappings(pd.read_csv(raw_mapping_data['upheno_logical']))
dfm_upheno_lexical = extract_upheno_mappings(pd.read_csv(raw_mapping_data['upheno_lexical']))
print(len(dfm_upheno_logical))
print(len(dfm_upheno_lexical))
print(dfm_upheno_logical.head())
print(dfm_upheno_lexical.head())

929
1711
   human_phenotype mammalian_phenotype
2       HP:0012091          MP:0002693
11      HP:0002208          MP:0002832
21      HP:0003537          MP:0008822
23      HP:0002558          MP:0009723
29      HP:3000052          MP:0003056
  human_phenotype mammalian_phenotype
0      HP:0000347          MP:0002639
2      HP:0000347          MP:0004592
5      HP:0000327          MP:0004540
6      HP:0005736          MP:0002764
9      HP:0003270          MP:0009247


# Map data
## Kids First

In [78]:
def merge_mappings(df,dfm,mapping="mapping"):
    df = df.merge(dfm, how="left")
    df[mapping] = ~df['mammalian_phenotype'].isna()
    return df

df_kids_first_mapped = merge_mappings(df_kids_first.to_frame(), dfm_upheno_logical,"upheno_logical")
df_kids_first_mapped = merge_mappings(df_kids_first_mapped, dfm_upheno_lexical,"upheno_lexical")
df_kids_first_mapped.to_csv("df_kids_first_mapped.csv")
df_kids_first_mapped

Unnamed: 0,human_phenotype,mammalian_phenotype,upheno_logical,upheno_lexical
0,HP:0030319,,False,False
1,HP:0040106,,False,False
2,HP:0040064,,False,False
3,HP:0001252,,False,False
4,HP:0001999,MP:0003743,True,True
...,...,...,...,...
782,HP:0000807,,False,False
783,HP:0011805,,False,False
784,HP:0010741,,False,False
785,HP:0002992,MP:0000558,True,True


In [66]:
df_kids_first_mapped.upheno_logical.value_counts()

False    663
True     124
Name: upheno_logical, dtype: int64

In [65]:
df_kids_first_mapped.upheno_lexical.value_counts()

False    663
True     124
Name: upheno_lexical, dtype: int64

In [70]:
df_kids_first_mapped['combined']=np.where( ( (df_kids_first_mapped.upheno_logical) | (df_kids_first_mapped.upheno_lexical)), True, False) 
df_kids_first_mapped

Unnamed: 0,human_phenotype,mammalian_phenotype,upheno_logical,upheno_lexical,combined
0,HP:0030319,,False,False,False
1,HP:0040106,,False,False,False
2,HP:0040064,,False,False,False
3,HP:0001252,,False,False,False
4,HP:0001999,MP:0003743,True,True,True
...,...,...,...,...,...
782,HP:0000807,,False,False,False
783,HP:0011805,,False,False,False
784,HP:0010741,,False,False,False
785,HP:0002992,MP:0000558,True,True,True


In [72]:
df_kids_first_mapped.combined.value_counts()

False    663
True     124
Name: combined, dtype: int64