In [2]:
from automatic_variable_mapping import corpus, vocab_similarity
import pandas as pd
from functools import partial
import time
import numpy as np

In [3]:
from automatic_variable_mapping.vocab_similarity import default_pairable, partition
# Worker count handed to corpus.build_corpora (as num_cpus=) inside calc_score_results.
# NOTE(review): hard-coded to 14; confirm this suits the machine running the notebook.
num_cpus = 14

In [14]:
# Re-import the corpus module so that edits to corpus.py take effect in the running kernel.
# NOTE(review): `reload` is a builtin only on Python 2, and this cell's execution count is
# out of sequence with its neighbours, so it was presumably run by hand - confirm it is
# still needed for a clean top-to-bottom run.
reload(corpus)

<module 'automatic_variable_mapping.corpus' from 'automatic_variable_mapping/corpus.py'>

In [5]:
def matching_groups(data, group_col, corpora, pair_id, ref_id):
    """Tell whether two documents carry the same value in ``group_col``.

    The position of each id is found by scanning the first element of every
    2-item entry in ``corpora``; ``data`` is then indexed at those positions
    and the ``group_col`` values are compared.

    Args:
        data: Object indexed first by position and then by ``group_col``.
            Presumably aligned one-to-one with ``corpora`` - verify against caller.
        group_col: Key of the grouping value to compare.
        corpora: Iterable of ``(doc_id, <anything>)`` pairs.
        pair_id: Id of the candidate document.
        ref_id: Id of the reference document.

    Returns:
        Result of ``data[pair position][group_col] == data[ref position][group_col]``.

    Raises:
        ValueError: If ``ref_id`` or ``pair_id`` is not among the doc ids
            (``ref_id`` is looked up first).
    """
    doc_ids = []
    for doc_id, _ in corpora:
        doc_ids.append(doc_id)

    ref_pos = doc_ids.index(ref_id)
    pair_pos = doc_ids.index(pair_id)

    pair_group = data[pair_pos][group_col]
    ref_group = data[ref_pos][group_col]
    return pair_group == ref_group


def pairable_by_group(data, group_col, corpus_doc_ids, score, pair_id, _, ref_id):
    """Pairability predicate that also rejects pairs sharing a group value.

    A pair is accepted only when ``vocab_similarity.default_pairable`` accepts
    it and ``matching_groups`` reports that the two ids do NOT share the same
    ``group_col`` value.

    Args:
        data: Forwarded to ``matching_groups``.
        group_col: Forwarded to ``matching_groups``.
        corpus_doc_ids: Forwarded to ``matching_groups`` as its ``corpora``
            argument, which unpacks each entry as a ``(doc_id, <anything>)``
            pair - so despite the name this must be an iterable of pairs.
        score: Forwarded to ``default_pairable``.
        pair_id: Id of the candidate variable.
        _: Ignored; ``None`` is passed to ``default_pairable`` in its place.
        ref_id: Id of the reference variable.

    Returns:
        The falsy result of ``default_pairable`` when that check fails,
        otherwise ``True``/``False`` for "groups differ".
    """
    passes_default = vocab_similarity.default_pairable(score, pair_id, None, ref_id)
    if not passes_default:
        # Same as `and` short-circuiting: hand back the falsy value itself and
        # skip the group lookup entirely.
        return passes_default

    same_group = matching_groups(data, group_col, corpus_doc_ids, pair_id, ref_id)
    return not same_group


def calc_score_results(data_file, doc_cols, ref_id_col, filter_file, mult_corpora=False, corpora_col=None):
    """Build corpora from a CSV, compute TF-IDF and score variable similarity.

    Steps, in order:
      1. Read ``data_file``; when ``mult_corpora`` is true, split it with
         ``partition(data, corpora_col)``, otherwise use it as a single corpus.
      2. Read ``filter_file`` (skipped when it is the same path as
         ``data_file``); its ``ref_id_col`` column is what gets handed to
         ``VariableSimilarityCalculator``.
      3. Build corpora with ``corpus.build_corpora`` and a TF-IDF matrix with
         ``corpus.calc_tfidf``.
      4. Score with ``default_pairable`` as the pairing predicate.

    Args:
        data_file: Path of the CSV holding the variable documentation.
        doc_cols: Columns passed to ``corpus.build_corpora`` as the document columns.
        ref_id_col: Name of the id column (must exist in the filter data).
        filter_file: Path of the CSV supplying the reference ids.
        mult_corpora: When true, partition the data into several corpora.
        corpora_col: Column handed to ``partition``; only used when
            ``mult_corpora`` is true.

    Returns:
        ``scores.cache`` of the calculator after scoring. Callers in this
        notebook treat it as a DataFrame with a ``'score'`` column.

    Note:
        Reads the module-level ``num_cpus`` for the corpus build.
    """
    # Both CSVs are parsed with exactly the same options.
    csv_options = dict(sep=",", quotechar='"', na_values="", low_memory=False)

    data = pd.read_csv(data_file, **csv_options)
    corpora_data = partition(data, corpora_col) if mult_corpora else [data]

    # Avoid a second read when the filter file is the data file itself.
    filter_data = pd.read_csv(filter_file, **csv_options) if filter_file != data_file else data

    corpora = corpus.build_corpora(doc_cols, corpora_data, ref_id_col, num_cpus=num_cpus)
    tfidf_matrix = corpus.calc_tfidf(corpora)

    scores = vocab_similarity.VariableSimilarityCalculator(filter_data[ref_id_col],
                                                           pairable=default_pairable)
    scores.init_cache()

    # The single-corpus case scores the one corpus directly rather than the list.
    scoring_input = corpora if mult_corpora else corpora[0]
    scores.score_variables(scoring_input, tfidf_matrix)
    return scores.cache

In [6]:
# Input files for the observational heart-study run.
# NOTE(review): these are user-specific Dropbox paths; the notebook will not run elsewhere
# without pointing them at a local copy of the data.
obs_data_file = "~/Dropbox/tiff_laura_shared/var_doc_obs_heart_studies_dbGaP_NLP.csv"
obs_man_file = "~/Dropbox/tiff_laura_shared/manual_concept_var_mappings_dbGaP_obs_heart_studies_NLP.csv"

# Id column used as ref_id_col by the scoring helpers.
ref_id_col = 'dbGaP_studyID_datasetID_varID'

# Score label -> document columns used to build the corpus for that run.
# calc_scores_doc_cols turns each label into a "score_<label>" column.
doc_cols_inputs = {'desc': ['variable_description'],
                   'units': ['units'],
                   # NOTE(review): this entry uses 'var_coding_counts_distribution' while the
                   # combined entries below use 'variable_coding_counts_distribution';
                   # confirm which column actually exists in obs_data_file.
                   'coding': ['var_coding_counts_distribution'],
                   'desc_units': ['variable_description', 'units'],
                   # Description + coding only. This entry previously also listed 'units',
                   # which made it an exact duplicate of 'desc_units_coding'.
                   'desc_coding': ['variable_description', 'variable_coding_counts_distribution'],
                   'desc_units_coding': ['variable_description', 'units', 'variable_coding_counts_distribution']}


In [7]:
# Load the manual mapping file (obs_man_file) and list the columns it provides.
# Only the last expression of a cell is displayed, so the columns are shown on their own;
# a bare `dt` before it would display nothing.
dt = pd.read_csv(obs_man_file,
                 sep=",",
                 quotechar='"',
                 na_values="",
                 low_memory=False)
dt.columns

Index([u'study', u'cohort_dbGaP', u'dbGaP_studyID_datasetID',
       u'dbGaP_dataset_label', u'concept', u'data_desc', u'varID', u'var_desc',
       u'units', u'var_coding_counts_distribution', u'var_doc_id',
       u'concept_id', u'dbGaP_studyID_datasetID_varID', u'var_coding_labels'],
      dtype='object')

In [12]:
def calc_scores_doc_cols(data_file, doc_cols_inputs, ref_id_col, filter_file, mult_corpora=False, corpora_col=None):
    """Run ``calc_score_results`` once per document-column combination.

    Args:
        data_file: Forwarded to ``calc_score_results``.
        doc_cols_inputs: Mapping of label -> list of document columns. Each
            label yields one scoring run.
        ref_id_col: Forwarded to ``calc_score_results``.
        filter_file: Forwarded to ``calc_score_results``.
        mult_corpora: Forwarded to ``calc_score_results``.
        corpora_col: Forwarded to ``calc_score_results``.

    Returns:
        A list with one result per label, in the mapping's iteration order,
        where the ``'score'`` column has been renamed to ``"score_<label>"``.
        An empty mapping gives an empty list.
    """
    per_label_scores = []
    for label, doc_cols in doc_cols_inputs.items():
        renamed_column = "score_" + label
        raw_scores = calc_score_results(data_file, doc_cols, ref_id_col, filter_file, mult_corpora, corpora_col)
        per_label_scores.append(raw_scores.rename({'score': renamed_column}, axis=1))

    # Results are returned unmerged, one frame per label; an outer merge of the
    # frames on ref_id_col was sketched here at one point but is not performed.
    return per_label_scores

In [None]:
# Score every document-column combination for the observational studies.
# mult_corpora keeps its default (False), so the whole data file is one corpus.
obs_scores_tfidf = calc_scores_doc_cols(data_file=obs_data_file,
                                        doc_cols_inputs=doc_cols_inputs,
                                        ref_id_col=ref_id_col,
                                        filter_file=obs_man_file)

# Multi-corpus variant, partitioned on 'study_1' (currently disabled):
#obs_scores_tfmcdf = calc_scores_doc_cols(obs_data_file, doc_cols_inputs, ref_id_col, obs_man_file, mult_corpora=True, corpora_col='study_1')

  pool = multiprocessing.Pool(processes=num_cpus)
100%|██████████| 105611/105611 [00:34<00:00, 3031.22it/s]


Finding valid pair ids
Pair ids: 105611
Finding valid ref ids
Ref ids: 1709
Multiplying matrices
LHS: (105611, 60261)
RHS: (1709, 60261)
Sim Matrix: (1709, 105611)
Finding matches for 0 1 phs000007.v26.pht000009.v2.MF5
Finding matches for 1 15 phs000007.v26.pht000009.v2.MF20
Finding matches for 2 557 phs000007.v26.pht000009.v2.shareid
Finding matches for 3 708 phs000007.v26.pht000010.v3.shareid
Finding matches for 4 709 phs000007.v26.pht000011.v3.FB8
Finding matches for 5 716 phs000007.v26.pht000011.v3.FB18
Finding matches for 6 717 phs000007.v26.pht000011.v3.FB19
Finding matches for 7 718 phs000007.v26.pht000011.v3.FB20
Finding matches for 8 719 phs000007.v26.pht000011.v3.FB21
Finding matches for 9 720 phs000007.v26.pht000011.v3.FB22
Finding matches for 10 721 phs000007.v26.pht000011.v3.FB23
Finding matches for 11 726 phs000007.v26.pht000011.v3.FB28
Finding matches for 12 727 phs000007.v26.pht000011.v3.FB29
Finding matches for 13 728 phs000007.v26.pht000011.v3.FB30
Finding matches for

Finding matches for 134 3179 phs000007.v26.pht000021.v3.FL021
Finding matches for 135 3180 phs000007.v26.pht000021.v3.FL022
Finding matches for 136 3192 phs000007.v26.pht000021.v3.FL291
Finding matches for 137 3193 phs000007.v26.pht000021.v3.FL292
Finding matches for 138 3194 phs000007.v26.pht000021.v3.FL388
Finding matches for 139 3195 phs000007.v26.pht000021.v3.FL389
Finding matches for 140 3196 phs000007.v26.pht000021.v3.FL392
Finding matches for 141 3334 phs000007.v26.pht000021.v3.FL186
Finding matches for 142 3344 phs000007.v26.pht000021.v3.FL196
Finding matches for 143 3345 phs000007.v26.pht000021.v3.FL197
Finding matches for 144 3453 phs000007.v26.pht000021.v3.FL332
Finding matches for 145 3506 phs000007.v26.pht000021.v3.FL393
Finding matches for 146 3518 phs000007.v26.pht000021.v3.FL406
Finding matches for 147 3519 phs000007.v26.pht000021.v3.FL407
Finding matches for 148 3581 phs000007.v26.pht000021.v3.FL517
Finding matches for 149 3582 phs000007.v26.pht000021.v3.FL519
Finding 

Finding matches for 267 7455 phs000007.v26.pht000031.v7.B357
Finding matches for 268 7857 phs000007.v26.pht000032.v6.shareid
Finding matches for 269 7920 phs000007.v26.pht000032.v6.C67
Finding matches for 270 7930 phs000007.v26.pht000032.v6.C77
Finding matches for 271 7931 phs000007.v26.pht000032.v6.C78
Finding matches for 272 8017 phs000007.v26.pht000032.v6.C184
Finding matches for 273 8018 phs000007.v26.pht000032.v6.C185
Finding matches for 274 8058 phs000007.v26.pht000032.v6.C225
Finding matches for 275 8108 phs000007.v26.pht000032.v6.C286
Finding matches for 276 8109 phs000007.v26.pht000032.v6.C287
Finding matches for 277 8111 phs000007.v26.pht000032.v6.C289
Finding matches for 278 8112 phs000007.v26.pht000032.v6.C290
Finding matches for 279 8124 phs000007.v26.pht000032.v6.C302
Finding matches for 280 8125 phs000007.v26.pht000032.v6.C303
Finding matches for 281 8152 phs000007.v26.pht000032.v6.C330
Finding matches for 282 8155 phs000007.v26.pht000032.v6.C333
Finding matches for 283 

Finding matches for 397 10407 phs000007.v26.pht000041.v6.DIAB3
Finding matches for 398 10408 phs000007.v26.pht000041.v6.DIAB4
Finding matches for 399 10409 phs000007.v26.pht000041.v6.DIAB5
Finding matches for 400 10410 phs000007.v26.pht000041.v6.DIAB6
Finding matches for 401 10411 phs000007.v26.pht000041.v6.DIAB7
Finding matches for 402 10628 phs000007.v26.pht000042.v3.shareid
Finding matches for 403 10648 phs000007.v26.pht000043.v7.shareid
Finding matches for 404 10731 phs000007.v26.pht000044.v2.shareid
Finding matches for 405 10732 phs000007.v26.pht000074.v9.shareid
Finding matches for 406 11286 phs000007.v26.pht000076.v6.shareid
Finding matches for 407 11305 phs000007.v26.pht000077.v6.shareid
Finding matches for 408 11324 phs000007.v26.pht000078.v6.shareid
Finding matches for 409 11403 phs000007.v26.pht000079.v6.shareid
Finding matches for 410 11406 phs000007.v26.pht000080.v6.shareid
Finding matches for 411 11410 phs000007.v26.pht000081.v6.shareid
Finding matches for 412 11413 phs00

Finding matches for 524 14362 phs000007.v26.pht000393.v6.shareid
Finding matches for 525 14367 phs000007.v26.pht000394.v7.shareid
Finding matches for 526 14453 phs000007.v26.pht000395.v8.shareid
Finding matches for 527 15791 phs000007.v26.pht000395.v8.BMI_s1
Finding matches for 528 15839 phs000007.v26.pht000396.v4.shareid
Finding matches for 529 15926 phs000007.v26.pht000397.v3.shareid
Finding matches for 530 16037 phs000007.v26.pht000397.v3.bmi_s2
Finding matches for 531 17194 phs000007.v26.pht000602.v3.shareid
Finding matches for 532 17199 phs000007.v26.pht000603.v2.shareid
Finding matches for 533 17200 phs000007.v26.pht000604.v3.shareid
Finding matches for 534 17205 phs000007.v26.pht000605.v3.shareid
Finding matches for 535 17211 phs000007.v26.pht000606.v3.shareid
Finding matches for 536 17222 phs000007.v26.pht000607.v3.shareid
Finding matches for 537 17892 phs000007.v26.pht000608.v2.shareid
Finding matches for 538 17896 phs000007.v26.pht000609.v3.shareid
Finding matches for 539 180

Finding matches for 652 21441 phs000007.v26.pht001894.v3.shareid
Finding matches for 653 21457 phs000007.v26.pht002072.v5.shareid
Finding matches for 654 21461 phs000007.v26.pht002073.v4.shareid
Finding matches for 655 21468 phs000007.v26.pht002074.v3.shareid
Finding matches for 656 21637 phs000007.v26.pht002075.v4.shareid
Finding matches for 657 21639 phs000007.v26.pht002076.v5.shareid
Finding matches for 658 21647 phs000007.v26.pht002077.v5.shareid
Finding matches for 659 21987 phs000007.v26.pht002078.v5.shareid
Finding matches for 660 22046 phs000007.v26.pht002080.v2.shareid
Finding matches for 661 22047 phs000007.v26.pht002141.v4.shareid
Finding matches for 662 22056 phs000007.v26.pht002142.v4.shareid
Finding matches for 663 22065 phs000007.v26.pht002143.v4.shareid
Finding matches for 664 22074 phs000007.v26.pht002144.v3.shareid
Finding matches for 665 22090 phs000007.v26.pht002145.v3.shareid
Finding matches for 666 22102 phs000007.v26.pht002146.v4.shareid
Finding matches for 667 2

Finding matches for 780 27204 phs000007.v26.pht003099.v4.age24
Finding matches for 781 27207 phs000007.v26.pht003099.v4.age25
Finding matches for 782 27210 phs000007.v26.pht003099.v4.age26
Finding matches for 783 27213 phs000007.v26.pht003099.v4.age27
Finding matches for 784 27216 phs000007.v26.pht003099.v4.age28
Finding matches for 785 27219 phs000007.v26.pht003099.v4.age29
Finding matches for 786 27222 phs000007.v26.pht003099.v4.age30
Finding matches for 787 27225 phs000007.v26.pht003099.v4.age31
Finding matches for 788 27228 phs000007.v26.pht003099.v4.age32
Finding matches for 789 27229 phs000007.v26.pht003099.v4.age20
Finding matches for 790 27230 phs000007.v26.pht003309.v3.shareid
Finding matches for 791 27236 phs000007.v26.pht003310.v2.shareid
Finding matches for 792 27304 phs000007.v26.pht003311.v2.shareid
Finding matches for 793 27308 phs000007.v26.pht003312.v1.shareid
Finding matches for 794 27332 phs000007.v26.pht003315.v4.shareid
Finding matches for 795 27334 phs000007.v26.p

Finding matches for 908 55581 phs000287.v5.pht001451.v1.COFFEE25
Finding matches for 909 55630 phs000287.v5.pht001451.v1.LDLADJ
Finding matches for 910 55633 phs000287.v5.pht001452.v1.Individual_ID
Finding matches for 911 55649 phs000287.v5.pht001452.v1.GRADE01
Finding matches for 912 55650 phs000287.v5.pht001452.v1.GEND01
Finding matches for 913 55651 phs000287.v5.pht001452.v1.MARIT01
Finding matches for 914 55672 phs000287.v5.pht001452.v1.AFIB
Finding matches for 915 55697 phs000287.v5.pht001452.v1.PRGNT
Finding matches for 916 55734 phs000287.v5.pht001452.v1.BMI
Finding matches for 917 55737 phs000287.v5.pht001452.v1.BEAT14
Finding matches for 918 55774 phs000287.v5.pht001452.v1.HDL44
Finding matches for 919 55783 phs000287.v5.pht001452.v1.AVZMSYS
Finding matches for 920 55784 phs000287.v5.pht001452.v1.AVZMDIA
Finding matches for 921 55803 phs000287.v5.pht001452.v1.ECGLVH
Finding matches for 922 55814 phs000287.v5.pht001452.v1.SMOKE
Finding matches for 923 55817 phs000287.v5.pht0014

Finding matches for 1033 60466 phs000287.v5.pht001490.v1.MARIT01
Finding matches for 1034 60467 phs000287.v5.pht001490.v1.GRADE01


In [None]:
# TODO: add standard, and do the same as above for clinical trials.

# doc_col = ["var_desc_1", "units_1", "var_coding_counts_distribution_1"]
# NOTE(review): score_file is not referenced by any other visible cell; confirm it is still needed.
score_file = 'tests/test_var_similarity_scores_rank_data.csv'

In [None]:
# NOTE(review): `orig_data` and `v` are not defined in any visible cell, and
# `orig_out_file_name` is not used below - confirm where they come from before
# relying on this check in a fresh kernel.
orig_out_file_name = "tests/orig_file_out.csv"

# Left-join the computed scores (v.cache) onto the original pairs and round to 6 places.
comb = pd.merge(
    orig_data,
    v.cache,
    how='left',
    left_on=['metadataID_1', 'metadataID_2'],
    right_on=['reference var', 'paired var'],
).round(6)

# The number of merged rows whose 'score' equals 'score_desc' must equal the
# number of rows in orig_data.
assert comb.loc[
    comb['score'] == comb['score_desc'],
    ["score_desc", "score", "reference var", "metadataID_1", "metadataID_2", "paired var"]
].shape[0] == orig_data.shape[0]