# Entity Stats Analysis

In [1]:
from pymongo import MongoClient
import nltk
import config as cfg
import pandas as pd
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
from collections import Counter

In [2]:
client = MongoClient('localhost:' + str(cfg.mongoDB_Port))

db = client.pub
pub = client.pub.publications
ent = client.pub.entities

In [3]:
mongo_string_search = {'$and': [{'label': 'dataset'}, {'paper_id': {'$exists': True}}]}                
results = db.entities.find(mongo_string_search)
datasets = []
for i, r in enumerate(results):
    datasets.append(r)

In [4]:
mongo_string_search = {'$and': [{'label': 'method'}, {'paper_id': {'$exists': True}}]}                
results = db.entities.find(mongo_string_search)
methods = []
for i, r in enumerate(results):
    methods.append(r)

In [6]:
ds_names = []
for ds in datasets:
    ds_names.append(ds['filtered_words'])
ds_count = Counter(ds_names)

In [7]:
ds_count.most_common(15)

[('TREC', 1792),
 ('VLDB', 1336),
 ('Microsoft', 676),
 ('PageRank', 486),
 ('SIGIR', 443),
 ('BM25', 384),
 ('Intel', 332),
 ('Lucene', 331),
 ('WordNet', 327),
 ('Wikipedia', 310),
 ('Jaccard', 297),
 ('Google', 287),
 ('XML', 278),
 ('SPARQL', 275),
 ('XPath', 265)]

In [8]:
ds_names = []
for ds in datasets:
    if ds['dssimilarity'] > ds['mtsimilarity']:
        ds_names.append(ds['filtered_words'])
ds_count = Counter(ds_names)

In [144]:
ds_count.most_common(30)

[('TREC', 1792),
 ('VLDB', 1336),
 ('Microsoft', 676),
 ('SIGIR', 443),
 ('Intel', 332),
 ('Lucene', 331),
 ('WordNet', 327),
 ('Wikipedia', 310),
 ('Google', 287),
 ('XML', 278),
 ('SPARQL', 275),
 ('RDBMS', 252),
 ('JavaScript', 241),
 ('URIs', 206),
 ('DBLP', 201),
 ('OLAP', 192),
 ('Twitter', 189),
 ('WHERE', 171),
 ('DEMONSTRATION', 158),
 ('DBpedia', 147),
 ('NIST', 144),
 ('XSLT', 138),
 ('PostgreSQL', 132),
 ('OLTP', 130),
 ('ASCII', 129),
 ('Hadoop', 126),
 ('IEEE', 122),
 ('PubMed', 118),
 ('SMART', 115),
 ('DBMSs', 114)]

In [10]:
mt_names = []
for mt in methods:
    mt_names.append(mt['filtered_words'])
mt_count = Counter(mt_names)

In [11]:
mt_count.most_common(15)

[('TREC', 1917),
 ('HTML', 875),
 ('VLDB', 795),
 ('URLs', 499),
 ('International Wide World', 463),
 ('DBMS', 430),
 ('HTTP', 384),
 ('Web', 340),
 ('National Foundation Science', 331),
 ('Java', 306),
 ('Information  Retrieval', 298),
 ('INTRODUCTION', 284),
 ('Wikipedia', 277),
 ('NIST', 276),
 ('SPARQL', 267)]

In [14]:
mt_names = []
for mt in methods:
    if mt['mtsimilarity'] > mt['dssimilarity']:
        mt_names.append(mt['filtered_words'])
mt_count = Counter(mt_names)

In [146]:
mt_count.most_common(50)

[('DBMS', 430),
 ('Information  Retrieval', 298),
 ('INTRODUCTION', 284),
 ('Boolean', 223),
 ('PageRank', 202),
 ('learning', 159),
 ('NDCG', 157),
 ('MapReduce', 149),
 ('Jaccard', 145),
 ('Gaussian', 136),
 ('Information Retrieval', 115),
 ('retrieval information', 113),
 ('XPath', 113),
 ('Vector Machines Support', 112),
 ('Dirichlet', 109),
 ('Vector Machine Support', 109),
 ('Bayesian', 101),
 ('XQuery', 97),
 ('Okapi', 84),
 ('Information Intelligent Retrieval', 83),
 ('Vector Model Space', 82),
 ('Cartesian', 77),
 (' retrieval information', 76),
 ('CPUs', 75),
 ('Markov', 74),
 ('machine vector support', 73),
 ('NULL', 72),
 ('machines vector support', 71),
 ('function likelihood', 65),
 ('Euclidean', 64),
 ('HARD', 55),
 ('SVMs', 54),
 ('retrieval  information', 53),
 ('Advancement', 51),
 ('Poisson', 51),
 ('RMSE', 50),
 ('FIFO', 50),
 ('PRELIMINARIES', 49),
 ('Pearson', 49),
 ('Algorithm', 49),
 ('retrieval In information', 48),
 ('Laplacian', 45),
 ('ANOVA', 45),
 ('MBRs',

# ElasticSearch 

In [2]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [3]:
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200}], timeout=30, max_retries=10, retry_on_timeout=True
)

In [598]:
# es.cluster.health()

In [12]:
res = es.search(index = "ir_full", body = {"query": {"match": {"content" : "imagenet"}}}, size = 5)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_source']['title'])
    print(hit['_source']['journal'])

Got 32 Hits:
conf_icra_SunF16 NEOL: Toward Never-Ending Object Learning for robots.
ICRA
conf_icra_GoehringHRSD14 Interactive adaptation of real-time object detectors.
ICRA
journals_ml_MesnilBWCB14 Learning semantic representations of objects and their parts.
Machine Learning
conf_icdm_ZhuangLJXLH15 Representation Learning via Semi-Supervised Autoencoder for Multi-task Learning.
ICDM
journals_ml_WestonBU10 Large scale image annotation: learning to rank with joint word-image embeddings.
Machine Learning


In [21]:
res = es.search(index = "ir", body = {}, size = 5)

print("Got %d Hits:" % res['hits']['total'])

Got 96939 Hits:


In [513]:
res = es.search(index = "ir", body = 
                {
                    "query" : {
                        "bool": {
                            "must": [{"match_phrase" : {"text" : 'latent semantic indexing'}},
                                    {"match_phrase" : {"text" : 'twitter'}}]
                        }
                    }
                }, size = 5)


print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_score'], hit['_source']['title'])
#     print(hit['_source']['text'])
    print('')

Got 4 Hits:
conf_trec_MiyanishiOLSU11 15.950895 TREC 2011 Microblog Track Experiments at Kobe University.

conf_icwsm_HuTK13 14.513167 Dude, srsly?: The Surprisingly Formal Nature of Twitter's Language.

conf_sigir_ChaC12 13.6760025 Social-network analysis using topic models.

conf_sigir_ChaBHC13 13.222706 Incorporating popularity in topic models for social network analysis.



In [628]:
query = 'image classification, imagenet' 
query = 'support vector machine, trec' 
query = 'latent semantic indexing, twitter'
# query = 'crispr cancer gene'

In [632]:
queries = ['image classification, imagenet', 'support vector machine, trec', 
           'latent semantic indexing, twitter', 'word embedding, wikipedia']

In [636]:
for query in queries:
    terms = [x.strip() for x in query.split(',')]
    if len(terms) == 2:
        doc = { "query" : { "bool": { "must": [{"match_phrase" : {"text" : terms[0]}},
                                               {"match_phrase" : {"text" : terms[1]}}]}}}

        doc = json.dumps(doc)
        print(terms)
        res = es.search(index = "ir", body = doc, size = 5)
        print("Got %d Hits:" % res['hits']['total'])
        for hit in res['hits']['hits']:
            print(hit['_id'], hit['_score'], hit['_source']['title'])
    #         print(hit['_source']['text'])
    #         print('')

    print('')

    terms = query.replace(',', '')
    doc = { "query" : { "bool": { "must": [{"match" : {"text" : terms}}]}}}
    doc = json.dumps(doc)
    res = es.search(index = "ir", body = doc, size = 5)
    print(terms)
    print("Got %d Hits:" % res['hits']['total'])
    for hit in res['hits']['hits']:
        print(hit['_id'], hit['_score'], hit['_source']['title'])
    #         print(hit['_source']['text'])
    #     print('')
    print('_'*50)
    print('')

['image classification', 'imagenet']
Got 1 Hits:
conf_sigir_PanYMLNR14 10.419529 Click-through-based cross-view learning for image search.

image classification imagenet
Got 5048 Hits:
conf_sigir_PanYMLNR14 13.019814 Click-through-based cross-view learning for image search.
conf_www_FuMYLR15 12.191383 Tagging Personal Photos with Transfer Deep Learning.
conf_www_MaekawaHN06 8.419228 Image classification for mobile web browsing.
conf_sigir_SrikanthVBM05 8.315823 Exploiting ontologies for automatic image annotation.
conf_sigir_DrewL01 8.30496 Construction of a Hierarchical Classifier Schema Using a Combination of Text-Based and Image-Based Approaches.
__________________________________________________

['support vector machine', 'trec']
Got 45 Hits:
conf_trec_SiK05 12.870042 Thresholding Strategies for Text Classifiers: TREC 2005 Biomedical Triage Task Experiments.
conf_trec_LeeKA02 12.056616 TREC 11 Experiments at NII: The Effects of Virtual Relevant Documents in Batch Filtering.
conf_t

In [211]:
res = es.search(index = "ir", body = {"query": {"common": { 
    "text": {"query": query, "cutoff_frequency": 0.01}}}}, 
    size = 5)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_score'], hit['_source']['title'])

Got 5 Hits:
conf_trec_MiyanishiOLSU11 18.708591 TREC 2011 Microblog Track Experiments at Kobe University.
conf_sigir_ChaC12 16.128004 Social-network analysis using topic models.
conf_sigir_ChaBHC13 16.004646 Incorporating popularity in topic models for social network analysis.
conf_icwsm_HuTK13 14.513167 Dude, srsly?: The Surprisingly Formal Nature of Twitter's Language.
conf_jcdl_SalahEldeenN15 14.172043 Predicting Temporal Intention in Resource Sharing.


In [212]:
res = es.search(index = "ir", body = {"query": { "multi_match" : {
    "query" : query, "fields" : [ "title^3", "abstract^2", "text"], 
    "type" : "best_fields", "tie_breaker": 0.2}}}, size = 5)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_score'], hit['_source']['title'])

Got 5158 Hits:
conf_sigir_WangXLC11 62.174545 Regularized latent semantic indexing.
conf_sigir_Hofmann99 60.32915 Probabilistic Latent Semantic Indexing.
conf_sigir_Efron07 56.72454 Model-averaged latent semantic indexing.
conf_sigir_YuYT05 54.89106 Multi-label informed latent semantic indexing.
conf_trec_GarronK11 51.606586 Latent Semantic Indexing with selective Query Expansion.


In [213]:
res = es.search(index = "ir", body = {"query": { "multi_match" : {
    "query" : query, "fields" : [ "title^3", "abstract^2", "text"],
    "type" : "most_fields"}}}, size = 5)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_score'], hit['_source']['title'])

Got 5158 Hits:
conf_sigir_WangXLC11 73.06489 Regularized latent semantic indexing.
conf_sigir_Hofmann99 72.1853 Probabilistic Latent Semantic Indexing.
conf_sigir_YuYT05 66.35928 Multi-label informed latent semantic indexing.
conf_sigir_Efron07 66.26198 Model-averaged latent semantic indexing.
conf_sigir_Ding99 61.74302 A Similarity-based Probability Model for Latent Semantic Indexing.


In [214]:
res = es.search(index = "ir", body = {"query": { "multi_match" : {
    "query" : query, "fields" : [ "title", "abstract", "text"], 
    "type" : "most_fields", "cutoff_frequency": 0.01}}}, size = 5)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_score'], hit['_source']['title'])

Got 314 Hits:
conf_sigir_WangXLC11 19.817318 Regularized latent semantic indexing.
conf_sigir_Hofmann99 19.121704 Probabilistic Latent Semantic Indexing.
conf_trec_MiyanishiOLSU11 18.708591 TREC 2011 Microblog Track Experiments at Kobe University.
conf_sigir_Efron07 18.113392 Model-averaged latent semantic indexing.
conf_sigir_YuYT05 17.341335 Multi-label informed latent semantic indexing.


In [217]:
res = es.search(index = "ir", body = {"query": {"query_string" : {
    "default_field" : "text", "default_operator" : "AND",
    "query" : "('latent semantic indexing') AND (twitter)"}}}, size = 5)

print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_score'], hit['_source']['title'])

Got 5 Hits:
conf_trec_MiyanishiOLSU11 18.708591 TREC 2011 Microblog Track Experiments at Kobe University.
conf_sigir_ChaC12 16.128004 Social-network analysis using topic models.
conf_sigir_ChaBHC13 16.004646 Incorporating popularity in topic models for social network analysis.
conf_icwsm_HuTK13 14.513167 Dude, srsly?: The Surprisingly Formal Nature of Twitter's Language.
conf_jcdl_SalahEldeenN15 14.172043 Predicting Temporal Intention in Resource Sharing.


# Lemmatize

In [305]:
import nltk
from pymongo import MongoClient
from config import mongoDB_Port
from nltk.stem import WordNetLemmatizer, SnowballStemmer, LancasterStemmer
import string
from collections import Counter

nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer("english")
tr = str.maketrans("", "", string.punctuation)

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [1086]:
client = MongoClient('localhost:' + str(mongoDB_Port))
db = client.pub
pub = client.pub.publications
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200}], timeout=30, max_retries=10, retry_on_timeout=True
)
es.cluster.health(wait_for_status='yellow', request_timeout=1)
allentities = db.entities.find()
words = []
stems = []
lemmas = []
cleans = []
x = 0
for rr in allentities:
    word = rr['word'].lower()
    lemma  = wordnet_lemmatizer.lemmatize(word)
    stem = snowball_stemmer.stem(word)
    no_punkt = word.translate(tr)
    clean = ''.join([i for i in no_punkt if not i.isdigit()])
    lower = rr['word'].lower()
    
    db.entities.update_one({'_id' : rr['_id']}, 
                           {"$set" : {'lemma': lemma, 'stem': stem, 
                                      'clean': clean, 'no_punkt': no_punkt,
                                      'word_lower' : lower}}, upsert = False)
    x = x + 1
    if x % 10000 == 0:
        print(x)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000


In [304]:
print(len(words), len(set(words)), len(set(stems)), len(set(lemmas)), len(set(cleans)))

165051 66864 61077 66864 59933


In [318]:
word_count = Counter(words)

In [758]:
client = MongoClient('localhost:' + str(mongoDB_Port))
db = client.pub
pub = client.pub.publications

allentities = db.entities.find()

for rr in allentities:
    try:
        ann = rr['Annotator']
        print(ann)
    except:
        db.entities.update_one({'_id' : rr['_id']}, {"$set" : {'Annotator' : 'undefined'}}, upsert = False)
        pass

method
other
method
method
method
other
method
other
other
other
method
method
method
method
method
dataset
method
method
dataset
other
method
method
method
other
method
method
other
method
method
method
method
method
method
method
method
method
method
method
noise
method
other
other
method
method
method
method
method
method
other
method
method
other
other
method
method
other
method
method
method
method
method
method
method
other
other
method
method
other
other
method
method
method
other
other
method
method
method
other
other
method
method
method
method
other
other
method
method
other
method
method
method
method
method
method
method
other
method
other
method
method
other
other
other
method
method
method
method
method
other
other
method
method
method
method
other
other
other
other
other
other
other
other
other
other
other
method
other
method
method
method
method
method
other
other
method
method
dataset
method
method
other
method
method
method
method
other
other
other
other
other
method


In [1118]:
allentities = db.entities.find({'word_lower': {'$regex' : '.*idf-.*'} , 'Annotator': 'undefined' })
# allentities = db.entities.find({'word_lower': 'header'})

for rr in allentities:
#     db.entities.update_one({'_id' : rr['_id']}, {"$set" : {'word' : 'Spoken Document Retrieval'}}, upsert = False)
    db.entities.update_one({'_id' : rr['_id']}, {"$set" : {'Annotator' : 'method'}}, upsert = False)

# Popular Entities

In [984]:
query = 'ClueWeb09'
id_list = ['conf_trec_PamarthiZB09', 'conf_sigir_PotthastHSGMTW12', 'conf_trec_SmuckerCC09', 'conf_trec_LiXGGYLC12', 'conf_trec_PrasetyaTAM11', 'conf_trec_SerdyukovV09', 'conf_sigir_LeeC13', 'conf_trec_ZhengFW10', 'conf_trec_AlbakourKNF10']
# id_list = ['journals_ml_Gonzalez-CastanoGM04', 'conf_icdm_LuoHGR05', 'conf_icdm_DomeniconiG01', 'journals_ml_FungM05', 'journals_ml_GestelSBVVDMV04', 'conf_sigir_GreevyS04', 'conf_icdm_ChenWD02', 'journals_jmlr_ChenWYZ04', 'conf_icdm_CaiNHD11', 'conf_icarcv_NguyenVHM14', 'conf_icse_BrunE04', 'conf_icarcv_LiuW08', 'conf_ercimdl_BeelGSF10', 'conf_trec_LeeKA02', 'conf_icarcv_WongW04', 'journals_pvldb_GrosseLWFL11', 'conf_icarcv_YangHXL06']
id_list = ['conf_sigir_VosoughiVR16', 'conf_icdm_YuanWX16', 'conf_icra_JainSKSS16', 'conf_icdm_LiuSDTFFYWLZ16', 'conf_icra_LubarskyWWBN16', 'conf_trec_Savenkov15', 'conf_sigir_YanSW16', 'conf_sigir_LeeALHL16', 'conf_icra_OtteWSZ16']


In [638]:
terms_in_results = []

for paper in id_list:
    _query_terms = {
        'query': {
            'match': {
                'paper_id': paper
            }
        }
    }
    
    _query_terms = es.search(index = "surfall_entities", doc_type = "entities", body = _query_terms)
    for doc in _query_terms['hits']['hits']:
        terms_in_results.append(doc['_source']['clean'])

terms_in_results = list(set(terms_in_results))
len(terms_in_results)

47

In [639]:
import operator

entity_occurrences = {}

for entity in terms_in_results:
    _query_occurrences = {
        'query': {
            'match_phrase': {
                'content': entity
            }
        }
    }
    
    _query_occurrences = es.search(index = "surfall", doc_type = "pubs", body = _query_occurrences)
    entity_occurrences[entity] = _query_occurrences['hits']['total']

sorted_occurences = sorted(entity_occurrences.items(), key=operator.itemgetter(1), reverse=True)

# Upcoming Entities

In [640]:
terms_in_results = []

for paper in id_list:
    _query_terms = {
        'query': {
            'match': {
                'paper_id': paper
            }
        }
    }
    
    _query_terms = es.search(index = "surfall_entities", doc_type = "entities", body = _query_terms)
    for doc in _query_terms['hits']['hits']:
        terms_in_results.append(doc['_source']['clean'])

terms_in_results = list(set(terms_in_results))
len(terms_in_results)

47

In [805]:
def popular_upcoming_entities(paper_id_list):
    terms_in_results = []

    for paper in paper_id_list:
        _query_terms = {
            'query': {
                'match': {
                    'paper_id': paper
                }
            }
        }

        _query_terms = es.search(index = "surfall_entities", doc_type = "entities", body = _query_terms)
        for doc in _query_terms['hits']['hits']:
            terms_in_results.append(doc['_source']['clean'])

    terms_in_results = list(set(terms_in_results))
    
    upcoming = []
    entity_occurrences = {}
    
    for entity in terms_in_results:
        _query_occurrences = {
            'query': {
                'match_phrase': {
                    'content': entity
                }
            }
        }

        _query_occurrences = es.search(index = "surfall", doc_type = "pubs", body = _query_occurrences)
        years = []
        entity_occurrences[entity] = _query_occurrences['hits']['total']
        
        for doc in _query_occurrences['hits']['hits']:
            years.append(doc['_source']['year'])
        if years:    
            if int(min(years)) > 2010:
                upcoming.append(entity)

    sorted_occurrences = sorted(entity_occurrences.items(), key=operator.itemgetter(1), reverse=True)
    
    return upcoming, sorted_occurrences

In [1037]:
def popular_upcoming_entities(paper_id_list):
    
    terms_in_results = []
    terms_labels = {}
    entity_occurrences = {}
    upcoming_occurrences = {}
    
    for paper in paper_id_list:
        _query_terms = {
            "query" : {
                "bool": {
                    "must": [{"match" : {'paper_id': paper}}],
                    "should": [{"match" : {"annotator" : 'method'}},
                               {"match" : {"annotator" : 'dataset'}}]
                }
            }
        }

        _query_terms = es.search(index = "surfall_entities", doc_type = "entities", body = _query_terms)
        
        for hit in _query_terms['hits']['hits']:
            entity = hit['_source']['lower']
            terms_in_results.append(entity)
            terms_labels[entity] = hit

    terms_in_results = list(set(terms_in_results))
    
    for entity in terms_in_results:
        _query_occurrences = {
            'query': {
                'match_phrase': {
                    'content': entity
                }
            }
        }

        _query_occurrences = es.search(index = "surfall", doc_type = "pubs", body = _query_occurrences)
        years = []
        entity_occurrences[entity] = _query_occurrences['hits']['total']
        
        for doc in _query_occurrences['hits']['hits']:
            years.append(doc['_source']['year'])
        if years:    
            if int(min(years)) > 2010:
                upcoming_occurrences[entity] = _query_occurrences['hits']['total']

    sorted_occurrences = sorted(entity_occurrences.items(), key=operator.itemgetter(1), reverse=True)
    sorted_upcoming = sorted(upcoming_occurrences.items(), key=operator.itemgetter(1), reverse=True)
    
    triples = []
    for pair in sorted_occurrences:
        amount = pair[1]
        term = pair[0]
        
        if terms_labels[term]['_source']['annotator'] in ['method', 'dataset']:
            actual_label = terms_labels[term]['_source']['annotator']
        elif terms_labels[term]['_source']['annotator'] in ['other', 'noise', 'software']:
            continue
        elif terms_labels[term]['_source']['mt_similarity'] > terms_labels[term]['_source']['ds_similarity']:
            actual_label = 'method'
        else:
            actual_label = 'dataset'
        triples.append([term, actual_label, amount])
    
    return sorted_occurrences, sorted_upcoming, terms_labels, triples

In [1083]:
query = 'ClueWeb09'
id_list = ['conf_trec_PamarthiZB09', 'conf_sigir_PotthastHSGMTW12', 'conf_trec_SmuckerCC09', 'conf_trec_LiXGGYLC12', 'conf_trec_PrasetyaTAM11', 'conf_trec_SerdyukovV09', 'conf_sigir_LeeC13', 'conf_trec_ZhengFW10', 'conf_trec_AlbakourKNF10']
# id_list = ['journals_ml_Gonzalez-CastanoGM04', 'conf_icdm_LuoHGR05', 'conf_icdm_DomeniconiG01', 'journals_ml_FungM05', 'journals_ml_GestelSBVVDMV04', 'conf_sigir_GreevyS04', 'conf_icdm_ChenWD02', 'journals_jmlr_ChenWYZ04', 'conf_icdm_CaiNHD11', 'conf_icarcv_NguyenVHM14', 'conf_icse_BrunE04', 'conf_icarcv_LiuW08', 'conf_ercimdl_BeelGSF10', 'conf_trec_LeeKA02', 'conf_icarcv_WongW04', 'journals_pvldb_GrosseLWFL11', 'conf_icarcv_YangHXL06']
# id_list = ['conf_sigir_VosoughiVR16', 'conf_icdm_YuanWX16', 'conf_icra_JainSKSS16', 'conf_icdm_LiuSDTFFYWLZ16', 'conf_icra_LubarskyWWBN16', 'conf_trec_Savenkov15', 'conf_sigir_YanSW16', 'conf_sigir_LeeALHL16', 'conf_icra_OtteWSZ16']
id_list = ['conf_icdm_LiZWZ11', 'conf_trec_HauptmannYQJCDCBLN02', 'conf_icra_BargotiU16', 'conf_incdm_HijaziCZ10', 'conf_www_MaekawaHN06', 'conf_icdm_ChenCHM13', 'conf_icarcv_LowM10', 'conf_icarcv_MoghadamWM10', 'conf_icdm_KannanTRK11']
id_list = ['conf_icwsm_HuttoG14', 'conf_icwsm_VargasMMO16', 'conf_icwsm_AhnGDM10', 'conf_www_HuTGL13', 'conf_esws_SaifFHA14', 'conf_icwsm_MahmudG14', 'conf_icwsm_Dang-XuanS12', 'conf_icwsm_HongS10', 'conf_icwsm_RieisSMPKA15']
id_list = ['conf_icra_OzawaC11', 'conf_icra_GarciaD03', 'conf_www_WangK15', 'conf_icra_RamanK14', 'journals_ml_OshersonSW92', 'conf_icra_HareGW15', 'conf_icarcv_Rios-BolivarAM06', 'conf_sigir_YangMXTL16', 'conf_icarcv_WakwellaAK04']


In [1087]:
popular, new, labels, triple_list = popular_upcoming_entities(id_list)
triple_list

[['sigir', 'dataset', 799],
 ['enron', 'dataset', 100],
 ['density function the', 'method', 36],
 ['nips', 'dataset', 34],
 ['stage ii ,', 'method', 28],
 ['cctv', 'dataset', 18],
 ['detector based on', 'method', 17],
 ['visual sensors with', 'method', 3],
 ['events ground truth', 'dataset', 1],
 ['social information fusion', 'method', 1],
 ['cmage', 'method', 1],
 ['concept detection to', 'method', 1]]

In [922]:
[term[0] for term in popular if labels[term[0]] in ['method', 'dataset']]

[]

In [947]:
[term[0] for term in new[0:5]]

['chatnoir',
 'example entities entity',
 'universitas indonesia trec',
 'golaxydt2 retrieval models']