In [1]:
import pickle
import pandas as pd
from math import log
import re
import nltk
nltk.download(['punkt','stopwords','wordnet','words'])
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import metapy

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cmejia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cmejia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cmejia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\cmejia\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


# Cargar el modelo de datos

In [2]:
# Load the persisted data model and unpack the pieces used by the rest of the notebook.
# NOTE(security): pickle.load can execute arbitrary code; only load a file you trust.
with open('estructuraDatos.sav', 'rb') as model_file:  # context manager closes the handle
    loaded_model = pickle.load(model_file)
idexFiles = loaded_model['idexFiles']    # indexed by matrix row position to obtain a document's file name
vectorizer = loaded_model['vectorizer']  # its vocabulary_ maps each term to a matrix column
matrix = loaded_model['matriz']          # document-term matrix: one row per document, one column per term
indexMeta = loaded_model['metapyIndex']  # indexed by the first element of each metapy result to obtain a file name

# Ranking solo por conteo, sin índice invertido

In [3]:
# Find the documents that contain a particular word
def encontrarDoc(palabra):
    """Rank the documents containing ``palabra`` by raw term frequency.

    Reads the module-level ``vectorizer``, ``matrix`` and ``idexFiles``.

    Parameters
    ----------
    palabra : str
        Term to look up; it must be a key of ``vectorizer.vocabulary_``.

    Returns
    -------
    pandas.DataFrame
        Columns ``NombreArchivo`` and ``Frecuencia``, sorted by ``Frecuencia``
        in descending order and indexed 0..n-1. The frame is empty (but still
        has both columns) when no document contains the term.

    Raises
    ------
    KeyError
        If ``palabra`` is not in the vocabulary.
    """
    col = vectorizer.vocabulary_[palabra]
    matx = matrix[:, col]
    filas = matx.nonzero()[0].tolist()
    # The i-th non-zero row is paired with matx.data[i].
    # NOTE(review): this assumes the stored values follow row order and contain
    # no explicit zeros -- confirm for the matrix format in use.
    registros = [
        {'NombreArchivo': idexFiles[fila], 'Frecuencia': matx.data[i]}
        for i, fila in enumerate(filas)
    ]
    # Build the frame once (no per-row concat) with explicit columns, so that
    # sorting also works when nothing matched.
    dfresult = pd.DataFrame(registros, columns=['NombreArchivo', 'Frecuencia'])
    return dfresult.sort_values('Frecuencia', ascending=False).reset_index(drop=True)

# Construcción del índice invertido

In [4]:
def indice_invertido(dic):
    """Build an inverted index for every term in ``dic``.

    Reads the module-level ``matrix`` (document-term counts) and ``idexFiles``
    (row position -> file name).

    Parameters
    ----------
    dic : dict
        Maps each term to its column in ``matrix`` (e.g.
        ``vectorizer.vocabulary_``).

    Returns
    -------
    dict
        ``term -> {'IDF': float, 'Documentos': {file_name: [tf, doc_len]}}``.
        A term that occurs in no document is printed, gets an empty
        ``'Documentos'`` dict and has no ``'IDF'`` key.
    """
    inv = {}
    N = matrix.shape[0]
    # Row -> total term count of that document. Cached so each document row is
    # sliced and summed once instead of once per (term, document) pair.
    longitudes = {}
    for k, v in dic.items():
        entrada = inv.setdefault(k, {})
        # Column holding the counts of term k in every document
        matx = matrix[:, v]
        # Rows (documents) in which the term occurs
        lista = matx.nonzero()[0].tolist()
        docs = {}
        if len(lista) == 0:
            print(k)
        else:
            # IDF; len(lista) is the number of documents containing the term
            entrada['IDF'] = log((N + 1) / (len(lista)))
            for i, fila in enumerate(lista):
                if fila not in longitudes:
                    # Summing a document's stored counts gives its total length
                    longitudes[fila] = matrix[fila, :].data.sum()
                keys = docs.setdefault(idexFiles[fila], [])
                # Frequency of term k in document `fila`
                keys.append(matx.data[i])
                # Total number of terms in document `fila`
                keys.append(longitudes[fila])
        entrada['Documentos'] = docs
    return inv

In [5]:
# Build the inverted index for the whole vocabulary; terms that occur in no document are printed.
ind_inv = indice_invertido(vectorizer.vocabulary_)

across
all
almost
along
also
although
among
amongst
amount
and
anyhow
anyway
around
back
be
beforehand
behind
between
beyond
bill
both
bottom
call
can
cannot
cant
con
could
de
detail
do
eight
either
eleven
enough
even
except
fifteen
fill
find
fire
first
five
former
found
front
full
further
get
give
go
have
here
herein
how
in
interest
it
keep
last
latter
least
less
might
mine
move
must
name
neither
never
nevertheless
next
nor
nothing
off
often
one
onto
other
out
over
own
part
per
put
rather
same
see
seem
serious
show
side
somehow
still
system
take
ten
then
therein
these
thick
thin
third
though
three
throughout
thru
top
toward
two
under
upon
well
where
wherein
whereupon
whether
whoever
whole
whose
with
within
without
would


# Función de limpieza del query

In [6]:
stopWords = stopwords.words('english')
def queryClean(texto):
    """Normalise a free-text query into a list of stemmed tokens.

    Steps: lower-case, replace the accented vowels listed below with their
    plain form, turn every character that is not an ASCII letter into a space,
    split on whitespace, drop one-letter tokens and English stop words,
    lemmatise each token as a verb and finally apply the Porter stemmer.

    Parameters
    ----------
    texto : str
        Raw query text.

    Returns
    -------
    list of str
        Cleaned tokens, in their original order.
    """
    texto = texto.lower()
    # Accented vowels handled explicitly: (pattern, plain replacement)
    sustituciones = (
        ('(á|à|ä)', 'a'),
        ('(é|è|ë)', 'e'),
        ('(í|ì|ï)', 'i'),
        ('(ó|ò|ö)', 'o'),
        ('(ú|ù|ü)', 'u'),
        ('[^a-zA-Z]', ' '),  # anything that is not an ASCII letter becomes a space
        (' +', ' '),         # collapse runs of spaces
    )
    for patron, reemplazo in sustituciones:
        texto = re.sub(patron, reemplazo, texto)

    lematizador = WordNetLemmatizer()
    stemmer = PorterStemmer()
    limpio = []
    for palabra in texto.split():
        if len(palabra) > 1 and palabra.isalpha() and palabra not in stopWords:
            # Lemmatise as a verb first, then stem the lemma
            limpio.append(stemmer.stem(lematizador.lemmatize(palabra, pos = "v")))
    return limpio

# Ranking por Term Frequency

In [7]:
def queryTF(word,top):
    """Return the ``top`` documents for ``word`` ranked by term frequency.

    Looks ``word`` up in the module-level ``ind_inv``. Each result is a
    ``(file_name, [tf, doc_len])`` pair; pairs are ordered from highest to
    lowest by that list first and by file name second.
    """
    def clave(par):
        # Order by the [tf, doc_len] list first, then by file name
        return (par[1], par[0])

    candidatos = list(ind_inv[word]['Documentos'].items())
    candidatos.sort(key = clave, reverse = True)
    return candidatos[:top]

# Ranking por Term Frequency / Doc Length

In [8]:
def queryTFDL(word,top):
    """Return the ``top`` documents for ``word`` ranked by tf / document length.

    Looks ``word`` up in the module-level ``ind_inv``. Each result is a
    ``(file_name, [tf / doc_len])`` pair, ordered from highest to lowest ratio
    with the file name as tie-breaker.
    """
    normalizadas = {
        nombre: [valores[0] / valores[1]]
        for nombre, valores in ind_inv[word]['Documentos'].items()
    }
    ordenadas = sorted(normalizadas.items(), key = lambda par: (par[1], par[0]), reverse = True)
    return ordenadas[:top]

# Ranking usando BM25

In [9]:
def cal_bm25(idf,frec,k,b,length,avgdl):
    """Compute the BM25 contribution of one term in one document.

    Parameters
    ----------
    idf : float
        Inverse document frequency of the term.
    frec : float
        Frequency of the term in the document.
    k : float
        Term-frequency saturation parameter.
    b : float
        Length-normalisation parameter.
    length : float
        Length of the document.
    avgdl : float
        Average document length.

    Returns
    -------
    float
        ``idf * frec * (k + 1) / (frec + k * (1 - b + b * length / avgdl))``.
    """
    normalizacion = 1-b+b*length/avgdl
    numerador = frec*(k+1)
    denominador = frec+k*normalizacion
    return idf*(numerador/denominador)

In [10]:
def queryBM25(query, vocabulary, prom, k1, b, top):
    """Rank documents for a free-text query with BM25 over ``ind_inv``.

    The query is cleaned with ``queryClean``. For every resulting term found
    in ``vocabulary``, the BM25 score of each document containing it is
    computed with ``cal_bm25``, and the scores are summed per document.

    Parameters
    ----------
    query : str
        Raw query text.
    vocabulary : container of str
        Known terms; a query term not in it is reported on stdout and skipped.
    prom : float
        Passed to ``cal_bm25`` as the average document length.
    k1, b : float
        BM25 parameters passed to ``cal_bm25``.
    top : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Ranking`` (0 = best), ``NombreArchivo`` and ``BM25``, sorted
        by ``BM25`` in descending order. The frame is empty, with those
        columns, when no query term matches any document.
    """
    columnas = ['Ranking', 'NombreArchivo', 'BM25']
    registros = []
    for word in queryClean(query):
        if word not in vocabulary:
            print(f'{word} is not in the vocabulary')
            continue
        entrada = ind_inv[word]
        IDF = entrada.get('IDF')
        if IDF is None:
            # The term is in the vocabulary but occurs in no document, so the
            # index stores no IDF for it and it cannot contribute to any score.
            continue
        for ruta, v in entrada['Documentos'].items():
            # v[0] is the term frequency, v[1] the document length
            registros.append({
                'NombreArchivo': ruta.split('\\')[-1],  # keep only the file name of a backslash-separated path
                'Word': word,
                'BM25': cal_bm25(IDF, v[0], k1, b, v[1], prom),
            })

    if not registros:
        return pd.DataFrame(columns=columnas)

    # Aggregate once, after all terms were scored, instead of once per term
    resultadoBm25 = (
        pd.DataFrame(registros, columns=['NombreArchivo', 'Word', 'BM25'])
        .groupby('NombreArchivo')
        .agg({'BM25': 'sum'})
        .sort_values('BM25', ascending=False)
        .reset_index()   # NombreArchivo back to a column, rows numbered by rank
        .reset_index()   # expose that rank as a column named 'index'
        .rename(columns={'index': 'Ranking'})
    )
    return resultadoBm25.head(top)

### Definición de parámetros para el BM25

In [11]:
# Number of results requested from each ranker
top = 20
# Hard-coded value passed to queryBM25 as the average document length.
# NOTE(review): presumably derived from `matrix` (total term count / number of documents);
# the value times 979 is a whole number -- confirm and compute it instead of hard-coding.
prom = 27544.226762002043
# BM25 parameters, shared by the custom ranker and the metapy ranker
k1 = 1.2
b = 0.75
# Term -> column mapping used by queryBM25 to test whether a query term is known
vocabulary = vectorizer.vocabulary_

# Metapy

In [12]:
# Obtain metapy's inverted index as configured in 'cranfield.toml'
# NOTE(review): the config file is named after Cranfield but the ranked files look like arXiv ids -- confirm it points at the intended corpus.
inv_idx = metapy.index.make_inverted_index('cranfield.toml')

In [13]:
# Summary statistics of the metapy index: document count, vocabulary size and average document length
print(f'Total de documentos: {inv_idx.num_docs()}')
print(f'Cantidad de palabras únicas: {inv_idx.unique_terms()}')
print(f'Promedio de longitud de los documentos: {inv_idx.avg_doc_length()}')

Total de documentos: 980
Cantidad de palabras únicas: 51229
Promedio de longitud de los documentos: 3984.62646484375


# Metapy Ranking

In [14]:
def rankerMeta(top, querywords):
    """Rank documents for ``querywords`` with metapy's Okapi BM25 ranker.

    Uses the module-level ``inv_idx``, ``indexMeta``, ``k1`` and ``b``.

    Parameters
    ----------
    top : int
        Number of results requested from the ranker.
    querywords : str
        Raw query text, set as the content of a metapy ``Document``.

    Returns
    -------
    pandas.DataFrame
        Columns ``RankingMeta`` (0 = best), ``NombreArchivo`` and
        ``BM25_Meta``, in the order returned by the ranker. The frame is empty,
        with those columns, when the ranker returns nothing.
    """
    ranker = metapy.index.OkapiBM25(k1 = k1, b = b)
    query = metapy.index.Document()
    query.content(querywords)  # the raw query text is the document content
    top_docs = ranker.score(inv_idx, query, num_results=top)
    # One single-row frame per result; doc[0] is looked up in indexMeta for the
    # file name and doc[1] is the score. They are concatenated once, below.
    filas = [
        pd.DataFrame({'NombreArchivo': indexMeta[doc[0]],  'BM25_Meta' : [doc[1]]})
        for doc in top_docs
    ]
    if filas:
        metaresult = pd.concat(filas)
    else:
        # Keep the columns callers merge on even when there are no results
        metaresult = pd.DataFrame(columns=['NombreArchivo', 'BM25_Meta'])
    metaresult = metaresult.reset_index(drop = True).reset_index()
    return metaresult.rename(columns = {'index':'RankingMeta'})

# Evaluación de los queries

In [15]:
def calculate_sens(queries,top):
    """Measure how much the custom BM25 ranking overlaps with metapy's.

    For each query, the score is the fraction of documents returned by
    ``queryBM25`` that also appear in the result of ``rankerMeta``.

    Uses the module-level ``vocabulary``, ``prom``, ``k1`` and ``b``.

    Parameters
    ----------
    queries : iterable of str
        Queries to evaluate.
    top : int
        Number of results requested from each ranker.

    Returns
    -------
    pandas.DataFrame
        One row per query with columns ``Query`` and ``Sensibilidad``, indexed
        0..n-1. ``Sensibilidad`` is NaN when the custom ranker returns nothing
        and 0.0 when only the metapy ranker returns nothing.
    """
    registros = []
    for query in queries:
        resultados = queryBM25(query, vocabulary, prom, k1, b,top)
        metares = rankerMeta(top, query)
        if resultados.empty:
            # Nothing retrieved: the overlap fraction is undefined
            sensibilidad = float('nan')
        elif metares.empty:
            # Nothing to overlap with
            sensibilidad = 0.0
        else:
            merget = resultados.merge(metares, how = 'left', on = 'NombreArchivo')
            # After the left merge, RankingMeta is non-null only for documents
            # that metapy also returned
            sensibilidad = merget['RankingMeta'].notna().mean()
        registros.append({'Query': query, 'Sensibilidad': sensibilidad})
    # Build the frame once so every row gets its own index label
    return pd.DataFrame(registros, columns=['Query', 'Sensibilidad'])

In [16]:
# Sample queries used to compare the custom BM25 ranking with metapy's ranking
queries = ["Data Science","Machine Learning", "Math","Computer Science","Algorithms in dynamic networks", "triangle free process"]
sensibilidad = calculate_sens(queries,top)

data is not in the vocabulary


In [17]:
# Per-query fraction of custom BM25 results that metapy also returned
sensibilidad

Unnamed: 0,Query,Sensibilidad
0,Data Science,0.7
0,Machine Learning,0.95
0,Math,0.75
0,Computer Science,0.65
0,Algorithms in dynamic networks,0.85
0,triangle free process,0.75


In [44]:
# Query explored in detail in the cells that follow
query = "Algorithms in dynamic networks"

In [45]:
# Rank the same query with the custom BM25 implementation and with metapy
resultados = queryBM25(query, vocabulary, prom, k1, b,top)
metares = rankerMeta(top, query)

In [46]:
# Side-by-side view: custom ranking on the left; metapy rank and score where the same file was returned
resultados.merge(metares, how = 'left', on = 'NombreArchivo')

Unnamed: 0,Ranking,NombreArchivo,BM25,RankingMeta,BM25_Meta
0,0,1412.4171.txt,4.843937,0.0,4.719563
1,1,1005.2894.txt,4.827652,1.0,4.667416
2,2,1511.02476.txt,4.806497,15.0,4.473639
3,3,1504.03957.txt,4.790875,4.0,4.589195
4,4,1204.1160.txt,4.782933,2.0,4.621671
5,5,1411.4097.txt,4.77674,6.0,4.559735
6,6,1502.04382.txt,4.76849,12.0,4.508224
7,7,1412.0291.txt,4.766188,16.0,4.472593
8,8,1502.02908.txt,4.76086,7.0,4.542059
9,9,1310.3389.txt,4.745049,3.0,4.603845


In [47]:
# Per-document table that includes the `cluster` and `name_file` columns used below
gruposbydoc = pd.read_csv('docByCluster.csv')

## Documentos cercanos

In [48]:
# Documents that belong to the selected cluster.
# NOTE(review): in the visible notebook `cluster` is first assigned in a later cell (In [53]);
# on a fresh top-to-bottom run this line raises NameError -- move it below that cell.
gruposbydoc[gruposbydoc['cluster']==cluster]

Unnamed: 0,identifier,title,description,subject,creator,combine_column,combine_cleaned,cuenta,cluster,name_file
8,http://arxiv.org/abs/0903.0197,Rotation Distance is Fixed-Parameter Tractable,Rotation distance between trees measures the...,Computer Science - Data Structures and Algorit...,"Cleary, Sean ; John, Katherine St. ;",Rotation Distance is Fixed-Parameter Tractable...,rotation distance fix parameter tractable rota...,48,4,0903.0197.txt
9,http://arxiv.org/abs/0903.0199,A Linear-Time Approximation Algorithm for Rota...,Rotation distance between rooted binary tree...,Computer Science - Data Structures and Algorit...,"Cleary, Sean ; John, Katherine St. ;",A Linear-Time Approximation Algorithm for Rota...,linear time approximation algorithm rotation d...,43,4,0903.0199.txt
17,http://arxiv.org/abs/0910.5577,On the stability of two-chunk file-sharing sys...,We consider five different peer-to-peer file...,Computer Science - Operating Systems ; Mathema...,"Norros, Ilkka ; Reittu, Hannu ; Eirola, Timo ;",On the stability of two-chunk file-sharing sys...,stability two chunk file share systems conside...,67,4,0910.5577.txt
34,http://arxiv.org/abs/1006.1029,Chi-square-based scoring function for categori...,Objectives: Text categorization has been use...,Computer Science - Information Retrieval ; Sta...,"Kastrin, Andrej ; Peterlin, Borut ; Hristovski...",Chi-square-based scoring function for categori...,chi square base score function categorization ...,166,4,1006.1029.txt
41,http://arxiv.org/abs/1012.4019,Constructing elliptic curve isogenies in quant...,Given two elliptic curves over a finite fiel...,Quantum Physics ; Computer Science - Computati...,"Childs, Andrew M. ; Jao, David ; Soukharev, Vl...",Constructing elliptic curve isogenies in quant...,construct elliptic curve isogenies quantum sub...,104,4,1012.4019.txt
43,http://arxiv.org/abs/1101.1169,Almost Settling the Hardness of Noncommutative...,"In this paper, we study the complexity of co...",Computer Science - Computational Complexity ;,"Chien, Steve ; Harsha, Prahladh ; Sinclair, Al...",Almost Settling the Hardness of Noncommutative...,almost settle hardness noncommutative determin...,145,4,1101.1169.txt
54,http://arxiv.org/abs/1104.0746,Quantifier Elimination over Finite Fields Usin...,We give an algebraic quantifier elimination ...,Computer Science - Symbolic Computation ; Comp...,"Gao, Sicun ; Platzer, André ; Clarke, Edmund M...",Quantifier Elimination over Finite Fields Usin...,quantifier elimination finite field use gr obn...,44,4,1104.0746.txt
57,http://arxiv.org/abs/1104.4987,An improved bound on the number of point-surfa...,We show that $m$ points and $n$ smooth algeb...,Mathematics - Combinatorics ; Computer Science...,"Zahl, Joshua ;",An improved bound on the number of point-surfa...,improve bind number point surface incidences t...,105,4,1104.4987.txt
61,http://arxiv.org/abs/1106.1445,From Classical to Quantum Shannon Theory,"The aim of this book is to develop ""from the...",Quantum Physics ; Computer Science - Informati...,"Wilde, Mark M. ;",From Classical to Quantum Shannon Theory The ...,classical quantum shannon theory aim book deve...,72,4,1106.1445.txt
67,http://arxiv.org/abs/1108.1915,Noise effects in the quantum search algorithm ...,We analyse the resilience of the quantum sea...,Quantum Physics ; Computer Science - Computati...,"Gawron, Piotr ; Klamka, Jerzy ; Winiarczyk, Ry...",Noise effects in the quantum search algorithm ...,noise effect quantum search algorithm computat...,42,4,1108.1915.txt


## Grupos de documentos

In [49]:
# Cluster assigned to each article: derive the file name from the last path
# segment of `identifier`, expose `prediction` as `clusterA` and keep only the
# two columns needed for the merge.
gruposdocsA_merge = (
    pd.read_csv('clustertable_articles.csv')
    .assign(name_file = lambda tabla: tabla['identifier'].apply(lambda x: x.split('/')[-1]+'.txt'))
    .rename(columns = {'prediction':'clusterA'})
    [['name_file','clusterA']]
    .copy()
)

In [50]:
# Main LDA topic of each article: derive the file name from the last path
# segment of `identifier` and keep only the two columns needed for the merge.
ldaA_merge = (
    pd.read_csv('ldaresults_articles.csv')
    .assign(name_file = lambda tabla: tabla['identifier'].apply(lambda x: x.split('/')[-1]+'.txt'))
    [['name_file','mainTopic']]
    .copy()
)

In [51]:
# Combine the per-document table with the article clusters and the LDA topics,
# keeping every file name that appears in any of the three tables.
totalDocs = (
    gruposbydoc
    .merge(gruposdocsA_merge, how = 'outer',on = 'name_file')
    .merge(ldaA_merge, how = 'outer',on = 'name_file')
)
totalDocs

Unnamed: 0,identifier,title,description,subject,creator,combine_column,combine_cleaned,cuenta,cluster,name_file,clusterA,mainTopic
0,http://arxiv.org/abs/0704.3504,Smooth R\'enyi Entropy of Ergodic Quantum Info...,We prove that the average smooth Renyi entro...,Quantum Physics ; Computer Science - Informati...,"Schoenmakers, Berry ; Tjoelker, Jilles ; Tuyls...",Smooth R\'enyi Entropy of Ergodic Quantum Info...,smooth enyi entropy ergodic quantum informatio...,34,11,0704.3504.txt,1,10
1,http://arxiv.org/abs/0706.1402,Analyzing Design Process and Experiments on th...,"In the field of tutoring systems, investigat...",Computer Science - Computers and Society ; Com...,"Brust, Matthias R. ; Rothkugel, Steffen ;",Analyzing Design Process and Experiments on th...,analyze design process experiment anita generi...,100,10,0706.1402.txt,1,6
2,http://arxiv.org/abs/0710.0736,Colour image segmentation by the vector-valued...,We propose a new method for the numerical so...,Computer Science - Computer Vision and Pattern...,"Kay, David A ; Tomasi, Alessandro ;",Colour image segmentation by the vector-valued...,colour image segmentation vector value allen c...,69,11,0710.0736.txt,1,1
3,http://arxiv.org/abs/0803.2570,Unequal Error Protection: An Information Theor...,An information theoretic framework for unequ...,Computer Science - Information Theory ; Comput...,"Borade, Shashi ; Nakiboglu, Baris ; Zheng, Liz...",Unequal Error Protection: An Information Theor...,unequal error protection information theoretic...,75,5,0803.2570.txt,1,19
4,http://arxiv.org/abs/0808.0084,On the hitting times of quantum versus random ...,In this paper we define new Monte Carlo type...,Quantum Physics ; Computer Science - Data Stru...,"Magniez, Frederic ; Nayak, Ashwin ; Richter, P...",On the hitting times of quantum versus random ...,hit time quantum versus random walk paper defi...,120,11,0808.0084.txt,1,8
5,http://arxiv.org/abs/0811.1254,Coding Theory and Algebraic Combinatorics,This chapter introduces and elaborates on th...,Mathematics - Combinatorics ; Computer Science...,"Huber, Michael ;",Coding Theory and Algebraic Combinatorics Thi...,cod theory algebraic combinatorics chapter int...,62,5,0811.1254.txt,1,17
6,http://arxiv.org/abs/0811.2853,Generating Random Networks Without Short Cycles,Random graph generation is an important tool...,Computer Science - Data Structures and Algorit...,"Bayati, Mohsen ; Montanari, Andrea ; Saberi, A...",Generating Random Networks Without Short Cycle...,generate random network without short cycle ra...,107,3,0811.2853.txt,1,16
7,http://arxiv.org/abs/0812.2709,Variations on a theme by Schalkwijk and Kailath,Schalkwijk and Kailath (1966) developed a cl...,Computer Science - Information Theory ;,"Gallager, Robert G. ; Nakiboglu, Baris ;",Variations on a theme by Schalkwijk and Kailat...,variations theme schalkwijk kailath schalkwijk...,99,5,0812.2709.txt,1,10
8,http://arxiv.org/abs/0903.0197,Rotation Distance is Fixed-Parameter Tractable,Rotation distance between trees measures the...,Computer Science - Data Structures and Algorit...,"Cleary, Sean ; John, Katherine St. ;",Rotation Distance is Fixed-Parameter Tractable...,rotation distance fix parameter tractable rota...,48,4,0903.0197.txt,1,17
9,http://arxiv.org/abs/0903.0199,A Linear-Time Approximation Algorithm for Rota...,Rotation distance between rooted binary tree...,Computer Science - Data Structures and Algorit...,"Cleary, Sean ; John, Katherine St. ;",A Linear-Time Approximation Algorithm for Rota...,linear time approximation algorithm rotation d...,43,4,0903.0199.txt,1,17


In [28]:
# Export the combined table to CSV without the index column.
# NOTE(review): this cell's execution count (28) is out of sequence with its neighbours -- re-run the notebook top to bottom before relying on the exported file.
totalDocs.to_csv('resultados3entrega.csv', index = False, decimal = '.')

## ¿Cuál documento quiere explorar más?
Indique la posición del documento en el ranking (0 = primer resultado).

In [52]:
# Zero-based position, within `resultados`, of the document to explore
documento = 0

In [53]:
# Look up the cluster and the main topic of the selected document.
archivo_elegido = resultados['NombreArchivo'].iloc[documento]
filas_elegidas = totalDocs[totalDocs['name_file']==archivo_elegido]
cluster = filas_elegidas['cluster'].values[0]
topic = filas_elegidas['mainTopic'].values[0]

# Documentos que hacen parte del mismo clúster y al mismo tópico

In [61]:
# Label used in the listings that follow: "<file name>-<title>".
# Vectorised concatenation instead of a row-wise apply: rows with a missing
# file name or title (possible after the outer merges) yield NaN rather than
# raising TypeError.
totalDocs['File_Name'] = totalDocs['name_file'] + '-' + totalDocs['title']

In [62]:
# Labels of the documents that share both the cluster and the main topic of the selected document
totalDocs.loc[(totalDocs['cluster']==cluster)&(totalDocs['mainTopic']==topic), 'File_Name'].values

array(['1409.8580.txt-Interference Functionals in Poisson Networks',
       '1412.4171.txt-Dynamics of Information Diffusion and Social Sensing',
       '1502.04382.txt-Temporal Network Optimization Subject to Connectivity Constraints',
       '1510.04249.txt-Random Irregular Block-hierarchical Networks: Algorithms for Computation   of Main Properties'],
      dtype=object)

# Documentos que hacen parte del mismo clúster

In [63]:
# Labels of the documents in the same cluster as the selected document
totalDocs.loc[totalDocs['cluster']==cluster, 'File_Name'].values

array(['0911.2538.txt-Euclidean versus hyperbolic congestion in idealized versus experimental   networks',
       '1001.1435.txt-JBotSim, a Tool for Fast Prototyping of Distributed Algorithms in   Dynamic Networks',
       '1002.0747.txt-Efficient Bayesian Learning in Social Networks with Gaussian Estimators',
       '1005.2894.txt-Optimal Gradient Clock Synchronization in Dynamic Networks',
       '1111.3048.txt-On a Connection Between Small Set Expansions and Modularity Clustering   in Social Networks',
       '1204.1160.txt-Opinion formation in time-varying social networks: The case of the   naming game',
       '1209.5527.txt-Strategic Learning and the Topology of Social Networks',
       '1210.7341.txt-Subset Codes for Packet Networks',
       '1301.2959.txt-New elements for a network (including brain) general theory during   learning period',
       '1301.3605.txt-Feature Learning in Deep Neural Networks - Studies on Speech Recognition   Tasks',
       '1301.5522.txt-On Gaussian 

# Documentos que pertenecen al mismo tópico

In [64]:
# Labels of the documents with the same main topic as the selected document
totalDocs.loc[totalDocs['mainTopic']==topic, 'File_Name'].values

array(['0808.0084.txt-On the hitting times of quantum versus random walks',
       '0907.3220.txt-Inter Genre Similarity Modelling For Automatic Music Genre   Classification',
       '1109.2162.txt-The Complexity of the Empire Colouring Problem',
       '1201.0490.txt-Scikit-learn: Machine Learning in Python',
       '1204.0480.txt-Deducing Security Goals From Shape Analysis Sentences',
       '1204.2727.txt-The Cost of Perfection for Matchings in Graphs',
       '1206.3862.txt-Total coloring of 1-toroidal graphs of maximum degree at least 11 and no   adjacent triangles',
       '1212.1095.txt-The projector algorithm: a simple parallel algorithm for computing   Voronoi diagrams and Delaunay graphs',
       "1212.6751.txt-Computably Categorical Fields via Fermat's Last Theorem",
       '1301.1027.txt-On online energy harvesting in multiple access communication systems',
       '1306.1595.txt-Layered Separators in Minor-Closed Graph Classes with Applications',
       '1308.4201.txt-Full-