# Initialization

First, let's initialize the notebook

In [1]:
import math
import os
import re
from collections import Counter

import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.tokenize import RegexpTokenizer

# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')

# Spanish stopwords, used to filter the tokens extracted from the corpus.
stopwords = nltk.corpus.stopwords.words('spanish')
# Raw string: in a normal string literal "\w" is an invalid escape sequence,
# which current Python versions warn about. The pattern itself is unchanged:
# runs of word characters and the typographic apostrophe.
tokenizer = RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)

# Directory holding the text files to process.
# NOTE(review): absolute, machine-specific path -- adjust it (or make it
# configurable) before running the notebook on another machine.
folder = '/home/rr/Desktop/ML/BORME/borme/dias/test/'

[nltk_data] Downloading package stopwords to /home/rr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Define helper functions

In [2]:
def freq(word, tokens):
    """Return how many times `word` occurs in `tokens`.

    The count is delegated to ``tokens.count``, so `tokens` may be any object
    providing that method (the notebook passes lists of token strings).
    """
    occurrences = tokens.count(word)
    return occurrences
def word_count(doc):
    """Return the number of items in `doc`, i.e. ``len(doc)``."""
    size = len(doc)
    return size

def tf(word, doc):
    """Return the normalized term frequency of `word` in `doc`.

    Args:
        word: term to look up.
        doc: sequence of tokens making up one document.

    Returns:
        float: occurrences of `word` divided by the number of tokens in
        `doc`, or ``0.0`` when `doc` is empty.
    """
    total = word_count(doc)
    if total == 0:
        # An empty document contains no terms; returning 0.0 avoids the
        # ZeroDivisionError the plain division would raise.
        return 0.0
    return freq(word, doc) / float(total)

def num_docs_containing(word, list_of_docs):
    """Return one plus the number of documents in which `word` occurs.

    A document counts as containing `word` when ``freq(word, document)`` is
    greater than zero. The extra 1 means the result is never zero, so it can
    be used directly as a divisor.
    """
    matches = sum(1 for document in list_of_docs if freq(word, document) > 0)
    return matches + 1

def idf(word, list_of_docs):
    """Return the inverse document frequency of `word` in `list_of_docs`.

    Computed as ``log(N / d)``, where ``N`` is the number of documents and
    ``d`` is the value returned by ``num_docs_containing`` (document count
    plus one). Because of that extra one, a word present in every document
    gets a slightly negative value.

    Raises:
        ValueError: if `list_of_docs` is empty (logarithm of zero).
    """
    total_docs = len(list_of_docs)
    smoothed_doc_count = float(num_docs_containing(word, list_of_docs))
    return math.log(total_docs / smoothed_doc_count)


def tf_idf(word, doc, list_of_docs):
    """Return the tf-idf weight of `word` for `doc` within `list_of_docs`.

    This is the product of ``tf(word, doc)`` and ``idf(word, list_of_docs)``.
    """
    term_frequency = tf(word, doc)
    inverse_document_frequency = idf(word, list_of_docs)
    return term_frequency * inverse_document_frequency

# Create a vocabulary of tokens from the list of documents and model the Vector Space

The code below tokenizes each document in the corpus and computes the term frequencies.
Once each of the documents in the corpus has been tokenized, the next step is to compute the document frequency quantity, that is, for each term, how many documents that term appears in.
This can be represented as a very sparse matrix in which each row is a sentence and each column is a term, giving rise to a vector space of zeros and ones.
Before moving on to IDF, it is important to normalize the term frequencies to avoid bias towards long documents or excessively repeated terms.
The VSM is an algebraic model representing the importance of a term (tf-idf), or simply its presence or absence (Bag of Words), in a document.

In [3]:
# Corpus-level containers. Every line of every input file is treated as one
# document.
vocabulary = []  # one token list per processed line, in reading order
docs = {}        # line text -> per-document statistics
all_tips = []    # never filled in this cell; kept in case other cells read it

# Set membership is O(1); the stopword list would be scanned for every token.
stopword_set = set(stopwords)

dirs = os.listdir(folder)
dic = {}         # file name -> empty dict; nothing in this cell fills it
print('Processing', len(dirs), 'files')
for fname in dirs:
    wordcounts = {}
    dic[fname] = wordcounts
    if fname == '.DS_Store':
        continue
    # The context manager closes each file, even if processing a line fails.
    with open(os.path.join(folder, fname), 'r') as text:
        for line in text:
            line = line.rstrip('\n')
            raw_tokens = tokenizer.tokenize(line)

            # Unigrams: lower-cased, longer than two characters, no stopwords.
            tokens = [token.lower() for token in raw_tokens if len(token) > 2]
            tokens = [token for token in tokens if token not in stopword_set]

            # Bigrams and trigrams are built from the unfiltered token
            # sequence, so they may contain short words and stopwords.
            # NOTE(review): the stopword test below compares the whole
            # space-joined n-gram against the stopword entries, so presumably
            # it never removes anything (e.g. "de los" is kept) -- confirm
            # what filter was intended.
            bi_tokens = [' '.join(gram).lower() for gram in bigrams(raw_tokens)]
            bi_tokens = [token for token in bi_tokens if token not in stopword_set]

            tri_tokens = [' '.join(gram).lower() for gram in trigrams(raw_tokens)]
            tri_tokens = [token for token in tri_tokens if token not in stopword_set]

            final_tokens = tokens + bi_tokens + tri_tokens

            # Count every distinct token in a single pass instead of
            # rescanning the token list once per occurrence.
            counts = Counter(final_tokens)
            total = float(len(final_tokens))

            # Keyed by the line text: a repeated line overwrites its earlier
            # entry here, but is still appended to `vocabulary` below.
            docs[line] = {
                # The frequency computed for each token of the line
                'freq': dict(counts),
                # The term frequency (frequency normalized by line length)
                'tf': {token: count / total for token, count in counts.items()},
                'idf': {},
                'tf-idf': {},
                'tokens': final_tokens,
            }

            vocabulary.append(final_tokens)
# `vocabulary` holds one token list per processed line, so its length is a
# number of documents, not a number of tokens.
print('Vocabulary created,', len(vocabulary), 'documents tokenized and normalized')


Processing 5 files
Vocabulary created, 1405 tokens included and normalized


# Compute the TF-IDF

The TF-IDF is the product of the TF and the IDF, so a high tf-idf weight is reached when a term has a high term frequency (tf) in the given document and a low document frequency across the whole collection.


In [4]:
# The idf of a token depends only on the token and the corpus, so it is
# computed once per distinct token and reused for every document containing it.
idf_cache = {}
for doc in docs:
    entry = docs[doc]
    for token, term_freq in entry['tf'].items():
        if token not in idf_cache:
            idf_cache[token] = idf(token, vocabulary)
        #The Inverse-Document-Frequency
        entry['idf'][token] = idf_cache[token]
        #The tf-idf: the stored term frequency times the idf, i.e. the same
        #product tf_idf() returns, without recomputing either factor.
        entry['tf-idf'][token] = term_freq * idf_cache[token]
print('TF-IDF computed')

TF-IDF computed


# Output the most relevant words


In [5]:


# Rank the terms by relevance: keep, for every token, the highest tf-idf
# weight it reaches in any document.
words = {}
for entry in docs.values():
    for token, score in entry['tf-idf'].items():
        best_so_far = words.get(token)
        if best_so_far is None or score > best_so_far:
            words[token] = score
print('Displaying results')
# Highest weight first; the sort is stable, so ties keep first-seen order.
ranking = sorted(words.items(), key=lambda pair: pair[1], reverse=True)
for token, score in ranking:
    print("%f <= %s" % (score, token))


Displaying results
6.554645 <= castellón
5.861498 <= badajoz
5.638355 <= albacete
5.638355 <= alicante
5.456033 <= empresarios
3.277323 <= pág 7
3.277323 <= pág 5
3.277323 <= pág 25
3.277323 <= pág 4
3.277323 <= pág 8
3.277323 <= pág 23
3.277323 <= pág 1
3.277323 <= pág 22
3.277323 <= pág 18
3.277323 <= pág 15
3.277323 <= pág 9
3.277323 <= pág 10
3.277323 <= pág 12
3.277323 <= pág 19
3.277323 <= pág 20
3.277323 <= pág 2
3.277323 <= pág 3
3.277323 <= pág 21
3.277323 <= pág 11
3.277323 <= pág 16
3.277323 <= pág 13
3.277323 <= pág 17
3.277323 <= pág 6
3.277323 <= pág 14
3.277323 <= pág 24
1.994848 <= núm 1
1.994848 <= pág
1.994848 <= núm
1.818678 <= actos inscritos
1.818678 <= sección primera
1.818678 <= inscritos
1.818678 <= actos
1.767294 <= sección
1.767294 <= primera
1.767294 <= ciudad real
1.722784 <= ciudad
1.648402 <= real
1.638661 <= 82 pascampo sl
1.638661 <= 51 castrus
1.638661 <= 29 arfriotex
1.638661 <= 72 isomedia sl
1.638661 <= 29 arfriotex sl
1.638661 <= pirosellers sl
1.63

0.595877 <= threeline
0.595877 <= rodafuerte sociedad
0.595877 <= 304 lbt
0.595877 <= 335 vg
0.595877 <= maxmetal cr sociedad
0.595877 <= 358 jose ramos
0.595877 <= butech building
0.595877 <= technology sociedad
0.595877 <= classic
0.595877 <= consulting sl
0.595877 <= 393 casa nova
0.595877 <= 432 promocions ceramiques
0.595877 <= micronizados rodafuerte sociedad
0.595877 <= rodalba sociedad limitada
0.595877 <= ecoplan
0.595877 <= hoteleras segorbe
0.595877 <= 432 promocions
0.595877 <= 414
0.595877 <= 71 the place
0.595877 <= 331
0.595877 <= 14 grupo zasan
0.595877 <= 369 ocio factory
0.595877 <= 340 inversiones
0.595877 <= butech
0.595877 <= butech building technology
0.595877 <= 340
0.595877 <= vg circulante sociedad
0.595877 <= segorbe sl
0.595877 <= 71 the
0.595877 <= 1 micronizados rodafuerte
0.595877 <= garijo sl
0.595877 <= promocions ceramiques
0.595877 <= factory
0.595877 <= 393
0.595877 <= chuculandia 2017
0.595877 <= lormor
0.595877 <= profesional consulting
0.595877 <= 

0.546220 <= urbanos
0.546220 <= 29184 i a
0.546220 <= hosteleros sociedad limitada
0.546220 <= cortes sociedad
0.546220 <= 13 lavanderia
0.546220 <= 32706
0.546220 <= piensos s
0.546220 <= aprosuba 13 lavanderia
0.546220 <= 345 promokuatro
0.546220 <= belleza candela sl
0.546220 <= 362
0.546220 <= orpacer ceramicas
0.546220 <= sacris
0.546220 <= trading sociedad limitada
0.546220 <= 487 almacenes hermacon
0.546220 <= porcelanosa
0.546220 <= 367 spanish spot
0.546220 <= 2009 sociedad limitada
0.546220 <= 157884 i
0.546220 <= 2009 sociedad
0.546220 <= pueyo sociedad
0.546220 <= ifjm hosteleros
0.546220 <= sevicrip sociedad limitada
0.546220 <= elche sa
0.546220 <= crealia
0.546220 <= 125 banca
0.546220 <= sevicrip
0.546220 <= sertec sociedad
0.546220 <= 366 orpacer
0.546220 <= 157889 i
0.546220 <= planasat
0.546220 <= ceramicas sociedad limitada
0.546220 <= 403 galadtrans
0.546220 <= bulevar sociedad limitada
0.546220 <= a i e
0.546220 <= sercas consulting sociedad
0.546220 <= xiquets de

0.259825 <= consuegra
0.259825 <= martin consuegra
0.259003 <= 2013
0.257045 <= un negocio dedicado
0.257045 <= dedicado a la
0.257045 <= negocio dedicado a
0.257045 <= dedicado
0.257045 <= negocio dedicado
0.257045 <= dedicado a
0.256289 <= 1 19
0.256289 <= a 1 19
0.256289 <= 1 19 12
0.254848 <= vall d
0.254848 <= 102 s 8
0.254848 <= 102 s
0.254848 <= 187 s 8
0.254848 <= uixo
0.254848 <= 141
0.254848 <= 365
0.254848 <= vall d uixo
0.254848 <= d uixo
0.254848 <= 102
0.252528 <= maquinaria
0.252102 <= consejero huerta tomas
0.252102 <= nombramientos consejero huerta
0.252102 <= escrihuela jose
0.252102 <= sl nombramientos consejero
0.252102 <= escrihuela escrihuela jose
0.252102 <= hujoceramic
0.252102 <= salvador escrihuela
0.252102 <= escrihuela escrihuela
0.252102 <= tomas salvador escrihuela
0.252102 <= huerta tomas
0.252102 <= huerta tomas salvador
0.252102 <= hujoceramic sl
0.252102 <= hujoceramic sl nombramientos
0.252102 <= consejero huerta
0.252102 <= salvador escrihuela escrih

0.212041 <= 97 s 8
0.212041 <= f 44 s
0.212041 <= f 97 s
0.212041 <= f 97
0.212041 <= 44 s
0.212041 <= 97 s
0.212041 <= f 44
0.211440 <= unipersonalidad ceses
0.211440 <= adelin
0.211440 <= unico condrache condrache
0.211440 <= adm unico condrache
0.211440 <= condrache condrache adelin
0.211440 <= unico condrache
0.211440 <= condrache adelin
0.211440 <= condrache condrache
0.211440 <= unipersonalidad ceses dimisiones
0.211440 <= de unipersonalidad ceses
0.210218 <= al por
0.210114 <= liquidador
0.209847 <= cualquier
0.209348 <= modificado
0.209339 <= f 154
0.209339 <= 154 s 8
0.209339 <= 154 s
0.209339 <= 676 f
0.209339 <= 150
0.209339 <= 676
0.209339 <= 154
0.209339 <= f 154 s
0.208828 <= 463
0.208084 <= cultivo
0.207221 <= de los
0.205667 <= la venta de
0.205329 <= juan
0.204973 <= unico casanova
0.204973 <= i a 25
0.204973 <= francisco nombramientos
0.204973 <= adm unico casanova
0.204973 <= monfort
0.204973 <= liquidador samper
0.204973 <= alejandro datos
0.204973 <= a 25
0.204973 

0.172491 <= angel canos broch
0.172491 <= cs 40507
0.172491 <= milena datos
0.172491 <= david ceses dimisiones
0.172491 <= molina beatriz nombramientos
0.172491 <= solid altabas baca
0.172491 <= f 74 s
0.172491 <= unico carmen
0.172491 <= 40507 i
0.172491 <= servicios asistenciales médicos
0.172491 <= milena datos registrales
0.172491 <= aurora ceses dimisiones
0.172491 <= electrodomésticos televisores
0.172491 <= 1167
0.172491 <= unico tapasco garcia
0.172491 <= objeto social campings
0.172491 <= mantenimiento y reparación
0.172491 <= l 1167
0.172491 <= las naciones 25
0.172491 <= supl aguilella
0.172491 <= javier consejero simon
0.172491 <= sanitarios de enfermería
0.172491 <= cueva santa
0.172491 <= las naciones
0.172491 <= por supresión
0.172491 <= nombramientos liquidador m
0.172491 <= duran romero
0.172491 <= sol martinez
0.172491 <= t 1300
0.172491 <= campings y
0.172491 <= a 169
0.172491 <= 863
0.172491 <= modificado por supresión
0.172491 <= socio único roubi
0.172491 <= solid

0.170811 <= dols alonso daniel
0.170811 <= esther dols alonso
0.170811 <= alcaraz rochera susana
0.170811 <= alonso daniel
0.170811 <= ros
0.170811 <= cueva
0.170811 <= colomer puig
0.170811 <= jose rojo rojo
0.170811 <= colomer puig emilio
0.170811 <= muñoz fernandez
0.170811 <= unico pla
0.170811 <= colomer
0.170811 <= gonzalez jose rojo
0.170811 <= alventosa
0.170811 <= teruel galindo oscar
0.170811 <= adm unico navarro
0.170811 <= unico garcia
0.170811 <= mercantiles
0.170811 <= daniel nombramientos
0.170811 <= albalat
0.170811 <= de viviendas
0.170811 <= grangel marcial
0.170811 <= josep
0.170811 <= rojo rojo
0.170811 <= galindo oscar
0.170811 <= martin gimeno maria
0.170811 <= garcia ruiz
0.170811 <= alventosa federico
0.170811 <= rincon
0.170811 <= ibañez hernan
0.170811 <= adm unico garcia
0.170811 <= albalat jose colomer
0.170811 <= federico
0.170811 <= jose rojo
0.170811 <= pitarch gil
0.170811 <= hernan alcaraz
0.170811 <= gimeno maria cueva
0.170811 <= adm solid mora
0.1708

0.156063 <= y reforma
0.156063 <= san jose maria
0.156063 <= despacho 2 2º
0.156063 <= restaurante y cafetería
0.156063 <= de pintura
0.156063 <= operaciones 31 10
0.156063 <= aire acondicionado
0.156063 <= reparación maquinaria
0.156063 <= beltran alfonso
0.156063 <= reparaciones
0.156063 <= t 1546 l
0.156063 <= mitjana 41 nules
0.156063 <= revestimiento
0.156063 <= apo sol castello
0.156063 <= revestimiento de pintura
0.156063 <= social plaza cardona
0.156063 <= nabli kaisser nombramientos
0.156063 <= 41 nules
0.156063 <= burriana capital 3
0.156063 <= pintura proyectada
0.156063 <= unico iborra
0.156063 <= 29 4º castellon
0.156063 <= c sabadell 17
0.156063 <= luis suarez diaz
0.156063 <= 718 f 95
0.156063 <= 1 construcción
0.156063 <= ningel maria antonie
0.156063 <= cualquier artículo eléctrico
0.156063 <= guerola antonio javier
0.156063 <= t 210 f
0.156063 <= f 108
0.156063 <= de aire
0.156063 <= proyectada
0.156063 <= garcia peris
0.156063 <= construcción otra
0.156063 <= marti 4

0.139461 <= 560 000
0.139461 <= 155364
0.139461 <= cs 40515
0.139461 <= concurso fortuito
0.139461 <= equipos electrónicos
0.139461 <= 1º la explotación
0.139461 <= a 143815 i
0.139461 <= nutricionales y
0.139461 <= suscrito 566 000
0.139461 <= nutricionales
0.139461 <= de equipos electrónicos
0.139461 <= a empresa
0.139461 <= 195
0.139461 <= de calificación juzgado
0.139461 <= b villarreal
0.139461 <= 4040 f 37
0.139461 <= 219 00
0.139461 <= 545 500 00
0.139461 <= cerdan
0.139461 <= f 195 s
0.139461 <= a 143815
0.139461 <= 219 00 euros
0.139461 <= particulares
0.139461 <= pub y restaurante
0.139461 <= t 950
0.139461 <= 566 000 00
0.139461 <= c obispo san
0.139461 <= 585 f
0.139461 <= adm unico argudo
0.139461 <= 566 000
0.139461 <= t 3839
0.139461 <= registrales t 1941
0.139461 <= suscrito 18 000
0.139461 <= y restaurante y
0.139461 <= domicilio c obispo
0.139461 <= 143815 i a
0.139461 <= i a 29
0.139461 <= particulares y
0.139461 <= 3839 f
0.139461 <= 1941 f 163
0.139461 <= b villarr

0.126051 <= erratas en el
0.126051 <= 485 938
0.126051 <= l 1137
0.126051 <= sociedad el
0.126051 <= 461 45
0.126051 <= f 42 s
0.126051 <= y compañia sl
0.126051 <= 33575
0.126051 <= del año 2004
0.126051 <= 107 000
0.126051 <= material eléctrico
0.126051 <= auditor moore
0.126051 <= t 1575
0.126051 <= salvador secretario
0.126051 <= 40518 i a
0.126051 <= 04 2013 auto
0.126051 <= venta y alquiler
0.126051 <= social solados y
0.126051 <= desde su
0.126051 <= 006 00
0.126051 <= registrales t 1206
0.126051 <= ibergrup sap datos
0.126051 <= de elementos y
0.126051 <= urb les
0.126051 <= único cosura
0.126051 <= l 951 f
0.126051 <= 1299 l 862
0.126051 <= cs 33575 i
0.126051 <= inscripción 8ª
0.126051 <= t 2680
0.126051 <= taller de mecánica
0.126051 <= suscrito 355
0.126051 <= a 64
0.126051 <= procedimiento concursal 149
0.126051 <= l 1137 f
0.126051 <= 1137 f 221
0.126051 <= maquinaria nueva y
0.126051 <= cosura 2007
0.126051 <= cual desde su
0.126051 <= y herramientas para
0.126051 <= t 1

0.111096 <= 060 000 00
0.111096 <= capital 650
0.111096 <= y harinas cnaes
0.111096 <= 9319
0.111096 <= 421 421
0.111096 <= envasado y ventas
0.111096 <= 735 383
0.111096 <= 0161 9319
0.111096 <= 060 000
0.111096 <= ventas de
0.111096 <= 505 000
0.111096 <= 735
0.111096 <= harinas
0.111096 <= suscrito 735 383
0.111096 <= 1102 y
0.111096 <= 60 euros datos
0.111096 <= capital 421 421
0.111096 <= 001
0.111096 <= y ventas
0.111096 <= 20 euros resultante
0.111096 <= 0161
0.111096 <= 39866 i
0.111096 <= 0150 0161 9319
0.111096 <= c obra pia
0.111096 <= t 1728
0.111096 <= aceites y
0.111096 <= 4078
0.111096 <= cnaes 0150
0.111096 <= capital 505 000
0.111096 <= 19 s
0.111096 <= 9319 4631
0.111096 <= 60 euros
0.111096 <= 001 00
0.111096 <= pia 4 bajo
0.111096 <= t 4078 f
0.111096 <= capital capital 505
0.111096 <= 2266 f
0.111096 <= de vino
0.111096 <= 9319 4631 1102
0.111096 <= 1728
0.111096 <= vino aceites
0.111096 <= l 1289 f
0.111096 <= 1728 l
0.111096 <= 421 421 20
0.111096 <= 505 000 00
0