In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pitta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pitta\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pitta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from itertools import zip_longest
from nltk.stem import PorterStemmer, LancasterStemmer
import os
import re

In [3]:
CISI_FILE = '../cisi/CISI.ALL'
MAX_LONG = 20
MIN_LONG = 3
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

In [4]:
def useStemmer(stemmer, content):
    if stemmer == 'porter':
        return [porter_stemmer.stem(word) for word in content]
    if stemmer == 'lancaster':
        return [lancaster_stemmer.stem(word) for word in content]
    return content

In [5]:
def removeStopWords(words_list, stop_words_list):
    cleaned_text = [word for word in words_list if word not in stop_words_list]
    return cleaned_text

In [6]:
def findAbbreviations(content):
    REGEX_ABBREVIATIONS = r'\b[a-zA-Z][bcdfgh-np-tvxz]+(?![A-Z])\.'
    return re.findall(REGEX_ABBREVIATIONS, content)

In [7]:
def findEmailsAndUrls(content):
    REGEX_EMAILS = r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    REGEX_URLS = r'(https?://)?(www\.[a-z0-9]+(?:\.[a-z0-9]+)+)'
    list_emails = re.findall(REGEX_EMAILS, content)
    list_urls = [''.join(url_tuple) for url_tuple in re.findall(REGEX_URLS, content)]
    return list_emails + list_urls

In [8]:
def findNumbersAndPhones(content):
    REGEX_NUMBERS = r' (\d+)'
    REGEX_PHONES = r'\+?\d{6,}|\+?\d{8,}|\d{2,}-\d{5,}|\+?\d+-\d{2,}-\d{5,}'
    list_numbers = re.findall(REGEX_NUMBERS, content)
    list_phones = re.findall(REGEX_PHONES, content)
    return list_phones + list_numbers

In [9]:
def findWords(content):
    REGEX_WORDS = f'[A-zÀ-ú]{{{MIN_LONG},{MAX_LONG}}}'
    return re.findall(REGEX_WORDS, content)

In [10]:
def tokenizer(content, stop_words_list = None):
    abbreviations_list = findAbbreviations(content)
    emailsAndUrls_list = findEmailsAndUrls(content)
    numbersAndPhones_list = findNumbersAndPhones(content)
    words_list = findWords(content)
    content = abbreviations_list + emailsAndUrls_list + numbersAndPhones_list + words_list
    if stop_words_list != None:
        content = removeStopWords(content, stop_words_list)
    return content

In [11]:
def process_file(file: str, stemmer):
    content = ''
    with open(os.path.abspath(file), 'r', encoding='utf8') as f:
        content += f.read()
    terms = useStemmer(stemmer, tokenizer(content))
    unique_words = set(terms)
    counter_unique_tokens = 0
    for word in unique_words:
        counter_unique_tokens += 1 if terms.count(word) == 1 else 0 
    return counter_unique_tokens, unique_words


In [12]:
counter_unique_tokens_porter, unique_words_porter = process_file(CISI_FILE, 'porter')

In [13]:
counter_unique_tokens_lancaster, unique_words_lancaster = process_file(CISI_FILE, 'lancaster')

In [14]:
print(f'Tokens unicos Porter: {counter_unique_tokens_porter}')
print(f'Tokens unicos Lancaster: {counter_unique_tokens_lancaster}')

Tokens unicos Porter: 4157
Tokens unicos Lancaster: 3677


Podemos deducir que el algoritmo de Lancaster es mejor tanto en tiempo de ejecución (8.6s vs 7.7s) como en tokens únicos detectados (4157 vs 3677).

In [15]:
for word_porter, word_lancaster in zip_longest(unique_words_porter, unique_words_lancaster, fillvalue=''):
    print(word_porter, word_lancaster)

1938 discovery
cartridg 1938
codebook cartridg
155 codebook
depth sem
accept 155
viniti affy
clark clark
981 leis
grassland 981
interpoint grassland
enquiri chernay
recoveri interpoint
accademia 769
769 191
191 bear
servic byrd
frustrat fail
bear 1054
byrd quick
tribe 752
rosen radio
denizen mamontov
fail 866
1054 accredit
quick 1089
752 unhurry
curriculum bios
radio printout
mamontov proofslip
866 briskm
accredit quas
1089 neurolog
printout rout
proofslip 806
valuabl decen
academia fores
neurolog 640
rout everyday
dilat 1053
806 scheme
dori repot
lubetzki stearn
640 delib
everyday ree
1053 protract
scheme cosmopolit
repot outlin
stearn divorc
ree perifus
picatinni viabl
protract quiet
outlin recognit
varieti reput
divorc 246
perifus 1069
viabl jolley
recognit hdb
reput closest
246 122
1069 screen
jolley 1022
hdb biolog
screen subsequ
122 gain
closest idc
mackenzi 314
de psycholingu
shortag shipbuild
1022 misinterpret
biolog cobol
readili 1083
subsequ franz
gain encod
spectral througho