In [38]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pitta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pitta\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pitta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
from itertools import zip_longest
from nltk.stem import PorterStemmer, LancasterStemmer
import os
import re

In [40]:
CISI_FILE = '../cisi/CISI.ALL'
MAX_LONG = 20
MIN_LONG = 3
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

In [41]:
def useStemmer(stemmer, content):
    if stemmer == 'porter':
        return [porter_stemmer.stem(word) for word in content]
    if stemmer == 'lancaster':
        return [lancaster_stemmer.stem(word) for word in content]
    return content

In [42]:
def removeStopWords(words_list, stop_words_list):
    cleaned_text = [word for word in words_list if word not in stop_words_list]
    return cleaned_text

In [43]:
def findAbbreviations(content):
    REGEX_ABBREVIATIONS = r'\b[a-zA-Z][bcdfgh-np-tvxz]+(?![A-Z])\.'
    return re.findall(REGEX_ABBREVIATIONS, content)

In [44]:
def findEmailsAndUrls(content):
    REGEX_EMAILS = r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    REGEX_URLS = r'(https?://)?(www\.[a-z0-9]+(?:\.[a-z0-9]+)+)'
    list_emails = re.findall(REGEX_EMAILS, content)
    list_urls = [''.join(url_tuple) for url_tuple in re.findall(REGEX_URLS, content)]
    return list_emails + list_urls

In [45]:
def findNumbersAndPhones(content):
    REGEX_NUMBERS = r' (\d+)'
    REGEX_PHONES = r'\+?\d{6,}|\+?\d{8,}|\d{2,}-\d{5,}|\+?\d+-\d{2,}-\d{5,}'
    list_numbers = re.findall(REGEX_NUMBERS, content)
    list_phones = re.findall(REGEX_PHONES, content)
    return list_phones + list_numbers

In [46]:
def findWords(content):
    REGEX_WORDS = f'[A-zÀ-ú]{{{MIN_LONG},{MAX_LONG}}}'
    return re.findall(REGEX_WORDS, content)

In [47]:
def tokenizer(content, stop_words_list = None):
    abbreviations_list = findAbbreviations(content)
    emailsAndUrls_list = findEmailsAndUrls(content)
    numbersAndPhones_list = findNumbersAndPhones(content)
    words_list = findWords(content)
    content = abbreviations_list + emailsAndUrls_list + numbersAndPhones_list + words_list
    if stop_words_list != None:
        content = removeStopWords(content, stop_words_list)
    return content

In [48]:
def process_file(file: str, stemmer):
    content = ''
    with open(os.path.abspath(file), 'r', encoding='utf8') as f:
        content += f.read()
    terms = useStemmer(stemmer, tokenizer(content))
    unique_words = set(terms)
    counter_unique_tokens = 0
    for word in unique_words:
        counter_unique_tokens += 1 if terms.count(word) == 1 else 0 
    return counter_unique_tokens, unique_words


In [49]:
counter_unique_tokens_porter, unique_words_porter = process_file(CISI_FILE, 'porter')

In [50]:
counter_unique_tokens_lancaster, unique_words_lancaster = process_file(CISI_FILE, 'lancaster')

In [51]:
print(f'Tokens unicos Porter: {counter_unique_tokens_porter}')
print(f'Tokens unicos Lancaster: {counter_unique_tokens_lancaster}')

4157 3677


In [52]:
for word_porter, word_lancaster in zip_longest(unique_words_porter, unique_words_lancaster, fillvalue=''):
    print(word_porter, word_lancaster)

emerg emerg
midst midst
whose kozhin
plato potass
3035 progeny
thirteen plato
proof 3035
termin thirteen
abund proof
restructur termin
countri abund
object tessy
12000 dud
right esth
lump object
bolef 12000
inclin right
kohut lump
uneasi bolef
estat churchm
bandwidth inclin
honorif priv
them kohut
labrari particul
805 structurality
prosper germany
pen them
backbon penry
shoot jam
733 805
941 monopo
383 pen
674 backbon
heaviest shoot
scientif spit
subsystem 733
complaint 941
duncan 383
induc cas
botani 674
seeley heaviest
jardin subsystem
archivist complaint
yourself induc
sage intertemp
180 seeley
dod ukc
scholar jardin
cnic yourself
oppos zaltm
wherev belv
inspect nondeduc
interstiti dis
concess tru
landri 180
chomski dod
searchabl scholar
progress cnic
state oppos
242 wherev
green inspect
751 concess
841 num
process struggle
ruth dent
classificatori progress
toffler 242
579 comarom
pamela green
suit 751
774 841
weisman process
intrud jessic
unesco nons
dictionari ruth
1234 579
uninte