## Text_Parsing test


In [33]:
test_text = "This is a sample text. It contains some words, and stop words like 'the' and 'is'."
original_text = test_text

In [34]:
def print_comparison():
    print("Original text: ", original_text)
    print("Test text:     ", test_text)

### First lowercasing text

In [35]:
test_text = test_text.lower()
print_comparison()

Original text:  This is a sample text. It contains some words, and stop words like 'the' and 'is'.
Test text:      this is a sample text. it contains some words, and stop words like 'the' and 'is'.


### Removing punctuation and numbers

In [36]:
import re # regular expressions

In [37]:
# should I remove numbers too or just letters? [^a-zA-Z0-9] --- Will probably depend on each use case
test_text = re.sub(r"[^a-zA-Z]", " ", test_text)
print_comparison()

Original text:  This is a sample text. It contains some words, and stop words like 'the' and 'is'.
Test text:      this is a sample text  it contains some words  and stop words like  the  and  is  


### Removing stopwords

In [38]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kllmm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
sw = stopwords.words("english")
sw[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [40]:
test_text = test_text.split()
test_text[:10]

['this', 'is', 'a', 'sample', 'text', 'it', 'contains', 'some', 'words', 'and']

In [41]:
for word in test_text:
    if word in sw:
        test_text.remove(word)
test_text[:10]

['sample', 'text', 'contains', 'words', 'stop', 'words', 'like', 'and', 'is']

### Lemmatization or Stemming

In [42]:
# using stemming for speed
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [43]:
for i in range(len(test_text)):
    test_text[i] = ps.stem(test_text[i])
test_text[:10]

['sampl', 'text', 'contain', 'word', 'stop', 'word', 'like', 'and', 'is']

In [44]:
test_text = " ".join(test_text)
test_text

'sampl text contain word stop word like and is'

In [45]:
print_comparison()

Original text:  This is a sample text. It contains some words, and stop words like 'the' and 'is'.
Test text:      sampl text contain word stop word like and is


In [46]:
dictionary = {}

for word in test_text.split():
    if word in dictionary:
        dictionary[word] += 1
    else:
        dictionary[word] = 1
dictionary

{'sampl': 1,
 'text': 1,
 'contain': 1,
 'word': 2,
 'stop': 1,
 'like': 1,
 'and': 1,
 'is': 1}

In [47]:
def text_parse(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]',' ', text)
    text = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [ps.stem(word) for word in text if word not in stop_words]
    return ' '.join(tokens)
text = text_parse(original_text)
print(text)


sampl text contain word stop word like


### Counter test


In [48]:
from collections import Counter

tokens = text.split()
most_ocurrence = {word: tokens.count(word) for word in tokens}
print(most_ocurrence)
print(Counter(tokens))

sorted_count = sorted(most_ocurrence.items(), \
    key=lambda val: val[1], reverse=True)
print(sorted_count)

for word, count in sorted_count[:5]:
    print(f"{word}: {count}")



{'sampl': 1, 'text': 1, 'contain': 1, 'word': 2, 'stop': 1, 'like': 1}
Counter({'word': 2, 'sampl': 1, 'text': 1, 'contain': 1, 'stop': 1, 'like': 1})
[('word', 2), ('sampl', 1), ('text', 1), ('contain', 1), ('stop', 1), ('like', 1)]
word: 2
sampl: 1
text: 1
contain: 1
stop: 1


In [49]:
def word_freq_analysis(text, top_n=5):
    tokens= text.split()
    words_count= Counter(tokens)

    sorted_count = sorted(words_count.items(), \
         key=lambda val:val[1], reverse=True)

    for word, count in sorted_count[:top_n]:
        print(f'{word}: {count}')

In [50]:
text_file = './gutenberg.org_cache_epub_71894_pg71894.txt'

In [52]:
with open(text_file, 'r', encoding='utf-8') as file:
    text = file.read()

gutt_book = text_parse(text)
word_freq_analysis(gutt_book,10)


hellen: 264
cyru: 205
king: 176
great: 175
would: 171
could: 155
one: 154
day: 133
soldier: 129
time: 124
