In [1]:
import numpy as np
# Seed NumPy's global random generator so any random draws are repeatable.
np.random.seed(123)

In [2]:
from IPython import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one (this is why the cells below show several outputs each).
InteractiveShell.ast_node_interactivity = 'all'

# Get Data

In [3]:
# Read the whole book into one lower-cased string; the context manager
# closes the file as soon as the read is done.
with open('Text/AliceInWonderLand.txt', mode='r', encoding='utf-8') as book_file:
    text = book_file.read().lower()
# Sanity checks: size, type and the first 100 characters.
len(text)
type(text)
text[:100]

163817

str

'\ufeffproject gutenberg’s alice’s adventures in wonderland, by lewis carroll\n\nthis ebook is for the use o'

The text can also be read directly in a single expression:

In [4]:
# Single-expression alternative to a `with` block. Path.read_text opens the
# file, reads it and closes it again, so no file handle is left dangling
# (a bare open(...).read() leaves closing to the garbage collector).
from pathlib import Path

text = Path('Text/AliceInWonderLand.txt').read_text(encoding='utf-8').lower()
len(text)
type(text)
text[:100]

163817

str

'\ufeffproject gutenberg’s alice’s adventures in wonderland, by lewis carroll\n\nthis ebook is for the use o'

In [5]:
def fun_percent_change(raw_text, mod_text):
    """Print by how many percent ``mod_text`` is shorter than ``raw_text``.

    The figure is ``(len(raw_text) - len(mod_text)) / len(raw_text) * 100``,
    rounded to two decimals. A negative value means ``mod_text`` is longer.

    Parameters
    ----------
    raw_text : sized
        Baseline object; only its ``len()`` is used.
    mod_text : sized
        Modified object; only its ``len()`` is used.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    import numpy as np

    raw_len = len(raw_text)
    if raw_len == 0:
        # No baseline to compare against: report 0 instead of raising
        # ZeroDivisionError.
        pct_change = 0.0
    else:
        pct_change = ((raw_len - len(mod_text)) / raw_len) * 100
    print('Percent Change is: ', np.round(pct_change, 2))
    return None

# Clean Data Help

### 1) Stopwords + Punctuation

In [29]:
from nltk.corpus import stopwords
from string import punctuation

# NLTK's English stop-word list.
stop = stopwords.words('english')

len(stop)
len(punctuation)

# Stop words followed by each individual punctuation character.
_stopwords = [*stop, *punctuation]
len(_stopwords)
_stopwords[:20]

179

32

211

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [7]:
# Keep only the whitespace-separated tokens that are not exactly a stop word
# or a lone punctuation character; punctuation attached to a word (such as
# 'wonderland,') is left in place.
# The lookup goes through a set so each membership test is a hash lookup
# instead of a scan over the whole stop-word list.
_stopword_lookup = set(_stopwords)
clean_text = ' '.join(_text for _text in text.split() if _text not in _stopword_lookup)
len(clean_text)
fun_percent_change(text, clean_text)
clean_text[:100]

111242

Percent Change is:  32.09


'\ufeffproject gutenberg’s alice’s adventures wonderland, lewis carroll ebook use anyone anywhere cost alm'

### 2) Lemmatization

#### Using NLTK's WordNetLemmatizer

In [8]:
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, used by the lemmatization cell that follows.
lem = WordNetLemmatizer()

In [9]:
# Lemmatize every token as a verb (pos='v') and re-join the results with
# single spaces.
clean_text_lem_nltk = ' '.join(
    map(lambda token: lem.lemmatize(token, pos='v'), clean_text.split())
)
len(clean_text_lem_nltk)
fun_percent_change(clean_text, clean_text_lem_nltk)
clean_text_lem_nltk[:100]

106691

Percent Change is:  4.09


'\ufeffproject gutenberg’s alice’s adventure wonderland, lewis carroll ebook use anyone anywhere cost almo'

#### Using TextBlob's lemmatizer

In [10]:
from textblob import Word

In [11]:
# Same tokens, lemmatized with TextBlob; lemmatize() is called without a
# part-of-speech argument, so the library default applies.
clean_text_lem_blob = ' '.join(
    map(lambda token: Word(token).lemmatize(), clean_text.split())
)
len(clean_text_lem_blob)
fun_percent_change(clean_text, clean_text_lem_blob)
clean_text_lem_blob[:100]

110584

Percent Change is:  0.59


'\ufeffproject gutenberg’s alice’s adventure wonderland, lewis carroll ebook use anyone anywhere cost almo'

### 3) Unwanted Characters Removal

In [12]:
# Substrings stripped from the NLTK-lemmatized text, applied in list order.
to_remove = ['\ufeff','\n','`','~','@','#','%','^','*','--']
# Start from the lemmatized text and peel off one substring per pass.
clean_text_char_rem = clean_text_lem_nltk
for char in to_remove:
    clean_text_char_rem = clean_text_char_rem.replace(char, '')
len(clean_text_char_rem)
clean_text_char_rem[:100]

106126

'project gutenberg’s alice’s adventure wonderland, lewis carroll ebook use anyone anywhere cost almos'

In [13]:
# Shrinkage of the character-stripped text against three baselines, in this
# order: the lemmatized text, the stop-word-filtered text, the raw text.
for _baseline in (clean_text_lem_nltk, clean_text, text):
    fun_percent_change(_baseline, clean_text_char_rem)

Percent Change is:  0.53
Percent Change is:  4.6
Percent Change is:  35.22


### 4) Common Word Removal

In [14]:
# Peek at the first ten whitespace-separated tokens of the stripped text.
clean_text_char_rem.split()[:10]

['project',
 'gutenberg’s',
 'alice’s',
 'adventure',
 'wonderland,',
 'lewis',
 'carroll',
 'ebook',
 'use',
 'anyone']

In [15]:
import pandas as pd

# Token frequency table; value_counts() lists the most frequent tokens first.
_stripped_tokens = pd.Series(clean_text_char_rem.split())
common_word_freq = _stripped_tokens.value_counts()
len(common_word_freq)
common_word_freq.head(20)

4929

say        471
alice      221
go         160
‘i         122
little     120
think      111
get        109
look       100
project     82
one         81
make        78
like        78
alice,      76
would       76
begin       76
come        70
could       66
see         64
know        64
work        62
dtype: int64

In [16]:
# Counts of the most frequent and the third most frequent token. .iloc makes
# the positional lookup explicit; a plain integer key on this string-indexed
# Series depends on a positional fallback that pandas has deprecated.
common_word_freq.iloc[0]
common_word_freq.iloc[2]
# Checking for 1st 20 words to be removed
common_word_freq.index[:20]
# Build the lookup once instead of re-slicing the index for every token.
_top_common_words = set(common_word_freq.index[:20])
clean_text_post_commonWord_removal = ' '.join(_char for _char in clean_text_char_rem.split()
                                              if _char not in _top_common_words)
len(clean_text_post_commonWord_removal)
fun_percent_change(text, clean_text_post_commonWord_removal)
fun_percent_change(text, clean_text_post_commonWord_removal)

471

160

Index(['say', 'alice', 'go', '‘i', 'little', 'think', 'get', 'look', 'project',
       'one', 'make', 'like', 'alice,', 'would', 'begin', 'come', 'could',
       'see', 'know', 'work'],
      dtype='object')

94784

Percent Change is:  42.14


In [17]:
# First 100 characters after the 20 most frequent tokens were dropped.
clean_text_post_commonWord_removal[:100]

'gutenberg’s alice’s adventure wonderland, lewis carroll ebook use anyone anywhere cost almost restri'

### 5) Rare Word Removal

In [18]:
# Recount token frequencies on the text left after common-word removal and
# look at the ten entries at the bottom of the table.
_remaining_tokens = pd.Series(clean_text_post_commonWord_removal.split())
rare_word_freq = _remaining_tokens.value_counts()
len(rare_word_freq)
rare_word_freq.tail(10)

4909

fact.        1
boldly:      1
bat          1
altered.’    1
honour,      1
pop          1
five.        1
hookah,      1
‘with        1
way!         1
dtype: int64

In [19]:
# The 20 tokens at the bottom of the frequency table; these are the ones the
# next cell removes.
rare_word_freq.index[-20:]

Index(['were’,', 'tea-time.', '[‘the', 'us!”’', '‘herald,', 'gross', 'sugar',
       'caper', 'stick', 'inaccurate', 'fact.', 'boldly:', 'bat', 'altered.’',
       'honour,', 'pop', 'five.', 'hookah,', '‘with', 'way!'],
      dtype='object')

In [20]:
# Drop the 20 tokens at the bottom of the frequency table. The lookup set is
# built once, rather than re-slicing the index for every token.
_rarest_words = set(rare_word_freq.index[-20:])
clean_text_post_rareWord_removal = ' '.join(_char for _char in clean_text_post_commonWord_removal.split()
                                            if _char not in _rarest_words)
len(clean_text_post_rareWord_removal)
fun_percent_change(text, clean_text_post_rareWord_removal)

94646

Percent Change is:  42.22


# Putting it all together: reusable functions that can be called from any program

In [21]:
def fun_get_data(path='Text/AliceInWonderLand.txt'):
    """Read a UTF-8 text file and return its contents lower-cased.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the Alice in Wonderland text used
        throughout this notebook, so existing calls without arguments
        behave as before.

    Returns
    -------
    str
        The full file contents, lower-cased. Nothing else is stripped.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Get Data
    print('<< 1. Reading Data >>')
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    print('Actual Length of Text : ', len(text))
    return text

In [22]:
def fun_clean_stopwords_punct(original_text):
    """Drop English stop words and lone punctuation tokens from a text.

    The text is split on whitespace; a token is removed only when it is
    exactly an NLTK English stop word or a single punctuation character.
    The surviving tokens are re-joined with single spaces.

    Parameters
    ----------
    original_text : str
        Text to filter; also the baseline for the printed percent change.

    Returns
    -------
    str
        The filtered text.
    """
    print('*'*50)
    print('<< 2. Applying StopWords and Punctuation removal >>')

    from nltk.corpus import stopwords
    from string import punctuation

    stop = stopwords.words('english')
    _stopwords = set(stop + list(punctuation))
    clean_text = ' '.join(_text for _text in original_text.split() if _text not in _stopwords)

    print('Length post removing stopwords and punctuation : ',len(clean_text))
    # fun_percent_change prints its own report and returns None; wrapping it
    # in print() only added a stray "None" line to the output.
    fun_percent_change(original_text, clean_text)
    return clean_text

In [23]:
def fun_clean_lemmatization(passedText, original_text):
    """Lemmatize every whitespace-separated token as a verb.

    Each token goes through NLTK's ``WordNetLemmatizer`` with ``pos='v'``
    and the results are re-joined with single spaces.

    Parameters
    ----------
    passedText : str
        Text to lemmatize.
    original_text : str
        Baseline used only for the printed percent change.

    Returns
    -------
    str
        The lemmatized text.
    """
    print('*'*50)
    print('<< 3. Applying Lemmatization >>')

    from nltk.stem import WordNetLemmatizer

    lem = WordNetLemmatizer()
    clean_text = ' '.join(lem.lemmatize(_text, pos='v') for _text in passedText.split())

    print('Length of text post applying Lemmatization : ', len(clean_text))
    # fun_percent_change prints its own report and returns None; wrapping it
    # in print() only added a stray "None" line to the output.
    fun_percent_change(original_text, clean_text)
    return clean_text

In [24]:
def fun_clean_removing_unwantedWords(passedText, original_text):
    """Strip a fixed list of unwanted substrings from a text.

    Every occurrence of each entry in ``to_remove`` is deleted (replaced by
    the empty string), one entry after another in list order.

    Parameters
    ----------
    passedText : str
        Text to clean.
    original_text : str
        Baseline used only for the printed percent change.

    Returns
    -------
    str
        ``passedText`` without the unwanted substrings.
    """
    print('*'*50)
    print('<< 4. Applying Removal of Unwanted Characters >>')

    to_remove = ['\ufeff','\n','`','~','@','#','%','^','*','--']

    # Start from the input so the result is defined even for an empty list,
    # then peel off one substring per pass.
    clean_text = passedText
    for char in to_remove:
        clean_text = clean_text.replace(char, '')

    print('Length of text post removal of Unwanted Characters : ', len(clean_text))
    # fun_percent_change prints its own report and returns None; wrapping it
    # in print() only added a stray "None" line to the output.
    fun_percent_change(original_text, clean_text)
    return clean_text

In [25]:
def fun_clean_removing_commonWords(passedText, original_text, n_words=20):
    """Remove the most frequent whitespace-separated tokens from a text.

    Token frequencies come from ``pandas.Series.value_counts()``, which lists
    the most frequent tokens first. The first ``n_words`` entries are removed
    everywhere they occur. If the text has ``n_words`` distinct tokens or
    fewer, every token is removed.

    Parameters
    ----------
    passedText : str
        Text to clean.
    original_text : str
        Baseline used only for the printed percent change.
    n_words : int, optional
        How many of the most frequent tokens to drop (default 20, the value
        that used to be hard-coded). Must not be negative.

    Returns
    -------
    str
        The remaining tokens joined with single spaces.

    Raises
    ------
    ValueError
        If ``n_words`` is negative.
    """
    if n_words < 0:
        raise ValueError('n_words must be non-negative')

    print('*'*50)
    print('<< 5. Applying Removal of Common or Frequently Occuring Words >>')

    import pandas as pd

    tokens = passedText.split()
    # dtype=object keeps an empty token list from triggering pandas' dtype
    # inference for empty Series.
    word_freq = pd.Series(tokens, dtype=object).value_counts()
    # Head of the frequency table = the n_words most frequent tokens.
    common_words = set(word_freq.index[:n_words])

    clean_text = ' '.join(_char for _char in tokens if _char not in common_words)
    print('Length of text post removal of Common Words : ', len(clean_text))
    # fun_percent_change prints its own report and returns None; wrapping it
    # in print() only added a stray "None" line to the output.
    fun_percent_change(original_text, clean_text)
    return clean_text

In [26]:
def fun_clean_removing_rareWords(passedText, original_text, n_words=20):
    """Remove the least frequent whitespace-separated tokens from a text.

    Token frequencies come from ``pandas.Series.value_counts()``, which lists
    the most frequent tokens first. The last ``n_words`` entries of that
    table are removed everywhere they occur. If the text has ``n_words``
    distinct tokens or fewer, every token is removed.

    Parameters
    ----------
    passedText : str
        Text to clean.
    original_text : str
        Baseline used only for the printed percent change.
    n_words : int, optional
        How many entries from the bottom of the frequency table to drop
        (default 20, the value that used to be hard-coded). Must not be
        negative.

    Returns
    -------
    str
        The remaining tokens joined with single spaces.

    Raises
    ------
    ValueError
        If ``n_words`` is negative.
    """
    if n_words < 0:
        raise ValueError('n_words must be non-negative')

    print('*'*50)
    print('<< 6. Applying Removal of Rare Occuring Words >>')

    import pandas as pd

    tokens = passedText.split()
    # dtype=object keeps an empty token list from triggering pandas' dtype
    # inference for empty Series.
    word_freq = pd.Series(tokens, dtype=object).value_counts()
    # tail() rather than [-n_words:]: a slice of [-0:] would select the whole
    # table, while tail(0) correctly selects nothing.
    rare_words = set(word_freq.tail(n_words).index)

    clean_text = ' '.join(_char for _char in tokens if _char not in rare_words)
    print('Length of text post removal of Rare Orccuring Words : ', len(clean_text))
    # fun_percent_change prints its own report and returns None; wrapping it
    # in print() only added a stray "None" line to the output.
    fun_percent_change(original_text, clean_text)
    return clean_text

In [27]:
def fun_clean_text():
    """Run the complete cleaning pipeline and return the final text.

    Stages, in order: read the book, drop stop words and punctuation tokens,
    lemmatize, strip unwanted characters, drop the most common words, drop
    the rarest words. The freshly read text is handed to every stage as the
    baseline for its percent-change report.
    """
    # Stage 0: load the raw (lower-cased) text.
    raw_text = fun_get_data()

    # Stage 1: stop words and punctuation tokens.
    without_stopwords = fun_clean_stopwords_punct(raw_text)

    # Stage 2: lemmatization.
    lemmatized = fun_clean_lemmatization(without_stopwords, raw_text)

    # Stage 3: unwanted characters.
    without_unwanted = fun_clean_removing_unwantedWords(lemmatized, raw_text)

    # Stage 4: most common words.
    without_common = fun_clean_removing_commonWords(without_unwanted, raw_text)

    # Stage 5: rarest words; this is the pipeline's result.
    return fun_clean_removing_rareWords(without_common, raw_text)

In [28]:
# Run the whole pipeline end to end. Note that this rebinds `clean_text`,
# which an earlier cell used for the stop-word-filtered text only.
clean_text = fun_clean_text()

<< 1. Reading Data >>
Actual Length of Text :  163817
**************************************************
<< 2. Applying StopWords and Punctuation removal >>
Length post removing stopwords and punctuation :  111242
Percent Change is:  32.09
None
**************************************************
<< 3. Applying Lemmatization >>
Length of text post applying Lemmatization :  106691
Percent Change is:  34.87
None
**************************************************
<< 4. Applying Removal of Unwanted Characters >>
Length of text post removal of Unwanted Characters :  106126
Percent Change is:  35.22
None
**************************************************
<< 5. Applying Removal of Common or Frequently Occuring Words >>
Length of text post removal of Common Words :  94784
Percent Change is:  42.14
None
**************************************************
<< 6. Applying Removal of Rare Occuring Words >>
Length of text post removal of Rare Orccuring Words :  94646
Percent Change is:  42.22
None
