# Text Processing

##### Importing and installing the required libraries

In [None]:
!pip install nltk
nltk.download('punkt') # tokenizer package 
nltk.download('stopwords') # package containing the stopwords
nltk.download('wordnet') # wordnet package
nltk.download('omw-1.4') # open multilingual wordnet package

In [None]:
import re # regex for operations on the data
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

##### Loading the file that contains the text to be processed

###### (change the file name as per your need)

In [None]:
# Read the whole text file into memory as a single string.
# 'r' opens the file read-only, so its contents cannot be modified here.
# The `with` block closes the file as soon as reading finishes, even if
# reading raises; `f` stays bound to the (closed) file object afterwards.
# An explicit encoding keeps the result independent of the platform default.
with open('golden_ratio.txt', 'r', encoding='utf-8') as f:
    corpus = f.read()  # this will read the contents of the file
print(corpus)


##### Cleaning the text

In [None]:
# Convert the text to lower case so that e.g. "The" and "the" become the same token.
lower_case = corpus.lower()
print("-------------------------\n THE CORPUS IN LOWER CASE \n-------------------------\n", lower_case)


# Option 1 (regex): replace every character that is not a letter or a digit
# with a space, so words separated only by punctuation do not get glued together.
removed_punctuation = re.sub(r"[^a-zA-Z0-9]"," ", lower_case)
print("---------------------------------------------------------------\nTHE CORPUS AFTER REMOVING EVERYTHING EXCEPT LETTERS AND NUMBERS\n---------------------------------------------------------------\n",removed_punctuation)


# Option 2 (no regex): delete the ASCII punctuation characters listed in
# string.punctuation. A single translate() pass removes all of them at once;
# calling lower_case.replace() in a loop would restart from the original text
# on every iteration and only strip the last character of the list.
punctuations = string.punctuation
removed_punctuation1 = lower_case.translate(str.maketrans("", "", punctuations))
print("-------------------------------------------\n THE CORPUS AFTER REMOVING THE PUNCTUATIONS\n-------------------------------------------\n",removed_punctuation1)

##### Tokenization

In [None]:
# Approach 1 - plain Python: split() without arguments cuts the string into a
# list of words on any run of whitespace (spaces, tabs and new lines).
words_list = removed_punctuation.split()
print(
    "--------------------------\nTHE WORDS LIST USING SPLIT\n--------------------------\n",
    words_list,
)

# Approach 2 - NLTK's word tokenizer (gives better output).
words_list1 = word_tokenize(
    removed_punctuation,
    preserve_line=False,
    language='english',
)
print(
    "---------------------------\nWORDS USING NLTK TOKENIZER\n---------------------------\n",
    words_list1,
)

##### Removing the stop words from the list

In [None]:
# Stop words are frequently occurring words in a corpus that contribute little
# to the overall meaning of the text.
stop_words=stopwords.words("english")
print("----------\nSTOP WORDS\n----------\n", stop_words)

# Build a set once for the membership test: each lookup is then constant time
# instead of a scan over the whole stop-word list for every token.
stop_word_set = set(stop_words)
removed_stopWords = [token for token in words_list1 if token not in stop_word_set]
print("------------------------------------\nCORPUS AFTER REMOVING THE STOP WORDS\n------------------------------------\n", removed_stopWords)

##### Stemming & Lemmatization of words

In [None]:
# Stemming reduces a word to its stem by stripping affixes (suffixes and prefixes).
# It crudely cuts off the inflectional part of the word, so the result is not
# always a real word.
# One stemmer instance is created and reused for every word.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in removed_stopWords]
print("--------------------\nWORDS AFTER STEMMING\n--------------------\n", stemmed)


# Lemmatization is similar to stemming but usually gives better results: it
# looks words up in a dictionary (WordNet) to find their base form.
# One lemmatizer instance is created and reused for every word.
lemmatizer = WordNetLemmatizer()
lemmed = [lemmatizer.lemmatize(word) for word in removed_stopWords]
print("-------------------------\nWORDS AFTER LEMMATIZATION\n-------------------------\n", lemmed)


# Lemmatization can reduce 'troubled' to its base form 'trouble', which is a real word
# (for verbs this requires passing pos='v'; lemmatize() treats words as nouns by default), whereas
# stemming simply cuts off the 'ed' part and yields 'troubl', which is not a correctly spelled word.





##### Now the text is ready for feature extraction

In [None]:
f.close() # close the file object `f` that was created when the corpus was loaded