In [1]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
!pip install nltk

In [None]:
nltk.download('stopwords')

#### Let's clean the text below by stemming, removing stop words and using `clean_str(raw_text)`

In [2]:
example_1="88*&6$#456999976674452345 Another normalization technique we'll apply is stemming, using the nltk 988756 h %6$@@56(*8^43@)"

#### Text stemming using `nltk.stem SnowballStemmer` :
*  --Another normalization technique we'll apply is stemming, using the nltk port stemmer and lemmintizer.
     The reason we are using this library is becaue we need to normalize our text input.
*    Here is a good article that explains the technique: https://towardsdatascience.com/stemming-corpus-with-nltk-7a6a6d02d3e5
*    We'll use snowball stemmer.

#### example from the article posted above:
`from nltk.stem import SnowballStemmer`

`snowball = SnowballStemmer(language='english')`

* As mentioned in the article above some words like `amazing` stem to nonsensical words like `amaz`.

In [3]:
>>> from nltk.stem import SnowballStemmer
>>> snowball = SnowballStemmer(language='english')
>>> snowball.stem('amazing') 

'amaz'

#### Removing stopwords article: https://pythonspot.com/nltk-stop-words/
* Another technique to help normalize our text is by removing stopwords.
* Stopwords are words that occour very often. 

*Quote from article above: "" The stopwords are a list of words that are very very common but don’t provide useful information for most text analysis procedures. While it is helpful for understand the structure of sentences, it does not help you understand the semantics of the sentences themselves. Here’s a list of most commonly used words in English:
"" `N = [ 'stop', 'the', 'to', 'and', 'a', 'in', 'it', 'is', 'I', 'that', 'had', 'on', 'for', 'were', 'was']`

In [4]:
>>> import nltk
>>> nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/m9/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#### Text cleaning :
* Removes numbers and special chars using regex `re`
* strips and turns text into lower case.
* removes punctuation 

In [6]:
def clean_str2(string):
    string = re.sub(r"[^A-Za-z0-9+\./()!?\'\`%$]", " ", string)
    return string.replace("\\n", " ")

def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

def clean_str(string):
    """
    string cleaning (partially modified)
    """
    string = re.sub(r"[^A-Za-z0-9()!?\'\`%$]", " ", string) 
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\$", " $ ", string) #yes, isolate $
    string = re.sub(r"\%", " % ", string) #yes, isolate %
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"(^|\W)\d+", "", string)
    # fixing XX X and xxx like as word
    string = re.sub(r'\S*(x{2,}|X{2,})\S*',"xxx",string)
    # removing non ascii
    string = re.sub(r'[^\x00-\x7F]+', "", string)
    return string.strip().lower()

def pre_process_text(text):
    text=clean_str(text)
    text=snowball.stem(text)
    text=' '.join(text_process(text))
    return text

In [7]:
pre_process_text(example_1)

'another normalization technique apply stemming using nltk h'