# Text cleaning with regular expressions

Python `re` module documentation: https://docs.python.org/3.8/library/re.html

RegEx tutorial: https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285

In [1]:
import re
import string

In [2]:
# Sample input mixing upper/lower case, digits, a hashtag, a mention and a URL.
text = (
    'Noisy Text Example with a lot of numbers and special symbols: '
    '12345, #abc, @@qwerty, http://xyz. Data cleaning Done'
)

In [3]:
# Convert the whole string to lowercase; the bare `text` expression on the
# last line makes the notebook display the result.
text = text.lower()
text

'noisy text example with a lot of numbers and special symbols: 12345, #abc, @@qwerty, http://xyz. data cleaning done'

In [4]:
# Remove mentions: an "@" followed by a run of non-whitespace characters is
# replaced with a single space.
# The pattern is a raw string so that "\S" is passed to the regex engine
# as-is instead of being treated as an (invalid) Python string escape.
text = re.sub(r"@\S+", " ", text)
text

'noisy text example with a lot of numbers and special symbols: 12345, #abc,   http://xyz. data cleaning done'

In [5]:
# Remove URLs: "http://" or "https://" followed by a run of non-whitespace
# characters is replaced with a single space.
# "s?" allows exactly zero or one "s", and requiring "://" keeps ordinary
# words that merely contain "http" (e.g. "httpd") from being deleted.
text = re.sub(r"https?://\S+", " ", text)
text

'noisy text example with a lot of numbers and special symbols: 12345, #abc,     data cleaning done'

In [6]:
# Remove hashtags: a "#" followed by a run of non-whitespace characters is
# replaced with a single space.
# Raw string: "\S" must reach the regex engine unmodified.
text = re.sub(r"#\S+", " ", text)
print(text)

noisy text example with a lot of numbers and special symbols: 12345,       data cleaning done


In [7]:
# Remove all digits: each digit character is replaced with a space
# (one space per digit; the extra spaces are collapsed in a later step).
# Raw string: "\d" must reach the regex engine unmodified.
text = re.sub(r"\d", " ", text)
text

'noisy text example with a lot of numbers and special symbols:      ,       data cleaning done'

In [8]:
# Remove punctuation: build a character class from every character in
# string.punctuation (escaped so regex metacharacters are taken literally)
# and replace each match with a space.
text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)
text

'noisy text example with a lot of numbers and special symbols               data cleaning done'

In [9]:
# Collapse extra whitespace: any run of two or more whitespace characters
# becomes a single space.
# Raw string: "\s" must reach the regex engine unmodified.
text = re.sub(r'\s{2,}', " ", text)
text

'noisy text example with a lot of numbers and special symbols data cleaning done'

In [10]:
from nltk.corpus import stopwords

# Remove stop words: keep only the space-separated tokens that are not in
# NLTK's English stop-word list.
# NOTE(review): presumably the "stopwords" corpus must already be downloaded
# (nltk.download("stopwords")) - confirm for your environment.
stop_words = stopwords.words("english")
text = ' '.join(filter(lambda token: token not in stop_words, text.split(' ')))
text

'noisy text example lot numbers special symbols data cleaning done'

In [11]:
# Example of a reusable text-cleaning function
def clean_text(text, stopword_list=None):
    """Normalize noisy text into lowercase, space-separated content words.

    The steps run in this order: lowercase the text; replace @mentions,
    http(s) URLs and #hashtags with a space; replace every digit and every
    character of ``string.punctuation`` with a space; split on whitespace;
    drop stop words; join the remaining tokens with single spaces.

    Args:
        text: The raw string to clean.
        stopword_list: Optional iterable of words to drop. When omitted
            (``None``), the module-level ``stop_words`` list is used, which
            keeps existing ``clean_text(text)`` calls working unchanged.

    Returns:
        The cleaned string, with no leading, trailing or repeated spaces.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # Build the lookup once: set membership is O(1) per token, whereas
    # testing against a list rescans it for every token.
    excluded = frozenset(stopword_list)

    text = text.lower()
    # All patterns are raw strings so "\S" and "\d" reach the regex engine
    # instead of being parsed as (invalid) Python string escapes.
    text = re.sub(r"@\S+", " ", text)
    # Require "://" so ordinary words that contain "http" are not removed.
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"#\S+", " ", text)
    text = re.sub(r"\d", " ", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), " ", text)

    # str.split() with no argument splits on any whitespace run (spaces,
    # tabs, newlines) and never yields empty tokens, so the result has no
    # stray leading/trailing spaces.
    return " ".join(word for word in text.split() if word not in excluded)