# Clean text for NLP

* [Guide to CleanText: A Python Package to Clean Raw Text Data](https://analyticsindiamag.com/guide-to-cleantext-a-python-package-to-clean-raw-text-data/)

```
from cleantext import clean

# Reference call listing the clean() options used in this note with the values
# shown in the linked guide; the no_* switches are all off here.
clean("some input",
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
    no_urls=False,                  # replace all URLs with a special token
    no_emails=False,                # replace all email addresses with a special token
    no_phone_numbers=False,         # replace all phone numbers with a special token
    no_numbers=False,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
    no_punct=False,                 # remove punctuations
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="<URL>",                # special token used for URLs (see no_urls)
    replace_with_email="<EMAIL>",            # special token used for email addresses (see no_emails)
    replace_with_phone_number="<PHONE>",     # special token used for phone numbers (see no_phone_numbers)
    replace_with_number="<NUMBER>",          # special token used for numbers (see no_numbers)
    replace_with_digit="0",                  # special token used for digits (see no_digits)
    replace_with_currency_symbol="<CUR>",    # special token used for currency symbols (see no_currency_symbols)
    lang="en"                       # set to 'de' for German special handling
)
```

In [4]:
!pip install clean-text unidecode

Collecting unidecode
  Downloading Unidecode-1.2.0-py2.py3-none-any.whl (241 kB)
     |████████████████████████████████| 241 kB 3.3 MB/s eta 0:00:01
Installing collected packages: unidecode
Successfully installed unidecode-1.2.0


In [15]:
import re
from cleantext import clean

In [68]:
# Demo input mixing a URL, an e-mail address, digits, a currency symbol,
# a contraction and a long run of punctuation/symbols.
# Backslashes that are meant literally are written doubled so the literal
# contains no unrecognised escape sequences (a SyntaxWarning on Python 3.12+);
# the resulting string value is unchanged.
sentence = """
Zürich has a famous website https://www.zuerich.com/ 
WHICH ACCEPTS 40,000 € and adding a random string, :
abc123def456ghi789zero0 for this demo. !!!&*^% tako.hoge@gmail.com' 
I Won't !*%$^&*#$#!!! ?? ? ~!@#$%^&*()_=+\\[\\]{}\\\\|;:\\-"\'<>.,/? pierod.
"""

def clean_text(sentences):
    """Strip a fixed set of symbols from *sentences*, then run ``clean`` on it.

    The characters ``~ = + | < > . ^`` are deleted first; the remaining text
    is handed to ``clean`` with every ``no_*`` switch enabled. URLs, e-mail
    addresses and phone numbers are configured to become placeholder tokens,
    while numbers, digits and currency symbols are configured to be replaced
    by the empty string.

    NOTE(review): because ``.`` is deleted before ``clean`` runs, URLs and
    e-mail addresses presumably reach ``clean`` without their dots and may no
    longer be recognised -- confirm whether that is intended.

    Returns whatever ``clean`` returns for the pre-stripped text.
    """
    # Delete these symbols up front, before clean() sees the text.
    pre_stripped = re.sub(r'[~=+|<>.^]+', "", sentences)

    options = {
        "fix_unicode": True,            # fix various unicode errors
        "to_ascii": True,               # transliterate to closest ASCII representation
        "lower": True,                  # lowercase text
        "no_line_breaks": True,         # fully strip line breaks instead of only normalizing them
        "no_urls": True,                # replace all URLs with a special token
        "no_emails": True,              # replace all email addresses with a special token
        "no_phone_numbers": True,       # replace all phone numbers with a special token
        "no_numbers": True,             # replace all numbers with a special token
        "no_digits": True,              # replace all digits with a special token
        "no_currency_symbols": True,    # replace all currency symbols with a special token
        "no_punct": True,               # remove punctuations
        "replace_with_punct": "",       # punctuation is removed rather than replaced
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "replace_with_number": "",
        "replace_with_digit": "",
        "replace_with_currency_symbol": "",
        "lang": "en",                   # set to 'de' for German special handling
    }
    return clean(pre_stripped, **options)

# Run the cleaner on the demo sentence. The function defined in this file is
# clean_text; the previous name clean_text_comment is not defined anywhere
# here and would raise NameError.
clean_text(sentence)

'zurich has a famous website httpswwwzuerichcom which accepts and adding a random string abcdefghizero for this demo takohogegmailcom i wont pierod'

In [39]:
def decontracted(sentences):
    """Lowercase *sentences* and expand common English contractions.

    Both the ASCII apostrophe (') and the typographic apostrophe (U+2019)
    are recognised, so text copied from word processors or web pages is
    expanded too.

    Irregular forms are rewritten first ("won't" -> "will not",
    "can't" -> "can not", "shan't" -> "shall not", "let's" -> "let us");
    the generic suffix rules then handle the rest ("n't" -> " not",
    "'re" -> " are", "'s" -> " is", "'d" -> " would", "'ll" -> " will",
    "'t" -> " not", "'ve" -> " have", "'m" -> " am").

    Note that "'s" is always expanded to " is", so possessives such as
    "john's" become "john is".

    Args:
        sentences: the text to process (a ``str``).

    Returns:
        The lowercased text with contractions expanded.
    """
    # Character class matching either apostrophe variant.
    apos = "['\u2019]"

    # Order matters: the specific rules must run before the general ones,
    # otherwise e.g. "won't" would become "wo not".
    rules = (
        # specific
        ("won" + apos + "t", "will not"),
        ("can" + apos + "t", "can not"),
        # Word boundaries keep these two from firing inside longer words
        # (e.g. "outlet's" must not turn into "outlet us").
        (r"\bshan" + apos + "t", "shall not"),
        (r"\blet" + apos + "s", "let us"),
        # general
        ("n" + apos + "t", " not"),
        (apos + "re", " are"),
        (apos + "s", " is"),
        (apos + "d", " would"),
        (apos + "ll", " will"),
        (apos + "t", " not"),
        (apos + "ve", " have"),
        (apos + "m", " am"),
    )

    sentences = sentences.lower()
    for pattern, expansion in rules:
        sentences = re.sub(pattern, expansion, sentences)
    return sentences


In [38]:
# Expand contractions first, then clean. The function defined in this file is
# clean_text; clean_text_comment is not defined anywhere here and would raise
# NameError.
clean_text(decontracted("I Won't !!!!"))

'i will not'