<a href="https://colab.research.google.com/github/manalibhoir22/manali/blob/master/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Cleaning the data 
Remove newline characters (`\n`)

Remove emojis if any

Remove punctuation marks

Remove extra spaces

Remove stopwords — stopwords are words that occur very frequently but are not needed for the analysis because they carry little information. Removing them reduces the computational load. Examples include I, me, myself, that, him, etc.

In [5]:
import string
import nltk 
from nltk.corpus import stopwords
from nltk import PorterStemmer
import re
from nltk.tokenize import word_tokenize

In [6]:
# Compiled once when the cell runs instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "\u200D"                 # zero-width joiner used in emoji sequences
    "]+",
    flags=re.UNICODE,
)


def deEmojify(text):
    """Return ``text`` with emoji characters removed.

    Every run of characters from the emoji-related Unicode ranges listed in
    ``_EMOJI_PATTERN`` is deleted. The variation selector and zero-width
    joiner are removed as well, so multi-codepoint emoji sequences do not
    leave invisible characters behind. All other characters, including the
    whitespace around an emoji, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched characters.
    """
    return _EMOJI_PATTERN.sub("", text)

In [77]:
# Hand-picked stop words; tokens equal to one of these are dropped during cleaning.
stop_words = [
    'in', 'of', 'at', 'a', 'the', 'and',
    'is', 'on', 'an', 'they', 'was', 'it',
    'i', 'them', 'to', 'these', 'this', 'for',
]

In [78]:
# Matches both the ASCII apostrophe and the typographic one (U+2019).
_APOSTROPHE = "['\u2019]"

# Irregular contractions whose stem changes. They run before the generic
# "n't" rule, which would otherwise produce "wo not" / "ca not" / "sha not".
# Matching is case-insensitive so sentence-initial forms are covered too.
_IRREGULAR_CONTRACTIONS = (
    (re.compile("won" + _APOSTROPHE + "t", re.IGNORECASE), "will not"),
    (re.compile("can" + _APOSTROPHE + "t", re.IGNORECASE), "can not"),
    (re.compile("shan" + _APOSTROPHE + "t", re.IGNORECASE), "shall not"),
)

# Generic suffix rules, applied in this order. They stay case-sensitive so
# that capitalised names such as "O'Reilly" are not rewritten.
_SUFFIX_CONTRACTIONS = (
    (re.compile("n" + _APOSTROPHE + "t"), " not"),
    (re.compile(_APOSTROPHE + "re"), " are"),
    (re.compile(_APOSTROPHE + "s"), " is"),
    (re.compile(_APOSTROPHE + "d"), " would"),
    (re.compile(_APOSTROPHE + "ll"), " will"),
    (re.compile(_APOSTROPHE + "t"), " not"),
    (re.compile(_APOSTROPHE + "ve"), " have"),
    (re.compile(_APOSTROPHE + "m"), " am"),
)


def _expand_keeping_capital(expansion):
    """Build a ``re.sub`` callback that capitalises ``expansion`` when the match is capitalised."""
    def _replace(match):
        if match.group(0)[0].isupper():
            return expansion.capitalize()
        return expansion
    return _replace


def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Irregular forms ("won't", "can't", "shan't") are expanded first, in any
    letter case, keeping a leading capital ("Can't" -> "Can not"). The
    generic suffix rules then run in a fixed order: n't, 're, 's, 'd, 'll,
    't, 've, 'm. Both ``'`` and the typographic apostrophe are recognised.

    Note that every "'s" is expanded to " is", so possessives such as
    "her's" become "her is".

    Args:
        phrase: The text to expand.

    Returns:
        The text with contractions replaced by their expanded forms.
    """
    for pattern, expansion in _IRREGULAR_CONTRACTIONS:
        phrase = pattern.sub(_expand_keeping_capital(expansion), phrase)
    for pattern, expansion in _SUFFIX_CONTRACTIONS:
        phrase = pattern.sub(expansion, phrase)
    return phrase

#### USING SNOWBALL

In [79]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
# Fetch the WordNet corpus before the lemmatizer created below is used.
nltk.download("wordnet")

# Shared English Snowball stemmer for the demo cells.
snow_stemmer = SnowballStemmer("english")

# Shared WordNet lemmatizer for the demo cells.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [80]:
def clean_text(text, stem=False):
    """Clean a raw text string and return its normalised tokens as one string.

    Steps, in order: expand contractions (``decontracted``), strip emojis
    (``deEmojify``), delete ASCII punctuation, lower-case, split on
    whitespace, drop every token found in ``stop_words``, then normalise
    each remaining token.

    Args:
        text: Raw input string.
        stem: Selects the token normaliser. ``False`` (default) lemmatizes
            with ``WordNetLemmatizer``. ``True`` stems with the English
            ``SnowballStemmer`` instead.

    Returns:
        The cleaned tokens joined by single spaces, with no leading or
        trailing whitespace. An empty or all-whitespace input gives ``""``.
    """
    text = decontracted(text)
    text = deEmojify(text)

    # Delete (not replace) every ASCII punctuation character.
    text_cleaned = text.translate(str.maketrans('', '', string.punctuation))
    text_cleaned = text_cleaned.lower()

    # split() with no argument splits on any run of whitespace, so newlines,
    # tabs and repeated or leading/trailing spaces never yield empty tokens.
    tokens = [token for token in text_cleaned.split() if token not in stop_words]

    # Exactly one normaliser is applied; lemmatization is the default.
    if stem:
        normalise = SnowballStemmer(language='english').stem
    else:
        normalise = WordNetLemmatizer().lemmatize

    return " ".join(normalise(token) for token in tokens)

In [81]:
# Demo: a review packed with contractions; the cell displays the cleaned string.
clean_text(
    "i won't review this product i'd've gone for another, "
    "I'm veryyyy disappointed her's was good, "
    "i've and can't i'll be bad it's"
)


'will not review product would have gone another am veryyyy disappointed her good have can not will be bad'

In [13]:
import nltk
from nltk.tokenize import word_tokenize
# Download NLTK's 'punkt' tokenizer data (presumably for the word_tokenize import above — confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [87]:
# Demo: clean a realistic product review with mixed case, punctuation and an emoticon.
text = (
    "These are awesome and make my phone look so stylish! "
    "I have only used one so far and I've had it on for almost a year! "
    "CAN YOU BELIEVE THAT! ONE YEAR!! It's Great quality! :)"
)
clean_text(text)

'are awesome make my phone look so stylish have only used one so far have had almost year can you believe that one year great quality '

In [None]:
# Tokenize a sample review with NLTK's word_tokenize and print the tokens.
# NOTE(review): reconstructed from a dangling string fragment — confirm this was the intent.
review = 'I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality! :)'
tokenized_word = word_tokenize(review)
print(tokenized_word)

In [3]:
# Print each word of a sample sentence next to its Snowball stem.
phrase = 'I am meeting him tomorrow at the meeting'
for word in phrase.split():
    print(f'{word} --> {snow_stemmer.stem(word)}')

I --> i
am --> am
meeting --> meet
him --> him
tomorrow --> tomorrow
at --> at
the --> the
meeting --> meet


In [45]:
# Sample words for comparing stems against lemmas.
words = [
    'working', 'work', 'worked',
    'run', 'runs', 'running',
    'easily', 'fairly', 'universal', 'mice',
]

In [46]:
# One row per word: the word, its Snowball stem (both padded to 10 columns) and its WordNet lemma.
snow_stemmer = SnowballStemmer('english')
for word in words:
    print('{:10} {:10} {} '.format(word, snow_stemmer.stem(word), lemmatizer.lemmatize(word)))

working    work       working 
work       work       work 
worked     work       worked 
run        run        run 
runs       run        run 
running    run        running 
easily     easili     easily 
fairly     fair       fairly 
universal  univers    universal 
better     better     better 
mice       mice       mouse 


In [75]:
# Bold header (ANSI escape codes), then word / stem / lemma for each word of the sentence.
print('\033[1m Word     Stem      Lemma \033[0m')

phrase = 'I am meeting him tomorrow at the meeting'
for word in phrase.split():
    print('{:10} {:10}{}'.format(word, snow_stemmer.stem(word), lemmatizer.lemmatize(word)))

[1m Word     Stem      Lemma [0m
I          i         I
am         am        am
meeting    meet      meeting
him        him       him
tomorrow   tomorrow  tomorrow
at         at        at
the        the       the
meeting    meet      meeting
