## Cleaning Text Data

In [1]:
import re

In [4]:
# Sample tweet used for the tokenization demos. The backslash at the end of each
# physical line continues the string literal without inserting a newline, and
# because there is no space before the backslash here, 'Rajpath,' runs straight
# into 'New' and '@india_official' into '@indian_army' in the resulting string.
sentence = 'Sunil tweeted,"Witnessing 70th Republic Day of India from Rajpath,\
New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official\
@indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"'

In [7]:
# Collapse every run of punctuation/underscore characters into one space, then
# split the result on whitespace to obtain the word tokens.
re.compile(r'([^\s\w]|_)+').sub(' ', sentence).split()

['Sunil',
 'tweeted',
 'Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 'New',
 'Delhi',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army',
 'Awesome',
 'airshow',
 'india',
 'official',
 'indian',
 'army',
 'India',
 '70thRepublic',
 'Day',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 'photoking',
 'com']

In [11]:
import re
def n_gram_extractor(sentence,n):
    """Print every n-gram of consecutive word tokens found in ``sentence``.

    The sentence is tokenized by replacing each run of punctuation or
    underscore characters with a space and splitting on whitespace. Each
    n-gram is printed on its own line as a list of ``n`` tokens; nothing is
    printed when the sentence has fewer than ``n`` tokens.

    Args:
        sentence: Text to tokenize.
        n: Size of each n-gram; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard n == 0 prints empty lists and a negative n
        # prints arbitrary slices of the token list.
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    tokens = re.sub(r'([^\s\w]|_)+',' ',sentence).split()
    for start in range(len(tokens)-n+1):
        print(tokens[start:start+n])

In [12]:
## Bi-gram
n_gram_extractor('The cute little boy is playing with the kitten.',2)

['The', 'cute']
['cute', 'little']
['little', 'boy']
['boy', 'is']
['is', 'playing']
['playing', 'with']
['with', 'the']
['the', 'kitten']


In [16]:
## Tri-grams
n_gram_extractor('The cute little boy is playing with the kitten.',3)

['The', 'cute', 'little']
['cute', 'little', 'boy']
['little', 'boy', 'is']
['boy', 'is', 'playing']
['is', 'playing', 'with']
['playing', 'with', 'the']
['with', 'the', 'kitten']


In [18]:
from nltk import ngrams
# Tokens come from a plain whitespace split, so punctuation stays attached:
# the final bigram in the recorded output ends in 'kitten.' rather than 'kitten'.
list(ngrams('The cute little boy is playing with the kitten.'.split(),2))

[('The', 'cute'),
 ('cute', 'little'),
 ('little', 'boy'),
 ('boy', 'is'),
 ('is', 'playing'),
 ('playing', 'with'),
 ('with', 'the'),
 ('the', 'kitten.')]

In [20]:
list(ngrams('The cute little boy is playing with the kitten.'.split(), 3))

[('The', 'cute', 'little'),
 ('cute', 'little', 'boy'),
 ('little', 'boy', 'is'),
 ('boy', 'is', 'playing'),
 ('is', 'playing', 'with'),
 ('playing', 'with', 'the'),
 ('with', 'the', 'kitten.')]

In [25]:
from textblob import TextBlob
# NOTE(review): 'witht' looks like a typo for 'with'. It is left untouched because
# the recorded n-gram outputs of this cell and the following one contain the same token.
blob = TextBlob("The cute little boy is playing witht the kitten")
# Bigrams of the blob's words.
blob.ngrams(n=2)

[WordList(['The', 'cute']),
 WordList(['cute', 'little']),
 WordList(['little', 'boy']),
 WordList(['boy', 'is']),
 WordList(['is', 'playing']),
 WordList(['playing', 'witht']),
 WordList(['witht', 'the']),
 WordList(['the', 'kitten'])]

In [26]:
blob.ngrams(n=3)

[WordList(['The', 'cute', 'little']),
 WordList(['cute', 'little', 'boy']),
 WordList(['little', 'boy', 'is']),
 WordList(['boy', 'is', 'playing']),
 WordList(['is', 'playing', 'witht']),
 WordList(['playing', 'witht', 'the']),
 WordList(['witht', 'the', 'kitten'])]

In [28]:
sentence = 'Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, \
New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official \
@indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"'

In [33]:
from keras.preprocessing.text import text_to_word_sequence
from textblob import TextBlob

In [35]:
text_to_word_sequence(sentence)

['sunil',
 'tweeted',
 'witnessing',
 '70th',
 'republic',
 'day',
 'of',
 'india',
 'from',
 'rajpath',
 'new',
 'delhi',
 'mesmerizing',
 'performance',
 'by',
 'indian',
 'army',
 'awesome',
 'airshow',
 'india',
 'official',
 'indian',
 'army',
 'india',
 '70threpublic',
 'day',
 'for',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 'photoking',
 'com']

In [40]:
blob = TextBlob(sentence)
blob.words

WordList(['Sunil', 'tweeted', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', 'New', 'Delhi', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', 'Awesome', 'airshow', 'india_official', 'indian_army', 'India', '70thRepublic_Day', 'For', 'more', 'photos', 'ping', 'me', 'sunil', 'photoking.com'])

In [44]:
sentence = 'Sunil tweeted, "Witnessing 70th Republic Day of India from Rajpath, \
New Delhi. Mesmerizing performance by Indian Army! Awesome airshow! @india_official \
@indian_army #India #70thRepublic_Day. For more photos ping me sunil@photoking.com :)"'

## TweetTokenizer

In [45]:
from nltk.tokenize import TweetTokenizer
# Tokenizer built with its default options; in the recorded output it keeps the
# @mentions, #hashtags, the e-mail address and the ':)' emoticon as single tokens.
tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize(sentence)

['Sunil',
 'tweeted',
 ',',
 '"',
 'Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 ',',
 'New',
 'Delhi',
 '.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army',
 '!',
 'Awesome',
 'airshow',
 '!',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day',
 '.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil@photoking.com',
 ':)',
 '"']

## MWE tokenizer

In [47]:
from nltk.tokenize import MWETokenizer
# Register the multi-word expressions that should be merged into single tokens
# (joined with '_' in the recorded output, e.g. 'Republic_Day').
mwe_tokenizer = MWETokenizer([('Republic','Day')])
# Fixed typo: the tweet says 'Indian Army', not 'Indian Amy'.
# NOTE(review): the input is split on whitespace only, so the actual token is
# 'Army!' (punctuation attached) and this expression still does not match;
# strip punctuation before tokenizing if the merge is wanted.
mwe_tokenizer.add_mwe(('Indian','Army'))
mwe_tokenizer.tokenize(sentence.split())

['Sunil',
 'tweeted,',
 '"Witnessing',
 '70th',
 'Republic_Day',
 'of',
 'India',
 'from',
 'Rajpath,',
 'New',
 'Delhi.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army!',
 'Awesome',
 'airshow!',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil@photoking.com',
 ':)"']

## regular expression tokenizer

In [49]:
from nltk.tokenize import RegexpTokenizer
# Raw string so the regex escapes (\w, \$, \d, \., \S) reach the regex engine
# verbatim instead of being parsed as (invalid) Python string escapes, which
# emits a DeprecationWarning/SyntaxWarning on recent Python versions.
# Alternatives are tried in order: a run of word characters, a '$'-prefixed
# amount, or any other run of non-whitespace characters.
reg_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
reg_tokenizer.tokenize(sentence)

['Sunil',
 'tweeted',
 ',',
 '"Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 ',',
 'New',
 'Delhi',
 '.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army',
 '!',
 'Awesome',
 'airshow',
 '!',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 '@photoking.com',
 ':)"']

## whitespace Tokenizer

In [59]:
from nltk.tokenize import WhitespaceTokenizer
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence)

['Sunil',
 'tweeted,',
 '"Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath,',
 'New',
 'Delhi.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army!',
 'Awesome',
 'airshow!',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil@photoking.com',
 ':)"']

## WordPunct tokenizer


In [52]:
from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence)

['Sunil',
 'tweeted',
 ',',
 '"',
 'Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 ',',
 'New',
 'Delhi',
 '.',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army',
 '!',
 'Awesome',
 'airshow',
 '!',
 '@',
 'india_official',
 '@',
 'indian_army',
 '#',
 'India',
 '#',
 '70thRepublic_Day',
 '.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 '@',
 'photoking',
 '.',
 'com',
 ':)"']

In [60]:
sentence = "I love playing football"

In [61]:
from nltk.stem import RegexpStemmer
# Strip a trailing 'ing' from each word; min=4 is handed to RegexpStemmer
# (presumably the minimum word length eligible for stemming — confirm in the NLTK docs).
regex_stemmer = RegexpStemmer('ing$',min=4)
' '.join(map(regex_stemmer.stem, sentence.split()))

'I love play football'

## porter stemmer

In [63]:
# NOTE(review): there is no space after the comma in 'eating,it', so a whitespace
# split keeps it as one token (see 'eating,it' in the recorded stemmed output).
sentence = "Before eating,it would be nice to sanitize your hands with a sanitizer"
# NOTE(review): wildcard import; PorterStemmer is the only name from this module
# used in the visible cells, so an explicit import would be clearer.
from nltk.stem.porter import *

In [66]:
ps_stemmer = PorterStemmer()
# Stem each whitespace-separated word and stitch the stems back into one string.
' '.join(map(ps_stemmer.stem, sentence.split()))

'befor eating,it would be nice to sanit your hand with a sanit'

## lemmatization

In [67]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
# Fetch the WordNet corpus for the lemmatizer (the recorded log shows it was
# already up to date on the author's machine).
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# Example sentence containing several inflected forms of 'produce'/'product'.
sentence = "The products produced by the process today are far better than what it produces generally"

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lizey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [68]:
# Lemmatize every token (no part-of-speech tag is supplied) and re-join them.
# In the recorded output only 'products' and 'produces' change; 'produced' is kept.
' '.join(map(lemmatizer.lemmatize, word_tokenize(sentence)))

'The product produced by the process today are far better than what it produce generally'

## Singularizing and Pluralizing Words

In [69]:
from textblob import TextBlob
sentence = TextBlob('She sells seashells on the seashore')

In [70]:
sentence.words

WordList(['She', 'sells', 'seashells', 'on', 'the', 'seashore'])

In [72]:
sentence.words[1].singularize()

'sell'

In [73]:
sentence.words[5].pluralize()

'seashores'

## language translation

In [75]:
from textblob import TextBlob

In [120]:
en_blob = TextBlob('hello! zeyu li')
# Translate from English to Simplified Chinese.
# NOTE(review): TextBlob.translate() relies on an external translation service and is
# believed to be deprecated/removed in newer TextBlob releases — confirm against the
# installed version before relying on this cell.
en_blob.translate(from_lang='en',to = 'zh-CN')

TextBlob("你好！李泽宇")

## STOP-WORD REMOVAL

In [90]:
from nltk import word_tokenize
sentence = "She sells seashells on the seashore"

In [93]:
custom_stop_word_list = ['she','on','the','am','is','not']
# Keep only the tokens whose lower-cased form is absent from the stop-word list.
' '.join(filter(lambda token: token.lower() not in custom_stop_word_list,
                word_tokenize(sentence)))

'sells seashells seashore'

## EXTRACTING GENERAL FEATURES FROM RAW TEXT: Feature Extraction from Texts

In [97]:
import pandas as pd

In [99]:
# Four example sentences held in a single 'text' column.
df = pd.DataFrame({'text': [
    'The interim budget for 2019 will be announced on 1st February.',
    'Do you know how much expectation the middle-class working population is having from this budget?',
    'February is the shortest month in a year.',
    'This financial year will end on 31st March.',
]})
df

Unnamed: 0,text
0,The interim budget for 2019 will be announced ...
1,Do you know how much expectation the middle-cl...
2,February is the shortest month in a year.
3,This financial year will end on 31st March.


In [100]:
# Count the words in each sentence.
from textblob import TextBlob
# TextBlob(...).words is the blob's word list; its length is stored per row.
df['number_of_words'] = df['text'].apply(lambda x : len(TextBlob(str(x)).words))
df['number_of_words']

0    11
1    15
2     8
3     8
Name: number_of_words, dtype: int64

In [114]:
# Flag the sentences that contain a wh-word (why, who, which, what, where, when, how).
wh_words = set(['why', 'who', 'which', 'what', 'where', 'when', 'how'])
# Words are lower-cased before the lookup so that a capitalized wh-word, e.g. a
# sentence-initial 'What' or 'How', is detected too. isdisjoint() already yields
# a boolean, so no `True if ... else False` wrapper is needed.
df['is_wh_words_present'] = df['text'].apply(
    lambda x: not wh_words.isdisjoint(word.lower() for word in TextBlob(str(x)).words))

In [115]:
df['is_wh_words_present']

0    False
1     True
2    False
3    False
Name: is_wh_words_present, dtype: bool

In [116]:
# Extract the sentiment polarity score of each sentence
# (TextBlob(...).sentiment.polarity, one float per row).
df['polarity'] = df['text'].apply(lambda x : TextBlob(str(x)).sentiment.polarity)
df['polarity']

0    0.0
1    0.2
2    0.0
3    0.0
Name: polarity, dtype: float64

In [118]:
# Extract the subjectivity score of each sentence
# (TextBlob(...).sentiment.subjectivity, one float per row).
df['subjectivity'] = df['text'].apply(lambda x :TextBlob(str(x)).sentiment.subjectivity)
df['subjectivity']

0    0.0
1    0.2
2    0.0
3    0.0
Name: subjectivity, dtype: float64

In [122]:
# Detect the language of each sentence (the recorded output is 'en' for every row).
# NOTE(review): TextBlob.detect_language() relies on an external service and is
# believed to be deprecated/removed in newer TextBlob releases — confirm against
# the installed version before relying on this cell.
df['language']=df['text'].apply(lambda x :
                               TextBlob(str(x)).detect_language())
df['language']

0    en
1    en
2    en
3    en
Name: language, dtype: object

In [123]:
df

Unnamed: 0,text,number_of_words,is_wh_words_present,polarity,subjectivity,language
0,The interim budget for 2019 will be announced ...,11,False,0.0,0.0,en
1,Do you know how much expectation the middle-cl...,15,True,0.2,0.2,en
2,February is the shortest month in a year.,8,False,0.0,0.0,en
3,This financial year will end on 31st March.,8,False,0.0,0.0,en


## EXTRACTING GENERAL FEATURES FROM TEXT

In [124]:
import pandas as pd
import nltk

In [129]:
data = pd.read_csv('data.csv')

In [134]:
# Length of each text in characters. len() of the plain string gives the same
# number as len(TextBlob(...)) without building a TextBlob object per row.
# NOTE(review): this counts every character (spaces and punctuation included),
# not only letters, despite the column name.
data['number_letters'] = data['text'].apply(lambda x: len(str(x)))
data['number_letters']

0      23
1      21
2      36
3      18
4      15
5      20
6      52
7      83
8      63
9      54
10     90
11     71
12     67
13    129
14     92
Name: number_letters, dtype: int64

In [133]:
# NOTE(review): `digits` is not assigned in any visible cell, and this cell's
# execution count (133) precedes that of the cell above it (134). On a fresh
# kernel this likely raises NameError unless it is defined elsewhere — confirm.
digits

0      23
1      21
2      36
3      18
4      15
5      20
6      52
7      83
8      63
9      54
10     90
11     71
12     67
13    129
14     92
Name: number_letters, dtype: int64