In [1]:
import numpy as np
import pandas as pd

# IMDB data acquisition

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## 1. Lowercasing

In [3]:
df['review'] = df['review'].str.lower()
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


## 2. Remove HTML Tags

In [4]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)
remove_html_tags('<html>dddas</html>')

'dddas'

In [5]:
df['review'] = df['review'].apply(remove_html_tags)
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


## 3. Remove Links

In [6]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|WWW\.\S+')
    return pattern.sub(r'', text)


In [7]:
text1 = 'https://www.kaggle.com/code/campusx/text-preprocessing/script'

In [8]:
remove_url(text1)

''

In [9]:
df['review'] = df['review'].apply(remove_url)

## 4. Remove Punctuation

In [10]:
import string, time

In [11]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
text = 'Text. with. Puncutation?'

In [13]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text

In [14]:
start = time.time()
print(remove_punc(text))
print((time.time()-start)*50000)

Text with Puncutation
8.988380432128906


In [15]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [16]:
start = time.time()
print(remove_punc1(text))
print((time.time()-start)*50000)

Text with Puncutation
16.09325408935547


In [17]:
df['review'] = df['review'].apply(remove_punc1)

## 5. Chat Word Treatment

In [18]:
chat_words = {'AFAIK': 'As Far As I Know', 'AFK': 'Away From Keyboard', 'ASAP': 'As Soon As Possible', 'ATK': 'At The Keyboard', 'ATM': 'At The Moment', 'A3': 'Anytime, Anywhere, Anyplace', 'BAK': 'Back At Keyboard', 'BBL': 'Be Back Later', 'BBS': 'Be Back Soon', 'BFN': 'Bye For Now', 'B4N': 'Bye For Now', 'BRB': 'Be Right Back', 'BRT': 'Be Right There', 'BTW': 'By The Way', 'B4': 'Before', 'CU': 'See You', 'CUL8R': 'See You Later', 'CYA': 'See You', 'FAQ': 'Frequently Asked Questions', 'FC': 'Fingers Crossed', 'FWIW': "For What It's Worth", 'FYI': 'For Your Information', 'GAL': 'Get A Life', 'GG': 'Good Game', 'GN': 'Good Night', 'GMTA': 'Great Minds Think Alike', 'GR8': 'Great!', 'G9': 'Genius', 'IC': 'I See', 'ICQ': 'I Seek you (also a chat program)', 'ILU': 'ILU: I Love You', 'IMHO': 'In My Honest/Humble Opinion', 'IMO': 'In My Opinion', 'IOW': 'In Other Words', 'IRL': 'In Real Life', 'KISS': 'Keep It Simple, Stupid', 'LDR': 'Long Distance Relationship', 'LMAO': 'Laugh My A.. Off', 'LOL': 'Laughing Out Loud', 'LTNS': 'Long Time No See', 'L8R': 'Later', 'MTE': 'My Thoughts Exactly', 'M8': 'Mate', 'NRN': 'No Reply Necessary', 'OIC': 'Oh I See', 'PITA': 'Pain In The A..', 'PRT': 'Party', 'PRW': 'Parents Are Watching', 'ROFL': 'Rolling On The Floor Laughing', 'ROFLOL': 'Rolling On The Floor Laughing Out Loud', 'ROTFLMAO': 'Rolling On The Floor Laughing My A.. Off', 'SK8': 'Skate', 'STATS': 'Your sex and age', 'ASL': 'Age, Sex, Location', 'THX': 'Thank You', 'TTFN': 'Ta-Ta For Now!', 'TTYL': 'Talk To You Later', 'U': 'You', 'U2': 'You Too', 'U4E': 'Yours For Ever', 'WB': 'Welcome Back', 'WTF': 'What The F...', 'WTG': 'Way To Go!', 'WUF': 'Where Are You From?', 'W8': 'Wait...', '7K': 'Sick:-D Laugher'}


In [19]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return ' '.join(new_text)

In [20]:
chat_conversion('IMHO he is the best')

'In My Honest/Humble Opinion he is the best'

In [21]:
df['review'] = df['review'].apply(chat_conversion)

## 6. Spelling Correction

In [22]:
! pip install textblob



In [24]:
from textblob import Textblob

ImportError: cannot import name 'Textblob' from 'textblob' (/home/manoj/anaconda3/lib/python3.9/site-packages/textblob/__init__.py)

## 7. Removing stop words

In [28]:
from nltk.corpus import stopwords

In [29]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [30]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [31]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    return ' '.join(new_text)

In [32]:
remove_stopwords(df['review'][3])

'basically theres  family   little boy jake thinks theres  zombie   closet  parents  fighting   timethis movie  slower   soap opera  suddenly jake decides  become rambo  kill  zombieok first    youre going  make  film  must decide    thriller   drama   drama  movie  watchable parents  divorcing arguing like  real life     jake   closet  totally ruins   film  expected  see  boogeyman similar movie  instead  watched  drama   meaningless thriller spots3   10    well playing parents descent dialogs    shots  jake  ignore '

In [33]:
# df['review'] = df['review'].apply(remove_stopwords)

## 8. Remove Emoji

In [34]:
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols and pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [35]:
remove_emoji('lmao 😛😅😂!')

'lmao !'

In [36]:
!pip install emoji



In [37]:
df['review'] = df['review'].apply(remove_emoji)

In [38]:
import emoji
print(emoji.demojize('😛😅😂'))

:face_with_tongue::grinning_face_with_sweat::face_with_tears_of_joy:


## 9. Tokenization

### 1. Split()

In [39]:
# word tokenization
sent1 = 'I am going to Delhi'
sent1.split()

['I', 'am', 'going', 'to', 'Delhi']

In [40]:
# sentence tokenization
sent2 = 'I am going to Delhi. I will stay there for 3 days. lets hope trip to be great'
sent2.split('.')

['I am going to Delhi',
 ' I will stay there for 3 days',
 ' lets hope trip to be great']

In [41]:
#problem with the split function
sent3 = 'I am going to delhi!'
sent3.split()

['I', 'am', 'going', 'to', 'delhi!']

In [42]:
sent4 = 'Where do you think I should go? I have 3 day holiday'
sent4.split('.')

['Where do you think I should go? I have 3 day holiday']

## 2. Regular Expression

In [43]:
import re
tokens = re.findall('[\w]+',sent3)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [44]:
text = 'Lorem Ipsum is the single greatest threat. We are not - we are not keeping up with other websites. Lorem Ipsum best not make any more threats to your website. It will be met with fire and fury like the world has never seen. Does everybody know that pig named Lorem Ipsum? An ‘extremely credible source’ has called my office and told me that Barack Obama’s placeholder text is a fraud.'
re.compile('[.!?]').split(text)

['Lorem Ipsum is the single greatest threat',
 ' We are not - we are not keeping up with other websites',
 ' Lorem Ipsum best not make any more threats to your website',
 ' It will be met with fire and fury like the world has never seen',
 ' Does everybody know that pig named Lorem Ipsum',
 ' An ‘extremely credible source’ has called my office and told me that Barack Obama’s placeholder text is a fraud',
 '']

### 3. NLTK

In [45]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [46]:
sent1 = 'I am going to Delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'Delhi', '!']

In [47]:
text = 'Lorem Ipsum is the single greatest threat. We are not - we are not keeping up with other websites. Lorem Ipsum best not make any more threats to your website. It will be met with fire and fury like the world has never seen. Does everybody know that pig named Lorem Ipsum? An ‘extremely credible source’ has called my office and told me that Barack Obama’s placeholder text is a fraud.'
sent_tokenize(text)

['Lorem Ipsum is the single greatest threat.',
 'We are not - we are not keeping up with other websites.',
 'Lorem Ipsum best not make any more threats to your website.',
 'It will be met with fire and fury like the world has never seen.',
 'Does everybody know that pig named Lorem Ipsum?',
 'An ‘extremely credible source’ has called my office and told me that Barack Obama’s placeholder text is a fraud.']

In [48]:
sent5 = 'I have a Ph.D in A.I'
sent6 = 'We are here to help! mail us at nks@gmail.com'
sent7 = 'A 5km  ride cost $10.50'

In [49]:
word_tokenize(sent5)

['I', 'have', 'a', 'Ph.D', 'in', 'A.I']

In [50]:
word_tokenize(sent6)

['We',
 'are',
 'here',
 'to',
 'help',
 '!',
 'mail',
 'us',
 'at',
 'nks',
 '@',
 'gmail.com']

In [51]:
word_tokenize(sent7)

['A', '5km', 'ride', 'cost', '$', '10.50']

### 4. Spacy

In [52]:
!pip install spacy



In [53]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 3.2 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [54]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [55]:
doc = nlp(sent5)
for token in doc:
    print(token)

I
have
a
Ph
.
D
in
A.I


In [56]:
doc = nlp(sent6)
for token in doc:
    print(token)

We
are
here
to
help
!
mail
us
at
nks@gmail.com


In [57]:
doc = nlp(sent7)
for token in doc:
    print(token)

A
5
km
 
ride
cost
$
10.50


In [58]:
doc = nlp(sent1)
for token in doc:
    print(token)

I
am
going
to
Delhi
!


In [59]:
doc = nlp(sent1)

## 10. Stemming

In [60]:
from nltk.stem.porter import PorterStemmer

In [61]:
ps = PorterStemmer()
def stem_words(text):
    return ' '.join(ps.stem(word) for word in text.split())


In [62]:
text = 'walk walks walking walked'
stem_words(text)

'walk walk walk walk'

In [63]:
stem_words(df['review'][6])

'i sure would like to see a resurrect of a up date seahunt seri with the tech they have today it would bring back the kid excit in mei grew up on black and white tv and seahunt with gunsmok were my hero everi weekyou have my vote for a comeback of a new sea huntw need a chang of pace in tv and thi would work for a world of under water adventureoh by the way thank you for an outlet like thi to view mani viewpoint about tv and the mani moviesso ani ole way i believ ive got what i wanna saywould be nice to read some more plu point about sea huntif my rhyme would be 10 line would you let me submitor leav me out to be in doubt and have me to quitif thi is so then i must go so let do it'

Stemming - little faster,used when output not to be shown to the user<br>
Lemmatization - slower,used when output to be shown to the user

## 11. Lemmatization

<b>Lemmatization</b> reduces inflected word properly ensuring that the root word belongs to the language.<br>
Root word is called Lemma. Lemma is canonical form, dictionary form, citation form of a set of words.

In [69]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = df['review'][200]
print('{0:20}{1:20}'.format('Word', 'Lemma'))
for word in sentence:
    print('{0:20}{1:20}'.format(word, wordnet_lemmatizer.lemmatize(word,pos='v')))

Word                Lemma               
i                   i                   
n                   n                   
t                   t                   
e                   e                   
r                   r                   
e                   e                   
s                   s                   
t                   t                   
i                   i                   
n                   n                   
g                   g                   
                                        
a                   a                   
n                   n                   
d                   d                   
                                        
s                   s                   
h                   h                   
o                   o                   
r                   r                   
t                   t                   
                                        
t                   t                   
e               