In [None]:
import pandas as pd
import textwrap

In [3]:
def print_wrapped(text, max_cols=80):
    wrapped = textwrap.fill(text, width=max_cols)
    print(wrapped)

In [16]:
data = pd.read_csv("./../DATA/IMDB Dataset.csv")

In [5]:
# shape (m,n)
data.shape

(50000, 2)

In [None]:
# display basic info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
# peek
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [17]:
# peek one review
print_wrapped(data["review"][0])

One of the other reviewers has mentioned that after watching just 1 Oz episode
you'll be hooked. They are right, as this is exactly what happened with me.<br
/><br />The first thing that struck me about Oz was its brutality and
unflinching scenes of violence, which set in right from the word GO. Trust me,
this is not a show for the faint hearted or timid. This show pulls no punches
with regards to drugs, sex or violence. Its is hardcore, in the classic use of
the word.<br /><br />It is called OZ as that is the nickname given to the Oswald
Maximum Security State Penitentary. It focuses mainly on Emerald City, an
experimental section of the prison where all the cells have glass fronts and
face inwards, so privacy is not high on the agenda. Em City is home to
many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and
more....so scuffles, death stares, dodgy dealings and shady agreements are never
far away.<br /><br />I would say the main appeal of the show is due to the fac

In [18]:
# make review col lower case
data["review"] = data.review.str.lower()

In [19]:
# peek one review
data.head(3)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive


## remove html tags

In [20]:
import re 

def remove_html(text):
    text = re.sub(r"<.*?>", "", text)
    return text

In [None]:
print_wrapped(data.review[1])

a wonderful little production. <br /><br />the filming technique is very
unassuming- very old-time-bbc fashion and gives a comforting, and sometimes
discomforting, sense of realism to the entire piece. <br /><br />the actors are
extremely well chosen- michael sheen not only "has got all the polari" but he
has all the voices down pat too! you can truly see the seamless editing guided
by the references to williams' diary entries, not only is it well worth the
watching but it is a terrificly written and performed piece. a masterful
production about one of the great master's of comedy and his life. <br /><br
/>the realism really comes home with the little things: the fantasy of the guard
which, rather than use the traditional 'dream' techniques remains solid then
disappears. it plays on our knowledge and our senses, particularly with the
scenes concerning orton and halliwell and the sets (particularly of their flat
with halliwell's murals decorating every surface) are terribly well done.


In [33]:
data.review = data.review.apply(remove_html)

In [34]:
print_wrapped(data.review[1])

a wonderful little production. the filming technique is very unassuming- very
old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense
of realism to the entire piece. the actors are extremely well chosen- michael
sheen not only "has got all the polari" but he has all the voices down pat too!
you can truly see the seamless editing guided by the references to williams'
diary entries, not only is it well worth the watching but it is a terrificly
written and performed piece. a masterful production about one of the great
master's of comedy and his life. the realism really comes home with the little
things: the fantasy of the guard which, rather than use the traditional 'dream'
techniques remains solid then disappears. it plays on our knowledge and our
senses, particularly with the scenes concerning orton and halliwell and the sets
(particularly of their flat with halliwell's murals decorating every surface)
are terribly well done.


## remove url

In [40]:
def remove_url(text):
    text = re.sub(r"http\S+|www\.\S+", "", text).strip()
    return text

In [41]:
text1 = "Check out my youtube https://www.youtube.com/"
text1

'Check out my youtube https://www.youtube.com/'

In [42]:
remove_url(text1)

'Check out my youtube'

In [43]:
data.review = data.review.apply(remove_url)

## remove punctuation

In [45]:
import string

exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [46]:
def remove_punctuation(text: str):
    return text.translate(str.maketrans("", "", exclude))

In [47]:
data.review = data.review.apply(remove_punctuation)

In [48]:
data.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive


## replace chat abbreviations

In [50]:
chat_words = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'BAE': 'Before Anyone Else',
    'BF': 'Boyfriend',
    'BFF': 'Best Friends Forever',
    'BRB': 'Be Right Back',
    'BTW': 'By The Way',
    'CEO': 'Used humorously, like "CEO of sleeping late"',
    'DM': 'Direct Message',
    'FAQ': 'Frequently Asked Questions',
    'FOMO': 'Fear Of Missing Out',
    'FYA': 'For Your Action',
    'FYI': 'For Your Information',
    'GF': 'Girlfriend',
    'GG': 'Good Game',
    'GLHF': 'Good Luck, Have Fun',
    'GOAT': 'Greatest Of All Time',
    'GTG': 'Got To Go',
    'HBU': 'How About You?',
    'ICYMI': 'In Case You Missed It',
    'IDC': "I Don't Care",
    'IDGAF': "I Don't Give A F***",
    'IDK': "I Don't Know",
    'ILY': 'I Love You',
    'IMHO': 'In My Humble Opinion',
    'IMY': 'I Miss You',
    'IMO': 'In My Opinion',
    'LFG': "Let's F***ing Go",
    'LMAO': 'Laughing My Ass Off',
    'LOL': 'Laugh Out Loud',
    'NOOB': 'Newbie / Beginner',
    'NVM': 'Never Mind',
    'NSFW': 'Not Safe For Work',
    'OOMF': 'One Of My Followers',
    'OMG': 'Oh My God',
    'OTP': 'One True Pairing',
    'POV': 'Point Of View',
    'RIZZ': 'Charisma / Ability to flirt (slang)',
    'RN': 'Right Now',
    'ROFL': 'Rolling On the Floor Laughing',
    'SMH': 'Shaking My Head',
    'SUS': 'Suspicious',
    'TBH': 'To Be Honest',
    'TBF': 'To Be Fair',
    'TGIF': "Thank God It's Friday",
    'TMI': 'Too Much Information',
    'TT': 'TikTok',
    'TTYL': 'Talk To You Later',
    'TTYT': 'Talk To You Tomorrow',
    'WTF': 'What The F***',
    'WYD': 'What You Doing?',
    'YOLO': 'You Only Live Once'
}


In [51]:
def abreviation_to_word(text):
    new_text = []
    for word in text.split():
        val = word.upper()
        new_text.append(chat_words.get(val, word))
    return " ".join(new_text)

In [55]:
abreviation_to_word('I need help ASAP')

'I need help As Soon As Possible'

## spelling correction 

In [57]:
from textblob import TextBlob

In [58]:
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

In [67]:
TextBlob(incorrect_text).correct().string

'certain conditions during several generations are modified in the same manner.'

In [69]:
import nltk
from nltk.corpus import stopwords

In [71]:
eng_stopwords = stopwords.words("english")
eng_stopwords[:len(eng_stopwords):25], len(eng_stopwords)

(['a', 'but', "hadn't", 'in', 'needn', 'she', "they'll", 'when'], 198)

In [72]:
fr_stopwords = stopwords.words("french")
fr_stopwords[:len(fr_stopwords):25], len(fr_stopwords)

(['au', 'mes', 'tu', 'es', 'furent', 'aurai', 'aient'], 157)

In [73]:
def remove_stopwords(text: str):
    text = " ".join([word for word in text.split() if word not in eng_stopwords])
    return text

In [82]:
sent = "probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times"
f"len : {len(sent)}"

'len : 208'

In [80]:
print_wrapped(remove_stopwords(sent))
print("len : ", len(remove_stopwords(sent)))

probably all-time favorite movie, story selflessness, sacrifice dedication noble
cause, preachy boring. never gets old, despite seen 15 times
len :  141


In [83]:
data.review = data.review.apply(remove_stopwords)

In [85]:
data.head(2)

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive


In [86]:
import emoji

In [89]:
def remove_emoji(text: str):
    return emoji.replace_emoji(text, replace="").strip()

In [90]:
remove_emoji("Python is 🔥")

'Python is'

In [None]:
# extract emoji meaning 
emoji.demojize("Loved the movie. It was 😘")

'Loved the movie. It was :face_blowing_a_kiss:'

## Tokenization
### 1. split function

In [102]:
# word tokens
"I am going to Paris!".split()

['I', 'am', 'going', 'to', 'Paris!']

In [None]:
# sentence tokens
"I am going to Rome. I will stay there for 3 days. Let's hope the trip to be great".split(".")

['I am going to Rome',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [97]:
# limit with split function, punctuation is part of a word
print("I am going to London!".split())
print("Where do you think I should go? I have 3 day holiday.".split("."))

['I', 'am', 'going', 'to', 'London!']
['Where do you think I should go? I have 3 day holiday', '']


### 2. Regular Expression

In [None]:
# word tokens
sent = "I am going to Paris!"
tokens = re.findall(r"[\w']+", sent)
tokens

['I', 'am', 'going', 'to', 'Paris']

In [99]:
# sentence tokens
sent = "Where do you think I should go? I have 3 day holiday."
tokens = re.compile(r'[.!?]\s*').split(sent)
tokens

['Where do you think I should go', 'I have 3 day holiday', '']

### 3. NLTK

In [104]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# punctuation not part of the word 
word_tokenize( "I am going to Paris!")

['I', 'am', 'going', 'to', 'Paris', '!']

In [None]:
# punctuation ends a sentence
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

### 4. Spacy

- pip install -U spacy

Download statistical models :
- English
    - python -m spacy download en_core_web_sm
    - python -m spacy download en_core_web_md
- French
    - python -m spacy download fr_core_news_sm
    - python -m spacy download fr_core_news_md

In [112]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [115]:
# word token
text = "I am going to Paris. It's a beautiful city!"
doc = nlp(text)
word_token = [token.text for token in doc]
print(word_token)

['I', 'am', 'going', 'to', 'Paris', '.', 'It', "'s", 'a', 'beautiful', 'city', '!']


In [116]:
# sentence token 
sentence_token = [token.text for token in doc.sents]
print(sentence_token)

['I am going to Paris.', "It's a beautiful city!"]


## Stemmer

In [119]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [None]:
# get word to root form (not optimal, may invent a word)
def stem_word(text: str):
    return " ".join([ps.stem(word) for word in text.split()])

In [121]:
sample = "walk walks walking walked"
stem_word(sample)

'walk walk walk walk'

In [132]:
text = """probably my alltime favorite movie a story of selflessness sacrifice and dedication
to a noble cause but its not preachy or boring it just never gets old despite my having seen it 
some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and 
bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma 
says more like dressedup midgets than children but that only makes them more fun to watch and 
the mothers slow awakening to whats happening in the world and under her own roof is believable 
and startling if i had a dozen thumbs theyd all be up for this movie"""

print(f"len text : {len(text)}")
print()
print_wrapped(stem_word(text))
print()
print(f"len stem text : {len(stem_word(text))}")

len text : 633

probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl
caus but it not preachi or bore it just never get old despit my have seen it
some 15 or more time in the last 25 year paul luka perform bring tear to my eye
and bett davi in one of her veri few truli sympathet role is a delight the kid
are as grandma say more like dressedup midget than children but that onli make
them more fun to watch and the mother slow awaken to what happen in the world
and under her own roof is believ and startl if i had a dozen thumb theyd all be
up for thi movi

len stem text : 568


## Lemmatization
- better than stemming, root word always exist

In [133]:
from nltk.stem.wordnet import WordNetLemmatizer
# nltk.download("wordnet")
# nltk.download("omw-1.4")

In [140]:
lemmatizer = WordNetLemmatizer()

In [142]:
text = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."

In [145]:
words = word_tokenize(text)
words = [word for word in words if word not in eng_stopwords]
words = [word for word in words if word not in exclude]

In [146]:
print("{0:20}{1:20}".format("Word :", "Lemma :"))
print()
for word in words:
    print("{0:20}{1:20}".format(word, lemmatizer.lemmatize(word, pos='v')))

Word :              Lemma :             

He                  He                  
running             run                 
eating              eat                 
time                time                
He                  He                  
bad                 bad                 
habit               habit               
swimming            swim                
playing             play                
long                long                
hours               hours               
Sun                 Sun                 
