In [44]:
import pandas as pd

data = pd.read_csv("https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [45]:
data.sample(10)

Unnamed: 0,review,sentiment
33840,I thought this movie was pretty good. Some par...,positive
25384,There is no relation at all between Fortier an...,positive
31607,"A great production, that should be revived/reb...",positive
6078,I can sum this movie up using 20 words or less...,negative
7333,"A cheesy, compellingly awful (and NOT in a fun...",negative
2271,Busy Phillips put in one hell of a performance...,positive
33815,The Five Deadly Venoms is a great kung-fu acti...,positive
40604,Kurosawa really blew it on this one. Every gen...,negative
46608,I didn't know much about this movie before I w...,negative
8854,Becky Harris plays the female shopper whose mi...,negative


In [46]:
data["review"][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [47]:
# 1. Tokenization.
# 2. Lowercase. 
# 3. Uppercase. 
# 4. emojis. 
# 5. puntucations. 
# 6. html, url. 
# 7. stopwords. 
# 8. abbrevations or slang word. 
# 9. stemming and lemmetization.
# 10. spelling correction.
# 11. Whitespace. 

#### **1. Lowercase and Uppercase.**

In [48]:
data['review'].str.lower()

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

#### **2. Removings HTML tags and URL**

In [49]:
import re

def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub("", text)

data = data['review'].apply(remove_html_tags)
data.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [56]:
def remove_url(text):
    pattern=re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub("",text)

data["review"]=data["review"].apply(remove_punc)

KeyError: 'review'

#### **3. Punctuation**

In [55]:
import string

exclude = string.punctuation

def remove_punc(text):
    for char in exclude:
        text = text.replace(char, "")
    return text

In [59]:
text1="FYI this is not true"
text2="LAMO the class was so funny"
text3="I want it ASAP"

In [None]:
chat_words={
    "AFAIK":"As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP":"As Soon As Possible",
    "BTW":"By The Way",
    "B4":"Before",
    "LAMO":"Laugh My A.. Off",
    "FYI":"For your information"    
}

def chat_conversation(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

'I want it As Soon As Possible'

#### **Text Preprocessing Libraries**

1. NLTK. 
2. TextBlob.
3. Spacy.

In [64]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.3/624.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hUsing cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk, textblob
  Attempting uninstall: nltk
    Found existing installation: nltk 3.8.1
    Uninstalling nltk-3.8.1:
      Successfully uninstalled nltk-3.8.1
Successfully installed nltk-3.9.1 textblob-0.19.0


In [67]:
from textblob import TextBlob

text = "here is my nae that is my sa ad he is a good mentr"
textblob = TextBlob(text)

textblob

TextBlob("here is my nae that is my sa ad he is a good mentr")

In [68]:
textblob.correct().string

'here is my name that is my sa ad he is a good entr'

In [None]:
# Another Example.
text3="I'm brav ad stong prson"
textblob = TextBlob(text3)
textblob.correct().string

"I'm brave ad strong person"

In [71]:
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/netrakc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [77]:
def remove_stop_words(text):
    new_text = []
    for words in text.split():
        if words in stopwords.words("english"):
            new_text.append("")
        else:
            new_text.append(words.strip())
    return " ".join(new_text)

text = "Hey, I am a netra bahadur khatri and I am preparing for the machine learning engineer roles for google and data scientist and genai engineer.Now, tell me who is netra ?"

remove_stop_words(text).replace("  ", " ")

'Hey, I  netra bahadur khatri I preparing  machine learning engineer roles google data scientist genai engineer.Now, tell  netra ?'

#### **Removing Emojis**

In [78]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [80]:
original_text = "Hello,😊 how are you today? 🌟"

In [81]:
import emoji

emoji.demojize(original_text)

'Hello,:smiling_face_with_smiling_eyes: how are you today? :glowing_star:'

In [82]:
text="""Hello, 😃💁😃💁 People
•🐻🌻 Animals
•🍔🍹 Food
•🎷⚽ Activities
•🚘🌇 Travel
•💡🎉 Objects
•💖🔣 Symbols
•🎌🏳️‍🌈 Flags"""

def remove_emoji(text):
    clean_text=emoji.demojize(text)
    return clean_text

print(remove_emoji(text))

Hello, :grinning_face_with_big_eyes::person_tipping_hand::grinning_face_with_big_eyes::person_tipping_hand: People
•:bear::sunflower: Animals
•:hamburger::tropical_drink: Food
•:saxophone::soccer_ball: Activities
•:oncoming_automobile::sunset: Travel
•:light_bulb::party_popper: Objects
•:sparkling_heart::input_symbols: Symbols
•:crossed_flags::rainbow_flag: Flags


In [None]:
emoji.is_emoji("thumbs up")

True

In [85]:
emoji.is_emoji("😃")

True

#### **Tokenization**

In [86]:
text = "I am Netra kc and working as a data scientist."

text.split()

['I', 'am', 'Netra', 'kc', 'and', 'working', 'as', 'a', 'data', 'scientist.']

In [89]:
text.split(".") # by default split takes an whitespace but now taking an dot. 

['I am Netra kc and working as a data scientist', '']

In [None]:
text = "I am netra and working as a data scientist. I live in London and working into multiple domains."
text.split(".") # This is the sentence tokenization.

['I am netra and working as a data scientist',
 ' I live in London and working into multiple domains',
 '']

In [95]:
# Using the regex, can also performs the word tokenization. 
text = "hi i am netra and living in United Kingdom"

import re

re.findall("[\w]+", text)

  re.findall("[\w]+", text)


['hi', 'i', 'am', 'netra', 'and', 'living', 'in', 'United', 'Kingdom']

In [100]:
text

'hi i am netra and living in United Kingdom'

In [98]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [None]:
word_tokenize

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/netrakc/nltk_data'
    - '/opt/anaconda3/nltk_data'
    - '/opt/anaconda3/share/nltk_data'
    - '/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [103]:
my_corpus="""Generative artificial intelligence (generative AI, genAI, GenAI, GAI or GenAI[1]) is artificial intelligence capable of generating text, images or other data using generative models,[2] often in response to prompts.[3][4] Generative AI models learn the patterns and structure of their input training data and then generate new data that has similar characteristics.[5][6]

Improvements in transformer-based deep neural networks enabled an AI boom of generative AI systems in the early 2020s. These include large language model (LLM) chatbots such as ChatGPT, Copilot, Bard, and LLaMA, and text-to-image artificial intelligence art systems such as Stable Diffusion, Midjourney, and DALL-E.[7][8][9] Companies such as OpenAI, Anthropic, Microsoft, Google, and Baidu as well as numerous smaller firms have developed generative AI models.[3][10][11]"""

In [104]:
sent_tokenize(my_corpus)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/netrakc/nltk_data'
    - '/opt/anaconda3/nltk_data'
    - '/opt/anaconda3/share/nltk_data'
    - '/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# stemming: root form. 
# Lemmetization: original word or base form.

#### **stemming & Lemmetization**

In [108]:
from nltk.stem import PorterStemmer

input_sentence = "The quick brown foxes are jumping over the lazi dogs"

def stemming(text):
    obj = PorterStemmer()
    stem_word=[obj.stem(word) for word in text.split()]
    return stem_word

stemming(input_sentence)

['the', 'quick', 'brown', 'fox', 'are', 'jump', 'over', 'the', 'lazi', 'dog']

In [None]:
from nltk.stem import WordNetLemmatizer

text = "'Ikigai' by Hector Garcia and Francesc Miralles explores the Japanese concept of finding one's purpose in life by analyzing the habits and beliefs of the world's longest-living people. Through case studies, the book offers practical insights on how to live a more fulfilling life."

def lammatization(text):
    words=text.split()

    lemmetizer=WordNetLemmatizer()
    lemetized_word=[lemmetizer.lemmatize(word) for word in words]
    return lemetized_word

lammatization(text)

["'Ikigai'",
 'by',
 'Hector',
 'Garcia',
 'and',
 'Francesc',
 'Miralles',
 'explores',
 'the',
 'Japanese',
 'concept',
 'of',
 'finding',
 "one's",
 'purpose',
 'in',
 'life',
 'by',
 'analyzing',
 'the',
 'habit',
 'and',
 'belief',
 'of',
 'the',
 "world's",
 'longest-living',
 'people.',
 'Through',
 'case',
 'studies,',
 'the',
 'book',
 'offer',
 'practical',
 'insight',
 'on',
 'how',
 'to',
 'live',
 'a',
 'more',
 'fulfilling',
 'life.']