In [3]:
import pandas as pd

df=pd.read_csv('../data/IMDB Dataset.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.shape

(50000, 2)

In [7]:
# firts processing by make all to become lowercase
df['review']=df['review'].str.lower()

In [8]:
# processing by remove html tag
import re   #regex
def remove_html_tag(text):
    pattern = re.compile('<.*?>')
    return pattern.sub('', text)

In [9]:
text_cases = [
    "<h1>Hello, world!</h1>",
    "<p>This is <b>bold</b> and <i>italic</i> text.</p>",
    '<a href="https://example.com">Visit</a> <img src="image.jpg" />',
    "<script>alert('XSS');</script>Welcome",
    "<div><p>Test <span>nested</span> tags</p></div>",
]

for i, text in enumerate(text_cases, 1):
    print(f"Text {i} : {remove_html_tag(text)}")

Text 1 : Hello, world!
Text 2 : This is bold and italic text.
Text 3 : Visit 
Text 4 : alert('XSS');Welcome
Text 5 : Test nested tags


In [10]:
df['review'][3]
df['review']=df['review'].apply(remove_html_tag)
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [11]:
# processing by clearing a url
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

In [12]:
test_cases = [
    {
        "input": "Visit our website at https://example.com for more info.",
        "expected": "Visit our website at for more info."
    },
    {
        "input": "Check out www.google.com and let us know.",
        "expected": "Check out and let us know."
    },
    {
        "input": "No URLs in this sentence.",
        "expected": "No URLs in this sentence."
    },
    {
        "input": "Multiple links: https://a.com https://b.org",
        "expected": "Multiple links:"
    },
    {
        "input": "Start with URL: http://start.com and end with www.end.com",
        "expected": "Start with URL: and end with"
    }
]

for i, case in enumerate(test_cases, 1):
    result = remove_urls(case["input"])
    status = "‚úÖ PASSED" if result == case["expected"] else f"‚ùå FAILED\nExpected: {case['expected']}\nGot: {result}"
    print(f"Test {i}: {status}")


Test 1: ‚ùå FAILED
Expected: Visit our website at for more info.
Got: Visit our website at  for more info.
Test 2: ‚ùå FAILED
Expected: Check out and let us know.
Got: Check out  and let us know.
Test 3: ‚úÖ PASSED
Test 4: ‚ùå FAILED
Expected: Multiple links:
Got: Multiple links:  
Test 5: ‚ùå FAILED
Expected: Start with URL: and end with
Got: Start with URL:  and end with 


In [13]:
# processing by deleting a spesial char like !"#$%&'()*+,-./:;<=>?@[\]^_{|}~
import string

def remove_punctuation(text):
    return re.sub(f"[{re.escape(string.punctuation)}]", "", text)


In [14]:
df['review'][2]

'i thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. the plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). while some may be disappointed when they realize this is not match point 2: risk addiction, i thought it was proof that woody allen is still fully in control of the style many of us have grown to love.this was the most i\'d laughed at one of woody\'s comedies in years (dare i say a decade?). while i\'ve never been impressed with scarlet johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.this may not be the crown jewel of his career, but it was wittier than "devil wears prada" and more interesting than "superman" a great comedy to go see with friends.'

In [15]:
clean_text=remove_punctuation(df['review'][2])
clean_text

'i thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air conditioned theater and watching a lighthearted comedy the plot is simplistic but the dialogue is witty and the characters are likable even the well bread suspected serial killer while some may be disappointed when they realize this is not match point 2 risk addiction i thought it was proof that woody allen is still fully in control of the style many of us have grown to lovethis was the most id laughed at one of woodys comedies in years dare i say a decade while ive never been impressed with scarlet johanson in this she managed to tone down her sexy image and jumped right into a average but spirited young womanthis may not be the crown jewel of his career but it was wittier than devil wears prada and more interesting than superman a great comedy to go see with friends'

In [16]:
df['review']=df['review'].apply(remove_punctuation)

In [17]:
# processing chat word
chat_words = {
    'AFAIK':'As Far As I Know',
    'AFK':'Away From Keyboard',
    'ASAP':'As Soon As Possible'
}


{
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It",
}

{'FYI': 'For Your Information',
 'ASAP': 'As Soon As Possible',
 'BRB': 'Be Right Back',
 'BTW': 'By The Way',
 'OMG': 'Oh My God',
 'IMO': 'In My Opinion',
 'LOL': 'Laugh Out Loud',
 'TTYL': 'Talk To You Later',
 'GTG': 'Got To Go',
 'TTYT': 'Talk To You Tomorrow',
 'IDK': "I Don't Know",
 'TMI': 'Too Much Information',
 'IMHO': 'In My Humble Opinion',
 'ICYMI': 'In Case You Missed It',
 'AFAIK': 'As Far As I Know',
 'FAQ': 'Frequently Asked Questions',
 'TGIF': "Thank God It's Friday",
 'FYA': 'For Your Action'}

In [18]:
# processing text to normalization text
def chat_confersion(text):
    new_text=[]

    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return ' '.join(new_text)

In [19]:
chat_confersion('Do This work ASAP')

'Do This work As Soon As Possible'

In [20]:
from textblob import TextBlob

def incorrect_text(text):
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)


In [21]:
print(incorrect_text("I havv goood speling"))
# Output: "I have good spelling"

I have good spelling


In [22]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

def remove_stopwords(text, lang='english'):
    stop_words = set(stopwords.words(lang))
    return ' '.join([word for word in text.split() if word not in stop_words])

[nltk_data] Downloading package stopwords to /home/taqin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# remove stop word 
df['review']=df['review'].apply(remove_stopwords)

In [24]:
# processing by removng emoji
import re

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emotikon wajah
        "\U0001F300-\U0001F5FF"  # simbol & ikon
        "\U0001F680-\U0001F6FF"  # transportasi & simbol lainnya
        "\U0001F1E0-\U0001F1FF"  # bendera negara
        "\U00002700-\U000027BF"  # simbol tambahan
        "\U0001F900-\U0001F9FF"  # emoji tambahan
        "\U00002600-\U000026FF"  # simbol cuaca dll
        "\U00002B50-\U00002B55"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)



In [None]:
text = "Halo üåü, ini adalah contoh teks üòä dengan emoji üöÄ!"
cleaned_text = remove_emoji(text)
print(cleaned_text)

Halo , ini adalah contoh teks  dengan emoji !


In [30]:
# processing by get emoji
import emoji


def get_emoji(text):
    arr_emoji=[]
    for char in text:
        if emoji.is_emoji(char):
            arr_emoji.append(char)
    return arr_emoji

In [31]:
text = "Halo üåü, ini adalah contoh teks üòä dengan emoji üöÄ!"
get_emoji(text)

['üåü', 'üòä', 'üöÄ']

In [None]:
# processing text by tokenization or splitting

import re
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Contoh teks
text = "Halo! Ini adalah contoh kalimat, dengan berbagai tanda baca. Apakah kamu siap? üí°üöÄ"

# 1. Tokenisasi dengan split()
tokens_split = text.split()

# 2. Tokenisasi dengan re (mengambil hanya kata tanpa tanda baca)
tokens_re = re.findall(r'\b\w+\b', text)

# 3. Tokenisasi dengan nltk
tokens_nltk = word_tokenize(text)

# Output hasil
print("=== Token dengan split() ===")
print(tokens_split)

print("\n=== Token dengan regex (re) ===")
print(tokens_re)

print("\n=== Token dengan NLTK ===")
print(tokens_nltk)


=== Token dengan split() ===
['Halo!', 'Ini', 'adalah', 'contoh', 'kalimat,', 'dengan', 'berbagai', 'tanda', 'baca.', 'Apakah', 'kamu', 'siap?', 'üí°üöÄ']

=== Token dengan regex (re) ===
['Halo', 'Ini', 'adalah', 'contoh', 'kalimat', 'dengan', 'berbagai', 'tanda', 'baca', 'Apakah', 'kamu', 'siap']

=== Token dengan NLTK ===
['Halo', '!', 'Ini', 'adalah', 'contoh', 'kalimat', ',', 'dengan', 'berbagai', 'tanda', 'baca', '.', 'Apakah', 'kamu', 'siap', '?', 'üí°üöÄ']


[nltk_data] Downloading package punkt to /home/taqin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# processing by change text to become baseline text
from nltk.stem import PorterStemmer

# Inisialisasi stemmer
stemmer = PorterStemmer()

# Contoh penggunaan
words = ["running", "flies", "easily", "fairly"]
stemmed_words = [stemmer.stem(word) for word in words]

print(stemmed_words)


['run', 'fli', 'easili', 'fairli']


In [39]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# inisiasi
word_lemmatizer=WordNetLemmatizer()


def clean_and_lemmatize(text):
    # tokenisasi
    tokens = word_tokenize(text)

    # remove spesial char and change it to lowercase
    tokens = [word.lower() for word in tokens if word not in string.punctuation]

    # lemmatize token
    lemmatize_tokens = [word_lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(lemmatize_tokens)


[nltk_data] Downloading package punkt to /home/taqin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/taqin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/taqin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [40]:
text = "The cats were running quickly, but they weren't faster than the dog."

cleaned = clean_and_lemmatize(text)

print(cleaned)


the cat were running quickly but they were n't faster than the dog


In [None]:
# lemmatazion with improvemnet using post tagging for detect adjective word

import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer=WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def clean_and_lemmatize_v2(text):
    tokens=word_tokenize(text)

    # remove punctuation
    tokens=[word.lower() for word in tokens if word not in string.punctuation]

    # pos tagging
    post_tags=nltk.pos_tag(tokens)

    # lemmatize with post tagging

    lemma_text=[
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in post_tags
    ]

    return " ".join(lemma_text)

[nltk_data] Downloading package punkt to /home/taqin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/taqin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/taqin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/taqin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
text = "The cats were running quickly, but they weren't faster than the dog that ate the food."

print(clean_and_lemmatize_v2(text))
