# Text Acquisition

In [1]:
!pip install kaggle --quiet

In [3]:
# from google.colab import files
# files.upload()  # This will prompt you to upload the kaggle.json file

In [4]:
#!/bin/bash
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!mkdir imdb-dataset
!unzip imdb-dataset-of-50k-movie-reviews.zip -d imdb-dataset

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 86% 22.0M/25.7M [00:00<00:00, 36.3MB/s]
100% 25.7M/25.7M [00:00<00:00, 34.4MB/s]
Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: imdb-dataset/IMDB Dataset.csv  


In [5]:
import re

import pandas as pd

import nltk

# Text Preprocessing

In [6]:
df = pd.read_csv('/content/imdb-dataset/IMDB Dataset.csv')

# | df.head(3)
#   review	                                          sentiment
# 0	One of the other reviewers has mentioned that ...	positive
# 1	A wonderful little production. <br /><br />The...	positive
# 2	I thought this was a wonderful way to spend ti...	positive

# df.shape  # (50000, 2)

In [8]:
df = df.head(100)  # DEVELOPMENT MODE

In [None]:
df["review"][5]
# 'Probably my all-time favorite movie, a story of selflessness, sacrifice and
# dedication to a noble cause, but it\'s not preachy or boring...'

### Text Noises Removal

##### Lowercasing
This standardization helps in reducing the complexity of text data by treating words with different cases (e.g., "Text" and "text")

In [10]:
df["review"] = df["review"].str.lower()

# df["review"][5]
# 'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication
# to a noble cause, but it\'s not preachy or boring...'

##### Removing punctuations

Special characters like !"#$%&'()*+,-./:;<=>?@[]^_`{|}~ that don't provide value for text analysis in many tasks (like classification).

In [13]:
import string, time
puncs = string.punctuation
puncs

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [28]:
def remove_punctuation(text):
    for char in puncs:
        text = text.replace(char, "")
    return text

In [15]:
start = time.time()

df['review'].apply(remove_punctuation)

time1 = time.time() - start
print(f'Time to remove all punctuations froms 100 texts: {time1}')

Time for remove punctuations for 100 text: 0.020401716232299805


In [29]:
text = "Hello, world! This is a test: do you like it? Yes, I do... A lot; really! How about you? @username #hashtag $dollar %percent ^caret &amp *star (parentheses) -dash_underscore+plus=equals{curly}brackets[brackets]|\backslash~tilde`backtick"
remove_punctuation(text)

'Hello world This is a test do you like it Yes I do A lot really How about you username hashtag dollar percent caret amp star parentheses dashunderscoreplusequalscurlybracketsbrackets\x08ackslashtildebacktick'

##### Removing Stop Words

* Removing stop words in NLP text processing is like cleaning up unnecessary words like "the", "is", and "and" from sentences.
* These words appear frequently in language but don't add much meaning.
* By getting rid of them, we focus more on the important words that carry the actual message, making our analysis faster and more accurate.

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
from nltk.corpus import stopwords

engSW = stopwords.words("english")

# | engSW[:5]
# ['i', 'me', 'my', 'myself', 'we']

In [21]:
def remove_stopwords(text):
    new_text=[]
    for word in text.split():
        if word in engSW:
            new_text.append('')
        else:
            new_text.append(word)

    x = new_text[:]  # Create a copy of new_text
    new_text.clear()  # Clear the original new_text list
    return " ".join(x)  # Join the copied list x into a single string separated by spaces and return it

In [22]:
start = time.time()

df['review'].apply(remove_stopwords)

time1 = time.time() - start
print(f'Time to remove all English stopwords from 100 texts.: {time1}')

Time to remove all English stopwords from 100 texts.: 0.09138917922973633


In [30]:
text = "The quick brown fox jumps over the lazy dog. In a nutshell, it's all about how you can improve your writing skills by using the right words in the right context."
remove_stopwords(text)

'The quick brown fox jumps   lazy dog. In  nutshell,       improve  writing skills  using  right words   right context.'

##### Removing URLs

* URLs often don't add meaningful information for tasks like sentiment analysis, text classification, or topic modeling.
* URLs can make the model focus on irrelevant data rather than the actual content.
* By removing them, you help the model concentrate on important words and improve the overall performance and accuracy.

In [23]:
# Function to remove URLs
def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)

In [25]:
start = time.time()

df['review'] = df['review'].apply(remove_urls)

time1 = time.time() - start
print(f'Time to remove all URLs from 100 texts.: {time1}')

Time to remove all URLs from 100 texts.: 0.004606485366821289


In [31]:
text = 'Google search here www.google.com'
remove_urls(text)

'Google search here '

##### Removing HTML Tags

* HTML tags (like `<div>`, `<p>`, `<a>`, etc.) are used for formatting and structuring web content, not for conveying meaningful information.
* HTML tags don't contribute to understanding the actual content..

In [26]:
def remove_html_tags(text):
    pattern = re.compile("<.*?>")
    return pattern.sub(r"", text)

In [27]:
start = time.time()

df['review'] = df['review'].apply(remove_html_tags)

time1 = time.time() - start
print(f'Time to remove HTML tags from 100 texts.: {time1}')

Time to remove HTML tags from 100 texts.: 0.0019202232360839844


In [33]:
text = "<html><body><p> File </p><p> Author - Bob fischer</p><p> Hello world</p></body></html>"
remove_html_tags(text)

' File  Author - Bob fischer Hello world'

##### Removing numbers

* Numbers can distract the model from important words. By removing them, the model focus on the relevant linguistic content.
* However, in some cases (like analyzing financial or scientific data), numbers may be important and shouldn't be removed.

In [34]:
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

In [35]:
start = time.time()

df['review'] = df['review'].apply(remove_numbers)

time1 = time.time() - start
print(f'Time to remove HTML tags from 100 texts.: {time1}')

Time to remove HTML tags from 100 texts.: 0.00767827033996582


In [36]:
text = "In 2023, the company's revenue grew by 15%, reaching $10 million, and they hired 120 new employees across 5 offices."
remove_numbers(text)

"In , the company's revenue grew by %, reaching $ million, and they hired  new employees across  offices."

##### Abbreviations

* Abbreviations like "btw" (by the way), "idk" (I don't know), "lol" (laugh out loud) are shorthand for longer phrases.
* In NLP, these can be problematic because they may not be understood by models or could lead to confusion.

In [43]:
abbr_dictionary = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': "For What It's Worth",
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you (also a chat program)',
    'IDK': 'I Dont Know',
    'ILU': 'ILU: I Love You',
    'IMHO': 'In My Honest/Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My A.. Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'PITA': 'Pain In The A..',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA?': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My A.. Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The F...',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait...',
    '7K': 'Sick:-D Laugher',
    'TFW': 'That feeling when',
    'MFW': 'My face when',
    'MRW': 'My reaction when',
    'IFYP': 'I feel your pain',
    'TNTL': 'Trying not to laugh',
    'JK': 'Just kidding',
    'IDC': "I don't care",
    'ILY': 'I love you',
    'IMU': 'I miss you',
    'ADIH': 'Another day in hell',
    'ZZZ': 'Sleeping, bored, tired',
    'WYWH': 'Wish you were here',
    'TIME': 'Tears in my eyes',
    'BAE': 'Before anyone else',
    'FIMH': 'Forever in my heart',
    'BSAAW': 'Big smile and a wink',
    'BWL': 'Bursting with laughter',
    'BFF': 'Best friends forever',
    'CSL': "Can't stop laughing"
}

In [38]:
def short_conv(text):
    new_text = []
    for w in text.split():
        if w.upper() in abbr_dictionary:
            new_text.append(abbr_dictionary[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [39]:
start = time.time()

df['review'] = df['review'].apply(short_conv)

time1 = time.time() - start
print(f'Time to handle all abbreviations from 100 texts.: {time1}')

Time to handle all abbreviations from 100 texts.: 0.01204824447631836


In [46]:
text = "btw, I didn't get your message. lol, idk what happened!"
text = remove_punctuation(text)
short_conv(text)

'By The Way I didnt get your message Laughing Out Loud I Dont Know what happened'

##### Removing emojies

In [47]:
# Removing Emojis
def remove_emoji(text):
    emoji_pattern=re.compile("["
                             u"\U0001F600-\U0001F64F" #emoticons
                             u"\U0001F300-\U0001F5FF" #symbols, pictograph
                              u"\U0001F680-\U0001F6FF" #transport and map symbol
                              u"\U0001F1E0-\U0001F1FF" #flags(IOS)
                              u"\U00002702-\U000027B0"
                              u"\U00002FC2-\U0001F251"
                             "]+",flags=re.UNICODE)
    return emoji_pattern.sub(r'',text)

In [48]:
start = time.time()

df['review'] = df['review'].apply(remove_emoji)

time1 = time.time() - start
print(f'Time to remove all emojies from 100 texts.: {time1}')

Time to remove all emojies from 100 texts.: 0.01631331443786621


In [49]:
text = "I'm so excited for the party tonight! 🎉 Can't wait to see everyone there! 😄"
remove_emoji(text)

"I'm so excited for the party tonight!  Can't wait to see everyone there! "