<a href="https://colab.research.google.com/github/mansigambhir-13/100-Days-of-deep-Learning/blob/main/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download the NLTK resources used below: 'punkt' (tokenizer data used by
# word_tokenize) and the 'stopwords' corpus (read by stopwords.words).
nltk.download('punkt')
nltk.download('stopwords')

# Load the IMDB dataset.
# The CSV is read from '/content/IMDB Dataset.csv' (an absolute, Colab-style path);
# its 'review' column is the text processed by the steps below.
# NOTE(review): a 'sentiment' column is presumably present as well — confirm against the file.
df = pd.read_csv('/content/IMDB Dataset.csv')

# Function to clean the text
def clean_text(text):
    """Normalize a raw review to lowercase ASCII letters and whitespace.

    Steps, in order:
      1. lowercase the text;
      2. replace every HTML tag with a single space;
      3. delete every character that is not an ASCII letter or whitespace.

    Tags are replaced with a space (not deleted) so that words separated only
    by markup, e.g. ``"me.<br /><br />The"``, are not fused into a single
    token (``"methe"``) once the punctuation is stripped in step 3.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string. Runs of whitespace are left as they are; the
        tokenizers applied afterwards split on any amount of whitespace.
    """
    # Convert to lowercase
    text = text.lower()
    # Replace HTML tags with a space so neighbouring words stay separated
    text = re.sub(r'<.*?>', ' ', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Function to remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` dropped.

    The text is split with ``word_tokenize``; the tokens that survive the
    filter are joined back together with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)

# Initialize Porter Stemmer
stemmer = PorterStemmer()

# Function to perform stemming
def stem_words(text):
    """Return ``text`` with each token replaced by its stem.

    Tokens come from ``word_tokenize``; each one is passed through the
    module-level ``stemmer`` and the stems are joined with single spaces.
    """
    tokens = word_tokenize(text)
    return ' '.join(map(stemmer.stem, tokens))

# Run the three preprocessing stages in order; each stage reads the column
# written by the previous one and stores its result in a new column.
pipeline = (
    ('review', 'cleaned_review', clean_text),
    ('cleaned_review', 'no_stopwords', remove_stopwords),
    ('no_stopwords', 'stemmed', stem_words),
)
for source_col, target_col, transform in pipeline:
    df[target_col] = df[source_col].apply(transform)

# Preview the raw text next to every intermediate stage
preview_columns = ['review', 'cleaned_review', 'no_stopwords', 'stemmed']
print(df[preview_columns].head())

# Write the full frame (original and derived columns, no index) to disk
output_path = 'preprocessed_imdb_reviews.csv'
df.to_csv(output_path, index=False)

print("Preprocessing complete. Preprocessed data saved to 'preprocessed_imdb_reviews.csv'")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production the filming tech...   
2  i thought this was a wonderful way to spend ti...   
3  basically theres a family where a little boy j...   
4  petter matteis love in the time of money is a ...   

                                        no_stopwords  \
0  one reviewers mentioned watching oz episode yo...   
1  wonderful little production filming technique ...   
2  thought wonderful way spend time hot summer we...   
3  basically theres family little boy jake thinks...   
4  petter matteis love time money visually stu

# Replacing chat words in the dataset

In [18]:
# Lookup table for chat acronyms: each key is an upper-case acronym and each
# value is the plain-English phrase it stands for.
chat_words = dict(
    AFAIK="as far as I know",
    AMA="ask me anything",
    ASAP="as soon as possible",
    BRB="be right back",
    BTW="by the way",
    FYI="for your information",
    IIRC="if I remember correctly",
    IMO="in my opinion",
    IMHO="in my humble opinion",
    LOL="laugh out loud",
    ROFL="rolling on the floor laughing",
    TBH="to be honest",
    TLDR="too long didn't read",
    TTYL="talk to you later",
    WTF="what the fuck",
    YOLO="you only live once",
)

In [19]:
def remove_chat_words(text, mapping=None):
    """Expand chat acronyms in ``text`` into their full phrases.

    Despite the name, nothing is removed: every whitespace-separated token
    whose upper-cased form is a key of the lookup table is replaced by the
    corresponding phrase. Punctuation attached to a token (``"lol!"``,
    ``"(btw)"``) is set aside for the lookup and re-attached afterwards, so
    such tokens are expanded too instead of being silently skipped.

    Args:
        text: Input string.
        mapping: Optional acronym -> phrase dict with upper-case keys.
            Defaults to the module-level ``chat_words`` table.

    Returns:
        The text with acronyms expanded and tokens joined by single spaces
        (runs of whitespace in the input are therefore collapsed).
    """
    import string  # local import: only the punctuation set is needed here

    lookup = chat_words if mapping is None else mapping
    punctuation = string.punctuation
    expanded = []
    for token in text.split():
        # Exact match first, so keys that themselves contain punctuation still work.
        exact = lookup.get(token.upper())
        if exact is not None:
            expanded.append(exact)
            continue

        core = token.strip(punctuation)
        phrase = lookup.get(core.upper()) if core else None
        if phrase is None:
            expanded.append(token)
            continue

        # Re-attach whatever punctuation surrounded the acronym.
        lead = len(token) - len(token.lstrip(punctuation))
        expanded.append(token[:lead] + phrase + token[lead + len(core):])
    return ' '.join(expanded)

This approach will replace common chat acronyms with their full-text equivalents, making the text more standardized and easier to process. You can expand the chat_words dictionary with more acronyms as needed for your specific dataset.

# Spell checking and correction of the dataset

In [20]:
pip install pyspellchecker



In [21]:
from spellchecker import SpellChecker

In [22]:
# Initialize SpellChecker
# Module-level instance, constructed with the library defaults, that
# correct_spelling looks up by name each time it is called.
spell = SpellChecker()

In [23]:
def correct_spelling(text):
    """Return ``text`` with each whitespace-separated word spell-corrected.

    Every word is passed to ``spell.correction``; when that yields a falsy
    result (for example ``None``), the original word is kept unchanged.
    The resulting words are joined with single spaces.
    """
    fixed = []
    for token in text.split():
        suggestion = spell.correction(token)
        fixed.append(suggestion if suggestion else token)
    return ' '.join(fixed)

# Tokenization of the text

In [24]:
def word_tokenize_text(self, text):
    """Perform word-level tokenization and drop stop words.

    ``self.word_tokenizer`` may be either a plain callable (``IMDBTokenizer``
    stores NLTK's ``word_tokenize`` function there) or an object exposing a
    ``tokenize`` method. Both forms are accepted; calling ``.tokenize`` on a
    plain function would otherwise raise ``AttributeError``.

    Args:
        self: Object providing ``word_tokenizer`` and ``stop_words``.
        text: String to tokenize.

    Returns:
        List of tokens that are not in ``self.stop_words``.
    """
    tokenizer = self.word_tokenizer
    # Use the .tokenize method when there is one, else call the tokenizer itself.
    tokenize = getattr(tokenizer, 'tokenize', tokenizer)
    return [token for token in tokenize(text) if token not in self.stop_words]

In [25]:
 def subword_tokenize_text(self, text):
     """Tokenize ``text`` into subwords and drop stop words.

     The pieces come from ``self.bert_tokenizer.tokenize``; any piece that
     appears in ``self.stop_words`` is filtered out of the returned list.
     """
     pieces = self.bert_tokenizer.tokenize(text)
     kept = []
     for piece in pieces:
         if piece not in self.stop_words:
             kept.append(piece)
     return kept

In [26]:
def sentence_tokenize_text(self, text):
    """Split ``text`` into sentences by delegating to ``sent_tokenize``.

    NOTE(review): ``sent_tokenize`` is not imported in the cells visible
    here — confirm it is imported before this function is called.
    """
    sentences = sent_tokenize(text)
    return sentences
def process_chunk(self, texts):
    """Preprocess and tokenize every text in ``texts``.

    Each text goes through ``self.preprocess_text`` and is then tokenized
    by the method selected in ``self.tokenization_method`` ('word',
    'subword' or 'sentence').

    Returns:
        A list with one tokenization result per input text, in input order.

    Raises:
        ValueError: when a text is processed and ``self.tokenization_method``
            is not one of the three supported names.
    """
    def tokenize_one(cleaned):
        # Pick the tokenizer matching the configured method.
        method = self.tokenization_method
        if method == 'word':
            return self.word_tokenize_text(cleaned)
        if method == 'subword':
            return self.subword_tokenize_text(cleaned)
        if method == 'sentence':
            return self.sentence_tokenize_text(cleaned)
        raise ValueError(f"Unknown tokenization method: {self.tokenization_method}")

    return [tokenize_one(self.preprocess_text(raw)) for raw in texts]

In [27]:
def tokenize_dataset(self, texts):
    """Tokenize the entire dataset using multiprocessing.

    ``texts`` is cut into contiguous chunks, each chunk is handed to
    ``self.process_chunk`` in a pool of ``self.num_processes`` workers, and
    the per-chunk results are concatenated in their original order.

    Args:
        self: Object providing ``num_processes`` and ``process_chunk``.
        texts: Sliceable, sized collection of strings.

    Returns:
        A list with one tokenization result per input text; ``[]`` when
        ``texts`` is empty.
    """
    total = len(texts)
    if total == 0:
        # Nothing to tokenize; skip creating a worker pool.
        return []

    # Floor division gives 0 when there are fewer texts than processes, and
    # range() rejects a zero step, so clamp the chunk size to at least 1.
    chunk_size = max(1, total // self.num_processes)
    chunks = [texts[i:i + chunk_size] for i in range(0, total, chunk_size)]

    # Process chunks in parallel; imap yields the results in chunk order
    with mp.Pool(self.num_processes) as pool:
        results = list(tqdm(pool.imap(self.process_chunk, chunks),
                            total=len(chunks),
                            desc="Tokenizing texts"))

    # Merge the per-chunk lists into a single list of per-text results
    return [text_tokens for chunk_result in results for text_tokens in chunk_result]


In [32]:


class IMDBTokenizer:
    """Tokenize review texts at word, subword or sentence level, in parallel.

    The tokenization level is chosen once, at construction time, through
    ``tokenization_method``; ``tokenize_dataset`` then spreads the work over
    a multiprocessing pool.

    NOTE(review): ``mp``, ``tqdm`` and ``sent_tokenize`` are used below but
    are not imported in the cells visible here — confirm they are imported
    before this class is used.
    """

    def __init__(self, tokenization_method='word', num_processes=None):
        """Store the configuration and set up the NLTK-based helpers.

        Args:
            tokenization_method: 'word', 'subword' or 'sentence'.
            num_processes: Number of worker processes; a falsy value
                (``None`` or 0) falls back to ``mp.cpu_count()``.
        """
        self.tokenization_method = tokenization_method
        self.num_processes = num_processes or mp.cpu_count()

        # Download NLTK resources if not already downloaded
        import nltk
        nltk.download('punkt')
        nltk.download('stopwords')

        # Initialize tokenizers and stop words
        self.word_tokenizer = word_tokenize
        self.stop_words = set(stopwords.words('english'))
        # self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Initialize if using subword tokenization

    def preprocess_text(self, text):
        """Perform text preprocessing (currently only lowercasing)."""
        text = text.lower()
        # Add more preprocessing steps as needed
        return text

    def word_tokenize_text(self, text):
        """Perform word-level tokenization and drop stop words."""
        tokens = self.word_tokenizer(text)
        return [token for token in tokens if token not in self.stop_words]

    def subword_tokenize_text(self, text):
        """Perform subword tokenization using ``self.bert_tokenizer``.

        Raises:
            AttributeError: when no ``bert_tokenizer`` has been assigned
                (``__init__`` leaves its creation commented out).
        """
        bert_tokenizer = getattr(self, 'bert_tokenizer', None)
        if bert_tokenizer is None:
            # Same exception type as the bare attribute lookup would raise,
            # but with a message that says what to do about it.
            raise AttributeError(
                "bert_tokenizer is not initialized; assign a tokenizer to "
                "self.bert_tokenizer before using tokenization_method='subword'"
            )
        tokens = bert_tokenizer.tokenize(text)
        return [token for token in tokens if token not in self.stop_words]

    def sentence_tokenize_text(self, text):
        """Perform sentence-level tokenization"""
        return sent_tokenize(text)

    def process_chunk(self, texts):
        """Preprocess and tokenize every text in ``texts``.

        Returns:
            A list with one tokenization result per input text.

        Raises:
            ValueError: when a text is processed and
                ``self.tokenization_method`` is not a supported name.
        """
        results = []
        for text in texts:
            preprocessed_text = self.preprocess_text(text)

            if self.tokenization_method == 'word':
                tokens = self.word_tokenize_text(preprocessed_text)
            elif self.tokenization_method == 'subword':
                tokens = self.subword_tokenize_text(preprocessed_text)
            elif self.tokenization_method == 'sentence':
                tokens = self.sentence_tokenize_text(preprocessed_text)
            else:
                raise ValueError(f"Unknown tokenization method: {self.tokenization_method}")

            results.append(tokens)
        return results

    def _split_into_chunks(self, texts):
        """Cut ``texts`` into contiguous slices for the worker pool."""
        # Floor division gives 0 when there are fewer texts than processes,
        # and range() rejects a zero step, so clamp the size to at least 1.
        chunk_size = max(1, len(texts) // self.num_processes)
        return [texts[i:i + chunk_size] for i in range(0, len(texts), chunk_size)]

    def tokenize_dataset(self, texts):
        """Tokenize the entire dataset using multiprocessing.

        Args:
            texts: Sliceable, sized collection of strings.

        Returns:
            A list with one tokenization result per input text, in input
            order; ``[]`` when ``texts`` is empty.
        """
        if len(texts) == 0:
            # Nothing to tokenize; skip creating a worker pool.
            return []

        # Split texts into chunks for parallel processing
        chunks = self._split_into_chunks(texts)

        # Process chunks in parallel; imap yields the results in chunk order
        with mp.Pool(self.num_processes) as pool:
            results = list(tqdm(pool.imap(self.process_chunk, chunks),
                                total=len(chunks),
                                desc="Tokenizing texts"))

        # Merge the per-chunk lists into a single list of per-text results
        return [text_tokens for chunk_result in results for text_tokens in chunk_result]


def main():
    """Load the IMDB reviews CSV and construct a word-level ``IMDBTokenizer``.

    NOTE(review): the body stops right after the tokenizer is built (the
    final comment is cut off mid-sentence), so nothing is tokenized or
    returned here — confirm whether the rest of this cell was lost.
    """
    # Load the IMDB dataset
    #df = pd.read_csv('/content/IMDB Dataset.csv.zip')
    # The line above was causing an error, so the file is read from a path
    # relative to the working directory instead:
    df = pd.read_csv('IMDB Dataset.csv') # Changed the file path

    # Initialize tokenizer: word-level tokenization with 4 worker processes
    tokenizer = IMDBTokenizer(tokenization_method='word', num_processes=4)

    # Tokenize the dataset — TODO: this step is missing from the cell

#stemming to the dataset

In [35]:

#APPLICATION OF SNOWBALL STEMMER
def preprocess_reviews(df, text_column):
    """Set up the Snowball stemmer and stop-word set for review preprocessing.

    NOTE(review): as written, the function only downloads NLTK data and
    builds ``stemmer`` and ``stop_words``; ``df`` and ``text_column`` are
    never used and nothing is returned. The body looks truncated — confirm
    against the original notebook.
    NOTE(review): ``SnowballStemmer`` is not imported in the cells visible
    here — confirm it is imported before this function is called.
    """

    # Download required NLTK data
    # NOTE(review): the bare ``except`` catches every exception, and the
    # message it prints claims the data is already present whatever the
    # actual cause was.
    try:
        nltk.download('punkt')
        nltk.download('stopwords')
    except:
        print("NLTK data already downloaded")

    # Initialize the English Snowball stemmer and the English stop-word set
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))



Applying the Porter stemmer

In [36]:
from nltk.stem.porter import PorterStemmer

In [38]:
ps=PorterStemmer()
def stem_words(text):
    """Return ``text`` with each whitespace-separated word replaced by its stem.

    Words are obtained with ``str.split`` and stemmed with the module-level
    ``ps`` stemmer; the stems are joined with single spaces.

    The ``print(text)`` that used to follow the ``return`` statement could
    never execute and has been removed.
    """
    return " ".join(ps.stem(word) for word in text.split())

# Lemmatization

In [44]:
#without pos tagging
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def simple_lemmatization(df, text_column):
    """Return a copy of ``df`` with a ``lemmatized_text`` column added.

    Each value of ``df[text_column]`` is split on whitespace and every word
    is lemmatized with ``WordNetLemmatizer`` without POS tagging, i.e. with
    the lemmatizer's default part of speech. The input frame is not
    modified.

    Args:
        df: DataFrame holding the text to lemmatize.
        text_column: Name of the column containing the text strings.

    Returns:
        A copy of ``df`` with the extra ``lemmatized_text`` column.
    """
    # Best-effort download: report a failure accurately instead of claiming
    # the data is already present, and let KeyboardInterrupt/SystemExit
    # propagate rather than swallowing them with a bare ``except``.
    try:
        nltk.download('wordnet')
    except Exception as exc:
        print(f"Could not download NLTK 'wordnet' data: {exc}")

    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()

    def lemmatize_text(text):
        """Lemmatize text without POS tagging."""
        return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    # Work on a copy so the caller's frame is left untouched
    lemmatized_df = df.copy()

    # Apply lemmatization to the text column
    lemmatized_df['lemmatized_text'] = lemmatized_df[text_column].apply(lemmatize_text)

    return lemmatized_df
