# Cleaning Text

An important task in Natural Language Processing (NLP) involves text data cleaning. To optimize results, it's important to convert your text to its essential root words within the corpus while removing irrelevant symbols. 

- Converting words into lowercase
- Removing leading and trailing whitespace
- Removing punctuation
- Removing stopwords
- Removing special characters (numbers, emojis, etc.)

In [14]:
import time
import re
import numpy as np 
import pandas as pd

from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
import spacy 
nlp = spacy.load('en_core_web_sm')

## Load data

In [3]:
merge_df = pd.read_csv('../data/merge_df2.csv', low_memory = True, index_col = False)
# Remove null values in 'text' column 
merge_df = merge_df.dropna(subset=['text'])
# Remove rows where 'text' column contains empty strings
merge_df = merge_df[merge_df['text'].str.strip() != '']
# Reset the index of the DataFrame
merge_df.reset_index(drop=True, inplace=True)

print(f'Number of records: {len(merge_df)}')

  merge_df = pd.read_csv('../data/merge_df2.csv', low_memory = True, index_col = False)


Number of records: 2556296


In [5]:
random_row = merge_df.sample(2)
print(f'Rating: {random_row["rating"].iloc[0]}\n')
print(f'Text: {random_row["text"].iloc[0]}\n')
# print(f'Tokenized Sentences: {random_row["tokenized_sentences"].iloc[0]}\n')
print('----------')
print(f'Rating: {random_row["rating"].iloc[1]}\n')
print(f'Text: {random_row["text"].iloc[1]}\n')
# print(f'Tokenized Sentences: {random_row["tokenized_sentences"].iloc[1]}\n')

Rating: 5

Text: Just follow directions. Easy to get onto bike chain without actual tools. Nice.

----------
Rating: 4

Text: I bought this shower tent to inspire my 13-year old to read in a private space.  The door and “window” zipper really work well and add a lot of privacy.  I am really surprised by how much my kid loves it.  The tent is pretty spacious and can fit a beanbag chair along with some books.  I recommend it just for the novelty of it.  ;)



## Text Cleaning 1: Remove noise

In [7]:
def regex_clean(text):
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # Remove all punctuation except periods and apostrophes
    text = re.sub(r"[^\w\s'.!]", '', text)
    # Replace any sequence of periods longer than one with a single period
    text = re.sub(r'\.{2,}', '. ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean the text
merge_df['text'] = merge_df['text'].apply(regex_clean)
print(f'Record count: {len(merge_df)}')

Record count: 2556296


## Tokenize sentences

In [15]:
# Tokenize sentences and create a new column
merge_df['tokenized_sentences'] = merge_df['text'].apply(sent_tokenize)

## Text Cleaning 2: Remove stopwords and lemmatize words

In [None]:
# Define stopwords
stopwords = spacy.lang.en.stop_words.STOP_WORDS
include_stopwords = {'would', 'I'}
stopwords |= include_stopwords
print('Original stopwords count:', len(stopwords))

def clean_data(tokenized_sentences):
    """
    Cleans and preprocesses tokenized sentences using spaCy.
    
    This function takes a list of tokenized sentences as input, converts each sentence to lowercase, 
    lemmatizes the words, and filters out stopwords. 
    The resulting cleaned tokenized sentences are returned as a list of lists. 
    """
    cleaned_sentences = []
    
    for sentence in tokenized_sentences:
        # Convert each tokenized sentence to lowercase
        sentence = sentence.lower()
        # Process the tokenized sentence with spaCy
        doc = nlp(sentence)
        # Lemmatize words and filter out stopwords
        tokens = [token.lemma_ for token in doc if not token.is_stop]
        cleaned_text = " ".join(tokens)        
        cleaned_sentences.append(cleaned_text)
    
    return cleaned_sentences

# Apply the clean_data function to the tokenized_sentences column
merge_df['clean_tokenized_sentences'] = merge_df['tokenized_sentences'].apply(clean_data)
print('Done')

## Remove records with null values and empty strings 

In [19]:
# Remove null values in 'text' column 
merge_df = merge_df.dropna(subset=['text'])
# Remove rows where 'text' column contains empty strings
merge_df = merge_df[merge_df['text'].str.strip() != '']
# Reset the index of the DataFrame
merge_df.reset_index(drop=True, inplace=True)
print(f'Number of records: {len(merge_df)}')

Number of records: 2553768


## Feature Engingeering
1. Identify positive and negative reviews based on rating. 
    - Ratings 4 or greater are positive.
    - Ratings less than 4 are negative.
2. Count how many sentences a review has. 

In [20]:
merge_df['positive_rating'] = 0

# Classify records with rating higher than or equal to 4, positive (1)
merge_df.loc[merge_df['rating'] >= 4, 'positive_rating'] = 1

# Classify records with rating less than and equal to 3, negative (0)
merge_df.loc[merge_df['rating'] < 4 , 'positive_rating'] = 0

In [21]:
# Count the number of sentences in each tokenized sentence list
merge_df['sentence_count'] = merge_df['tokenized_sentences'].apply(len)

In [22]:
# Count the number of words in each sentence 
def count_words(sentence):
    return len(sentence.split())

# Apply the function to each tokenized sentence list
merge_df['word_count_per_sent'] = merge_df['tokenized_sentences'].apply(lambda x: [count_words(sentence) for sentence in x])

In [26]:
merge_df = merge_df.drop(columns = {'Unnamed: 0'})

In [None]:
# def word_count(text):
#     """
#     Counts the number of words in the text.
#     """
#     words = text.split()
#     return len(words)


# def avg_word_length(text):
#     """
#     Returns the average word length in the text. 
#     """
#     # Check for empty or white-space only string 
#     if not text.strip():
#         return 0
        
#     words = text.split()
#     if not words:  # Check if words list is empty
#         return 0
        
#     word_lengths = [len(word) for word in words]
#     avg_word_length = sum(word_lengths)/len(words)
    
#     return(avg_word_length) 


# def exclamation_count(text):
#     """
#     Returns the number of exclamations in the text.
#     """
#     doc = nlp(text)
#     exclamations = []
#     for token in doc: 
#         if token.text == '!':
#             exclamations.append(token.text)
#     return len(exclamations)

# merge_df['word_count'] = merge_df['text'].apply(word_count)
# merge_df['avg_word_length'] = merge_df['text'].apply(avg_word_length)
# merge_df['exclamation_count'] = merge_df['text'].apply(exclamation_count)

## Export to csv

In [30]:
# merge_df.to_csv('../data/merge_df.csv', index = False)

In [39]:
# # Export select columns of merge_df to save space and time when loading data in
# merge_df_select = merge_df[['rating', 'text', 'asin', 'parent_asin', 'year', 'average_rating', 'rating_number', 'price', 'store',
# 'details', 'tokenized_sentences', 'positive_rating', 'sentence_count', 'word_count_per_sent']]
# merge_df_select.to_csv('../data/merge_df_select.csv', index = False)