In [45]:
'''
Importing all required libraries for text cleaning.
Includes libraries for text processing, web scraping, tokenization, and more.
'''

import re  # For regular expressions
import string  # For string operations
import nltk  # For natural language processing
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup  # For web scraping (if needed)
import contractions  # For expanding contractions (e.g., can't -> cannot)
import spacy  # For advanced NLP tasks
from nltk.tokenize.toktok import ToktokTokenizer  # Toktok tokenizer for tokenization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
# Spacy model download command (if not already installed)
# !python -m spacy download en_core_web_sm

# Load Spacy language model
nlp = spacy.load('en_core_web_sm')

# Initialize Toktok tokenizer
tokenizer = ToktokTokenizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ksbuf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ksbuf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
#read each excel sheet into individuals data frames

df1 = pd.read_excel(r"C:\Users\ksbuf\OneDrive\Desktop\Invista PRoject\document-classification\data\Roomba Reviews.xlsx", sheet_name = 'iRobot Roomba 650')
df2 = pd.read_excel(r"C:\Users\ksbuf\OneDrive\Desktop\Invista PRoject\document-classification\data\Roomba Reviews.xlsx", sheet_name = 'iRobot Roomba 880')

#combined dataframes into one
df_combined = pd.concat([df1,df2], ignore_index = True)

In [47]:
df_cleaned = df_combined.drop(['Date'], axis = 1)

In [48]:
df_cleaned['Product'] = df_cleaned['Product'].replace({'iRobot Roomba 650 for Pets': '650', 'iRobot Roomba 880 for Pets and Allergies': '880'})

In [49]:
#Look for any null values in our reviews and see if they can be filled in with context from the title 
for title in df_cleaned['Title'][df_cleaned['Review'].isna()]:
    print(title)


Truly a wonderful thing.Reminded me of that old Peter, Paul & Mary song, Marvelous Toy."  Truly a wonderful thing.


In [50]:
#it appears that this could be a review itself rather than a title, the title appears to be "Truly a wonderful thing." So we can make assumptions and fix this

df_cleaned[df_cleaned['Review'].isna()].head()

#split up the title and the review
df_cleaned.loc[240, 'Title'] = 'Truly a wonderful thing.'
df_cleaned.loc[240, 'Review'] = 'Reminded me of that old Peter, Paul & Mary song, Marvelous Toy." Truly a wonderful thing.'

#check to make sure the values are correct
df_cleaned.loc[240].head()

df_cleaned['Review'].isna().sum()

0

In [51]:
df_cleaned['Received Five Stars'] = df_cleaned['Rating']
df_cleaned = df_cleaned.drop('Rating', axis = 1)
df_cleaned['Received Five Stars'] = df_cleaned['Received Five Stars'].replace({'Five Stars': 1, 'Not Five Stars': 0})

  df_cleaned['Received Five Stars'] = df_cleaned['Received Five Stars'].replace({'Five Stars': 1, 'Not Five Stars': 0})


In [52]:
# Apply lower() and strip() to both 'Title' and 'Review' columns
df_cleaned[['Title', 'Review']] = df_cleaned[['Title', 'Review']].apply(lambda x: x.str.lower().str.strip())


In [53]:
#split up our contractions prior to us going through and removing the punctuation 
df_cleaned['Title'] = df_cleaned['Title'].fillna('').apply(contractions.fix)

df_cleaned['Review'] = df_cleaned['Review'].apply(contractions.fix)

In [54]:
#remove all of our punctionation

df_cleaned['Title'] = df_cleaned['Title'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
df_cleaned['Review'] = df_cleaned['Review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)

In [55]:

# Remove extra spaces from the 'Title' and 'Review' columns
df_cleaned[['Title', 'Review']] = df_cleaned[['Title', 'Review']].apply(lambda x: x.str.split().str.join(' '))

In [56]:
# remove any stop words from our text columns

# Tokenize the text in each row of the column, remove stopwords, and join the tokens back
stop_words = set(stopwords.words('english'))

# Apply the stopword removal to the 'Review' column without a function
df_cleaned['Review'] = df_cleaned['Review'].apply(
    lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))


In [57]:
def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_text = ' '.join([token.lemma_ for token in doc])
    return lemmatized_text

df_cleaned['Review'] = df_cleaned['Review'].apply(lemmatize_text)
df_cleaned['Title'] = df_cleaned['Title'].apply(lemmatize_text)

In [58]:
#check for top 20 most common words and see if we need to create a unique stop words list to drop these words
def word_frequency(text, N):
    tokens = word_tokenize(text)  # Tokenizing text into words
    frequency = Counter(tokens)  # Calculating the frequency of each word
    return frequency.most_common(N)  # Returning the top N most frequent words

text = ' '.join(df_cleaned['Review'].astype(str))
top_words = word_frequency(text, 20)

for word in top_words:
    print(word)

('roomba', 2291)
('clean', 2200)
('get', 1774)
('vacuum', 1282)
('floor', 1249)
('time', 1115)
('work', 985)
('one', 970)
('well', 934)
('room', 930)
('hair', 911)
('would', 893)
('go', 826)
('use', 825)
('run', 803)
('day', 771)
('house', 769)
('every', 683)
('thing', 673)
('like', 665)


In [59]:
# create a list with the additional stop words that we should remove or would not be super useful

more_stop_words = [ 'roomba', 'get', 'go', 'thing', 'like']

df_cleaned['Review'] = df_cleaned['Review'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in more_stop_words]))
#check for top 20 most common words and see if we need to create a unique stop words list to drop these words
def word_frequency(text, N):
    tokens = word_tokenize(text)  # Tokenizing text into words
    frequency = Counter(tokens)  # Calculating the frequency of each word
    return frequency.most_common(N)  # Returning the top N most frequent words

text = ' '.join(df_cleaned['Review'].astype(str))
top_words = word_frequency(text, 20)

for word in top_words:
    print(word)

('clean', 2200)
('vacuum', 1282)
('floor', 1249)
('time', 1115)
('work', 985)
('one', 970)
('well', 934)
('room', 930)
('hair', 911)
('would', 893)
('use', 825)
('run', 803)
('day', 771)
('house', 769)
('every', 683)
('pick', 643)
('much', 640)
('good', 639)
('dog', 628)
('great', 591)


In [60]:
# it will be of more use for us if we are able to take all of our text and combine it into a text column

df_cleaned['All text'] = df_cleaned['Title'] + ' ' + df_cleaned['Review']

In [61]:
'''
sp;it out our data that will be used to train the model and the data that we will then test the model on
'''

# Creating DataFrame for test data where 'Received Five Stars' is NaN
df_test_data = df_cleaned[df_cleaned['Received Five Stars'].isna()]

# Creating DataFrame for training data where 'Received Five Stars' is not NaN
df_training_data = df_cleaned[df_cleaned['Received Five Stars'].notna()]

# Getting the shapes of both DataFrames
test_data_shape = df_test_data.shape
training_data_shape = df_training_data.shape

test_data_shape, training_data_shape


((911, 5), (922, 5))

In [62]:
df_cleaned.to_excel(r"C:\Users\ksbuf\OneDrive\Desktop\Invista PRoject\document-classification\data\cleaned_data_roomba.xlsx", index=False)
df_test_data.to_excel(r"C:\Users\ksbuf\OneDrive\Desktop\Invista PRoject\document-classification\data\test_data_roomba.xlsx", index=False)
df_training_data.to_excel(r"C:\Users\ksbuf\OneDrive\Desktop\Invista PRoject\document-classification\data\training_data_roomba.xlsx", index=False)