In [14]:
'''
Importing all required libraries for text cleaning.
Includes libraries for text processing, web scraping, tokenization, and more.
'''

import re  # For regular expressions
import string  # For string operations
import nltk  # For natural language processing
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup  # For web scraping (if needed)
import contractions  # For expanding contractions (e.g., can't -> cannot)
# import spacy  # For advanced NLP tasks
from nltk.tokenize.toktok import ToktokTokenizer  # Toktok tokenizer for tokenization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Spacy model download command (if not already installed)
# !python -m spacy download en_core_web_sm

# Load Spacy language model
# nlp = spacy.load('en_core_web_sm')

# Initialize Toktok tokenizer
tokenizer = ToktokTokenizer()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ksbuf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ksbuf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:


#read each excel sheet into individuals data frames

df1 = pd.read_excel(r"C:\Users\ksbuf\OneDrive\Desktop\Invista PRoject\document-classification\data\Roomba Reviews.xlsx", sheet_name = 'iRobot Roomba 650')
df2 = pd.read_excel(r"C:\Users\ksbuf\OneDrive\Desktop\Invista PRoject\document-classification\data\Roomba Reviews.xlsx", sheet_name = 'iRobot Roomba 880')

#combined dataframes into one
df_combined = pd.concat([df1,df2], ignore_index = True)

In [3]:
df_combined.head()

Unnamed: 0,Date,Product,Rating,Title,Review
0,2015-02-28,iRobot Roomba 650 for Pets,Five Stars,Five Stars,You would not believe how well this works
1,2015-01-12,iRobot Roomba 650 for Pets,Not Five Stars,Four Stars,You just walk away and it does the rest
2,2013-12-26,iRobot Roomba 650 for Pets,Five Stars,Awesome love it.,You have to Roomba proof your house but once t...
3,2015-08-17,iRobot Roomba 650 for Pets,Not Five Stars,Terrible Product - Ruined our Hardwood Floors!,Wow.. I don't know what to say. This thing wor...
4,2015-12-28,iRobot Roomba 650 for Pets,Five Stars,Super-impressed by how well it works!,"Wow, wow, WOW! I wanted to get one of these a..."


In [4]:
df_cleaned = df_combined.drop(['Date'], axis = 1)

In [5]:
df_cleaned['Product'] = df_cleaned['Product'].replace({'iRobot Roomba 650 for Pets': '650', 'iRobot Roomba 880 for Pets and Allergies': '880'})
df_cleaned.head()

Unnamed: 0,Product,Rating,Title,Review
0,650,Five Stars,Five Stars,You would not believe how well this works
1,650,Not Five Stars,Four Stars,You just walk away and it does the rest
2,650,Five Stars,Awesome love it.,You have to Roomba proof your house but once t...
3,650,Not Five Stars,Terrible Product - Ruined our Hardwood Floors!,Wow.. I don't know what to say. This thing wor...
4,650,Five Stars,Super-impressed by how well it works!,"Wow, wow, WOW! I wanted to get one of these a..."


In [6]:
#Look for any null values in our reviews and see if they can be filled in with context from the title 
for title in df_cleaned['Title'][df_cleaned['Review'].isna()]:
    print(title)


Truly a wonderful thing.Reminded me of that old Peter, Paul & Mary song, Marvelous Toy."  Truly a wonderful thing.


In [7]:
#it appears that this could be a review itself rather than a title, the title appears to be "Truly a wonderful thing." So we can make assumptions and fix this

df_cleaned[df_cleaned['Review'].isna()].head()

#split up the title and the review
df_cleaned.loc[240, 'Title'] = 'Truly a wonderful thing.'
df_cleaned.loc[240, 'Review'] = 'Reminded me of that old Peter, Paul & Mary song, Marvelous Toy." Truly a wonderful thing.'

#check to make sure the values are correct
df_cleaned.loc[240].head()

df_cleaned['Review'].isna().sum()

0

In [8]:
df_cleaned['Received Five Stars'] = df_cleaned['Rating']
df_cleaned = df_cleaned.drop('Rating', axis = 1)
df_cleaned['Received Five Stars'] = df_cleaned['Received Five Stars'].replace({'Five Stars': 1, 'Not Five Stars': 0})

  df_cleaned['Received Five Stars'] = df_cleaned['Received Five Stars'].replace({'Five Stars': 1, 'Not Five Stars': 0})


In [9]:
df_cleaned.head()

Unnamed: 0,Product,Title,Review,Received Five Stars
0,650,Five Stars,You would not believe how well this works,1.0
1,650,Four Stars,You just walk away and it does the rest,0.0
2,650,Awesome love it.,You have to Roomba proof your house but once t...,1.0
3,650,Terrible Product - Ruined our Hardwood Floors!,Wow.. I don't know what to say. This thing wor...,0.0
4,650,Super-impressed by how well it works!,"Wow, wow, WOW! I wanted to get one of these a...",1.0


In [10]:
# Apply lower() and strip() to both 'Title' and 'Review' columns
df_cleaned[['Title', 'Review']] = df_cleaned[['Title', 'Review']].apply(lambda x: x.str.lower().str.strip())


In [12]:
#split up our contractions prior to us going through and removing the punctuation 
df_cleaned['Title'] = df_cleaned['Title'].fillna('').apply(contractions.fix)

df_cleaned['Review'] = df_cleaned['Review'].apply(contractions.fix)

In [13]:
#remove all of our punctionation

df_cleaned['Title'] = df_cleaned['Title'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
df_cleaned['Review'] = df_cleaned['Review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
df_cleaned

Unnamed: 0,Product,Title,Review,Received Five Stars
0,650,five stars,you would not believe how well this works,1.0
1,650,four stars,you just walk away and it does the rest,0.0
2,650,awesome love it,you have to roomba proof your house but once t...,1.0
3,650,terrible product ruined our hardwood floors,wow i do not know what to say this thing worke...,0.0
4,650,superimpressed by how well it works,wow wow wow i wanted to get one of these a fe...,1.0
...,...,...,...,...
1828,880,the awesome 880 fixed roombas long existing flaws,after many years and 3 roombas i almost gave u...,
1829,880,i like it better than neato robotics,after having to return two neato botvac vacuum...,
1830,880,actually exceeded my expectations,a few months ago my wife and i bought our drea...,
1831,880,high hopes big disappointment,880 was unable to pick up the dog hair from ou...,


In [16]:
# it will be of more use for us if we are able to take all of our text and combine it into a text column

df_cleaned['All text'] = df_cleaned['Title'] + ' ' + df_cleaned['Review']

df_cleaned

Unnamed: 0,Product,Title,Review,Received Five Stars,All text
0,650,five stars,you would not believe how well this works,1.0,five stars you would not believe how well this...
1,650,four stars,you just walk away and it does the rest,0.0,four stars you just walk away and it does the ...
2,650,awesome love it,you have to roomba proof your house but once t...,1.0,awesome love it you have to roomba proof your ...
3,650,terrible product ruined our hardwood floors,wow i do not know what to say this thing worke...,0.0,terrible product ruined our hardwood floors w...
4,650,superimpressed by how well it works,wow wow wow i wanted to get one of these a fe...,1.0,superimpressed by how well it works wow wow wo...
...,...,...,...,...,...
1828,880,the awesome 880 fixed roombas long existing flaws,after many years and 3 roombas i almost gave u...,,the awesome 880 fixed roombas long existing fl...
1829,880,i like it better than neato robotics,after having to return two neato botvac vacuum...,,i like it better than neato robotics after hav...
1830,880,actually exceeded my expectations,a few months ago my wife and i bought our drea...,,actually exceeded my expectations a few months...
1831,880,high hopes big disappointment,880 was unable to pick up the dog hair from ou...,,high hopes big disappointment 880 was unable t...
