This cell selects the features we want from the two data sets, cleans them, and merges them into a file called cleaned_merged_data.csv.

In [None]:
import os
import pandas as pd

# Columns to keep from each source file
bookRatingsColumns = ['Title', 'review/helpfulness', 'review/score', 'review/summary', 'review/text']
booksDataColumns = ['Title', 'description', 'categories', 'ratingsCount']

# Load Books_rating.csv in chunks to handle large file size.
# NOTE: every chunk is kept and concatenated below, so the complete table
# still has to fit in memory once loading is done.
chunkSize = 100000
bookRatingsChunks = list(
    pd.read_csv('../dataset/Books_rating.csv', usecols=bookRatingsColumns, chunksize=chunkSize)
)

# Concatenate chunks into a single DataFrame
bookRatings = pd.concat(bookRatingsChunks, axis=0)

# Load books_data.csv with important columns
books_data = pd.read_csv('../dataset/books_data.csv', usecols=booksDataColumns)

# Handle missing values by dropping rows with any missing values in critical columns.
# 'review/text' and 'review/helpfulness' are not in the subset, so rows that
# lack only those values are kept.
bookRatings.dropna(subset=['Title', 'review/score', 'review/summary'], inplace=True)
books_data.dropna(subset=['Title', 'description', 'categories', 'ratingsCount'], inplace=True)

# Remove rows that are exact duplicates across all loaded columns
bookRatings.drop_duplicates(inplace=True)
books_data.drop_duplicates(inplace=True)

# Merge datasets on 'Title' (pandas' default inner join: only titles present
# in both tables survive)
merged_data = pd.merge(bookRatings, books_data, on='Title')

# Output directory and file path, relative to the current working directory
output_dir = '../cleaned_data'

# exist_ok=True creates the directory when missing and is a no-op otherwise,
# which avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(output_dir, 'cleaned_merged_data.csv')

merged_data.to_csv(output_file, index=False)

This cell performs basic text cleaning tasks on the reviews: 
- Make them lowercase
- Remove punctuation
- Remove numerical values
- Replace newlines with spaces

In [9]:
import pandas as pd
import string
import re

# Reload the merged table written to disk by the merge cell
merged_data = pd.read_csv(
    filepath_or_buffer='../cleaned_data/cleaned_merged_data.csv',
)

# Built once at definition time instead of on every call: preprocessText is
# applied to every review row, and neither object depends on the input.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN_RE = re.compile(r'\d+')


def preprocessText(text):
    """Normalise one review string.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, delete runs of digits, and replace each
    ``'\\n'`` with a single space.

    Punctuation is deleted rather than replaced by a space, so
    ``"well-written"`` becomes ``"wellwritten"``.

    Args:
        text: The cell value to clean. Anything that is not a ``str``
            (for example the NaN pandas uses for a missing value) is
            treated as empty.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGIT_RUN_RE.sub('', text)
    return text.replace('\n', ' ')

# Clean both free-text review columns, one after the other
for reviewColumn in ('review/summary', 'review/text'):
    merged_data[reviewColumn] = merged_data[reviewColumn].apply(preprocessText)

# Persist the cleaned table for the tokenization step
outputFile = '../cleaned_data/preprocessed_data.csv'
merged_data.to_csv(outputFile, index=False)


This cell tokenizes the review summaries and review texts and removes English stop words from them.

In [5]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below. Newer NLTK releases look up the
# 'punkt_tab' tokenizer data for word_tokenize and raise LookupError when it
# is missing; 'punkt' is still downloaded for older releases.
for nltkResource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltkResource)

# Load the cleaned reviews written by the text-cleaning cell
preprocessedData = pd.read_csv('../cleaned_data/preprocessed_data.csv')

_englishStopWordsCache = None  # filled on the first string that is tokenized


def _englishStopWords():
    """Return NLTK's English stop-word set, loading the corpus only once."""
    global _englishStopWordsCache
    if _englishStopWordsCache is None:
        _englishStopWordsCache = frozenset(stopwords.words('english'))
    return _englishStopWordsCache


def tokenizeAndRemoveStopwords(text):
    """Split ``text`` into tokens and drop English stop words.

    Args:
        text: The cell value to tokenize. Anything that is not a ``str``
            (for example the NaN pandas uses for an empty cell) yields
            an empty list.

    Returns:
        A list of the tokens produced by ``word_tokenize`` that are not in
        NLTK's English stop-word list, in their original order.
    """
    if not isinstance(text, str):
        return []

    # The stop-word set is cached: rebuilding it per call would re-read the
    # corpus for every review row this function is applied to.
    stopWords = _englishStopWords()
    return [word for word in word_tokenize(text) if word not in stopWords]

# Tokenize both review columns; each cell becomes a list of tokens
for reviewColumn in ('review/summary', 'review/text'):
    preprocessedData[reviewColumn] = preprocessedData[reviewColumn].apply(tokenizeAndRemoveStopwords)

# Write the tokenized table to disk
outputFile = '../cleaned_data/tokenized_data.csv'
preprocessedData.to_csv(outputFile, index=False)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/patricknguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patricknguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
