This cell selects the features we want from the data sets and then merges them into a file called cleaned_merged_data.csv
- Possibly start doing extra cleaning here: only keep reviews whose helpfulness score is at least 50%, and only when the review has received more than 0 helpfulness votes.

In [None]:
import os
import pandas as pd

# Columns kept from each raw CSV; every other column is skipped at read time.
book_ratings_columns = [
    'Title',
    'review/helpfulness',
    'review/score',
    'review/summary',
    'review/text',
]
books_data_columns = ['Title', 'description', 'categories', 'ratingsCount']

# Books_rating.csv is large, so it is read in fixed-size chunks that are
# stitched back together into one DataFrame afterwards.
chunk_size = 100000
book_ratings_chunks = []

ratings_reader = pd.read_csv(
    '../dataset/Books_rating.csv',
    usecols=book_ratings_columns,
    chunksize=chunk_size,
)
for chunk in ratings_reader:
    book_ratings_chunks.append(chunk)

book_ratings = pd.concat(book_ratings_chunks, axis=0)

books_data = pd.read_csv('../dataset/books_data.csv', usecols=books_data_columns)

# Drop rows that are missing any required field. Note that 'review/text' is
# not in the required set for ratings, so rows without a review body are kept.
required_rating_fields = ['Title', 'review/score', 'review/summary', 'review/helpfulness']
required_book_fields = ['Title', 'description', 'categories', 'ratingsCount']
book_ratings = book_ratings.dropna(subset=required_rating_fields)
books_data = books_data.dropna(subset=required_book_fields)

# Remove rows that are exact duplicates of an earlier row.
book_ratings = book_ratings.drop_duplicates()
books_data = books_data.drop_duplicates()

# Filters out any reviews that don't have at least a 50% approval rate on helpfulness
def filter_helpfulness(row):
    """Return True if a helpfulness rating shows at least 50% approval.

    Args:
        row: A single 'review/helpfulness' value, expected to be a string of
            the form "helpful/total" (for example "3/4").

    Returns:
        True when ``total`` is greater than zero and ``helpful / total`` is at
        least 0.5. False for ratings with no votes, for values that are not
        strings, and for strings that do not parse as exactly two integers.
    """
    # Missing values (NaN/None) and other non-strings cannot be split.
    if not isinstance(row, str):
        return False
    try:
        helpful, total = map(int, row.split('/'))
    except ValueError:
        # Raised both for non-numeric parts and for a wrong number of parts.
        return False
    # The total > 0 check also guards the division against zero votes.
    return total > 0 and helpful / total >= 0.5

# Keep only the reviews whose helpfulness rating passes filter_helpfulness.
helpful_mask = book_ratings['review/helpfulness'].apply(filter_helpfulness)
filtered_rows = book_ratings[helpful_mask]

# Merge datasets based on title. pd.merge defaults to an inner join, so only
# titles present in both tables survive.
merged_data = pd.merge(filtered_rows, books_data, on='Title')

output_dir = '../cleaned_data'

# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(output_dir, 'cleaned_merged_data.csv')

merged_data.to_csv(output_file, index=False)

This cell performs basic text cleaning tasks on the reviews and the book descriptions:
- Make them lowercase
- Remove punctuation
- Remove numerical values
- Remove newlines

In [2]:
import pandas as pd
import string
import re

# Load the merged reviews/book-metadata table saved as cleaned_merged_data.csv.
cleaned_merged_path = '../cleaned_data/cleaned_merged_data.csv'
merged_data = pd.read_csv(cleaned_merged_path)

# Built once so they are not re-created for every value passed to
# preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')


def preprocess_text(text):
    """Apply basic text cleaning to a single value.

    The steps, in order: lowercase the text, delete every character in
    ``string.punctuation``, delete runs of digits, and replace each newline
    with a space.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or an empty string when ``text`` is not a ``str``
        (for example a NaN from a missing CSV field).
    """
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)
    without_digits = _DIGIT_RUN.sub('', without_punctuation)
    return without_digits.replace('\n', ' ')

# Split the title/description pairs into their own table before the
# description column is removed from the reviews table.
columns_for_new_df = merged_data[['Title', 'description']]
descriptions_df = columns_for_new_df.copy()

# Only the title and the two review text columns remain in the reviews table.
columns_to_remove = ['review/helpfulness', 'review/score', 'description', 'categories', 'ratingsCount']
merged_data = merged_data.drop(columns=columns_to_remove)

# Clean every free-text column with the same routine.
for review_column in ('review/summary', 'review/text'):
    merged_data[review_column] = merged_data[review_column].apply(preprocess_text)
descriptions_df['description'] = descriptions_df['description'].apply(preprocess_text)

output_file_one = '../cleaned_data/preprocessed_data.csv'
output_file_two = '../cleaned_data/preprocessed_description.csv'
merged_data.to_csv(output_file_one, index=False)
descriptions_df.to_csv(output_file_two, index=False)


This cell tokenizes the reviews and the book descriptions and removes stop words from them.

In [None]:
import nltk 
# Resources used by the tokenization cell: the Punkt tokenizer data behind
# word_tokenize and the English stop-word list.
# NOTE(review): newer NLTK releases load the tokenizer from 'punkt_tab' rather
# than the pickled 'punkt' models, so both are fetched here to work across
# versions -- confirm against the installed NLTK version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [3]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Cleaned review and description tables to be tokenized.
preprocessedDataPath = '../cleaned_data/preprocessed_data.csv'
preprocessedDescriptionsPath = '../cleaned_data/preprocessed_description.csv'

preprocessedData = pd.read_csv(preprocessedDataPath)
preprocessedDescriptions = pd.read_csv(preprocessedDescriptionsPath)

# Filled on first use so the stop-word set is built once instead of once per
# row passed to tokenizeAndRemoveStopwords.
_stopWordsCache = {}


def _englishStopWords():
    """Return the English stop words as a frozenset, loading them on first use."""
    if 'english' not in _stopWordsCache:
        _stopWordsCache['english'] = frozenset(stopwords.words('english'))
    return _stopWordsCache['english']


def tokenizeAndRemoveStopwords(text):
    """Split text into word tokens and drop English stop words.

    Args:
        text: The value to tokenize.

    Returns:
        A list of tokens from ``word_tokenize`` in their original order, with
        every token found in the English stop-word list removed. An empty
        list when ``text`` is not a ``str`` (for example a NaN read back from
        an empty CSV field).
    """
    if not isinstance(text, str):
        return []
    stopWords = _englishStopWords()
    # Matching is exact, so it relies on the text having been lowercased
    # before it reaches this function.
    return [word for word in word_tokenize(text) if word not in stopWords]

# Tokenize every free-text column with the same routine.
for reviewColumn in ('review/summary', 'review/text'):
    preprocessedData[reviewColumn] = preprocessedData[reviewColumn].apply(tokenizeAndRemoveStopwords)

preprocessedDescriptions['description'] = preprocessedDescriptions['description'].apply(
    tokenizeAndRemoveStopwords
)

# NOTE(review): each cell now holds a Python list, which to_csv writes out as
# text -- confirm that downstream cells parse it back into a list.
outputFileOne = '../cleaned_data/tokenized_data.csv'
outputFileTwo = '../cleaned_data/tokenized_descriptions.csv'

preprocessedData.to_csv(outputFileOne, index=False)
preprocessedDescriptions.to_csv(outputFileTwo, index=False)

This cell performs lemmatization and creates bi-grams.