In [7]:
import os
import pandas as pd

# Clean the book-review and book-metadata CSVs, join them on 'Title', and
# write the result to ../cleaned_data/cleaned_merged_data.csv.

# Only these columns are read from each file; everything else is skipped at
# parse time to keep memory usage down.
book_ratings_columns = ['Title', 'review/helpfulness', 'review/score', 'review/summary', 'review/text']
books_data_columns = ['Title', 'description', 'categories', 'ratingsCount']

# Columns that must be non-null for a row to be kept.
book_ratings_required = ['Title', 'review/score', 'review/summary']
books_data_required = ['Title', 'description', 'categories', 'ratingsCount']

# Load Books_rating.csv in chunks to handle large file size
chunk_size = 100000  # Adjust based on memory constraints
book_ratings_chunks = []

for chunk in pd.read_csv('../dataset/Books_rating.csv', usecols=book_ratings_columns, chunksize=chunk_size):
    # Drop incomplete rows before accumulating the chunk, so rows that would
    # be discarded anyway never add to the peak memory footprint.
    book_ratings_chunks.append(chunk.dropna(subset=book_ratings_required))

# Concatenate the pre-filtered chunks into a single DataFrame. Duplicates are
# removed only after concatenation because the same row can appear in
# different chunks.
book_ratings = pd.concat(book_ratings_chunks, axis=0).drop_duplicates()

# Load books_data.csv with necessary columns, then drop rows that are missing
# any required value and remove exact duplicate rows.
books_data = pd.read_csv('../dataset/books_data.csv', usecols=books_data_columns)
books_data = books_data.dropna(subset=books_data_required).drop_duplicates()

# Merge datasets on 'Title' (pd.merge defaults to an inner join, so only
# titles present in both frames survive).
# NOTE(review): drop_duplicates() above compares whole rows, so a title that
# appears in books_data with differing metadata would still match several
# rows here and multiply its reviews — confirm 'Title' is unique if that
# matters downstream.
merged_data = pd.merge(book_ratings, books_data, on='Title')

# Output location. The relative path is resolved against the current working
# directory of the running process.
output_dir = '../cleaned_data'
output_file = os.path.join(output_dir, 'cleaned_merged_data.csv')

# exist_ok=True creates the directory if needed and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

merged_data.to_csv(output_file, index=False)



Data cleaning and merging completed. The cleaned data has been saved at ../cleaned_data/cleaned_merged_data.csv.
