In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load the two raw inputs: the scraped GoodReads list (df) and the
# Google Books API export (df_api). Paths are relative to the notebook.
df = pd.read_csv(filepath_or_buffer="../Data/Raw/GoodReads_500.csv")
df_api = pd.read_csv(filepath_or_buffer="../Data/Raw/Googlebooks.csv")

# 1. Cleaning GoodReads Scraped Data 

In [None]:
df.head(3)

In [None]:
# 1. Mark rows that need manual review: 'Review_Flag' is True whenever at
#    least one of the key fields below is null.
key_columns = ['Title', 'Author', 'Avg_Rating', 'Genres']
df['Review_Flag'] = df[key_columns].isna().any(axis='columns')

# Subset of flagged rows, kept for inspection.
missing_key_col = df.loc[df['Review_Flag']]

In [None]:
# 2. Remove duplicate rows (rows whose values match in every data column).
#    'Unnamed: 0' is left out of the comparison: it is dropped later in this
#    notebook and looks like a saved row index (TODO confirm), in which case it
#    differs on every row and would stop any duplicate from being detected.
#    Reassigning instead of inplace=True keeps the cell safe to re-run.
df = df.drop_duplicates(subset=df.columns.difference(['Unnamed: 0']).tolist())

In [None]:
# Function to extract text within parentheses and store it in the Series column.
def extract_series(title):
    """Return the text inside the first pair of parentheses in ``title``.

    Parameters
    ----------
    title : str or missing value
        Raw title, e.g. ``"The Hunger Games (The Hunger Games, #1)"``.

    Returns
    -------
    str
        The stripped text between the first '(' and the next ')', or an
        empty string when there are no parentheses or ``title`` is not a
        string (for example NaN/None for a missing title).
    """
    # Missing titles arrive as NaN/None; re.search would raise TypeError on them.
    if not isinstance(title, str):
        return ''
    # Non-greedy match so only the first parenthesised group is captured.
    match = re.search(r'\((.*?)\)', title)
    return match.group(1).strip() if match else ''

# Function to remove the parentheses and their contents from the title.
def clean_title(title):
    """Remove every parenthesised group (and the whitespace before it) from ``title``.

    Parameters
    ----------
    title : str or missing value
        Raw title, e.g. ``"The Hunger Games (The Hunger Games, #1)"``.

    Returns
    -------
    str or the input unchanged
        The stripped title without parenthesised text. Non-string input
        (for example NaN/None for a missing title) is returned as-is so
        missing values stay missing instead of raising TypeError.
    """
    if not isinstance(title, str):
        return title
    return re.sub(r'\s*\(.*?\)', '', title).strip()

# Derive 'Series' from the parenthesised part of each title. This must run
# before the titles are cleaned, because cleaning removes that text.
df['Series'] = df['Title'].map(extract_series)
# Overwrite 'Title' with the version that has the parenthesised text stripped.
df['Title'] = df['Title'].map(clean_title)

In [None]:
# 4. Split the 'Format_and_Page' column into 'Page_Number' and 'Format'
#    Example formats: "232 pages, paperback" or "232, paperback". We'll extract digits as page number and the rest as format.
def split_format_and_page(text):
    """Split a combined "pages, format" value into its two parts.

    Parameters
    ----------
    text : str, number or missing value
        For example ``"232 pages, Paperback"``, ``"232, paperback"`` or
        ``"1,232 pages, Hardcover"``.

    Returns
    -------
    tuple
        ``(page_number, format_text)``. ``page_number`` is the first number
        found, as a string of digits with thousands separators removed, or
        ``pd.NA`` when the text has no digits. ``format_text`` is what is
        left after removing digits, the word "page"/"pages" and commas.
        Missing input yields ``(pd.NA, pd.NA)``.
    """
    if pd.isnull(text):
        return pd.NA, pd.NA
    # Non-string values (e.g. a bare integer page count) would make re.search raise.
    text = str(text)
    # Take a thousands-separated number ("1,232") as a whole; matching plain
    # \d+ alone would report only the digits before the comma ("1").
    page_match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', text)
    page_number = page_match.group(0).replace(',', '') if page_match else pd.NA
    # Whatever remains after dropping digits, "page(s)" and commas is the format.
    format_text = re.sub(r'\d+', '', text)
    format_text = re.sub(r'pages?', '', format_text, flags=re.IGNORECASE)
    format_text = format_text.replace(',', '').strip()
    return page_number, format_text

# split_format_and_page returns a (page_number, format) pair for each value.
# Wrapping that pair in pd.Series makes apply() expand it into a two-column
# frame, which is assigned positionally to 'Page_Number' and 'Format'.
df[['Page_Number', 'Format']] = df['Format_and_Page'].apply(lambda x: pd.Series(split_format_and_page(x)))


In [None]:
df.head()

In [None]:
# 5. Clean the 'Genres' column: replace newline characters with ', ' so the
#    genres are comma-separated, and trim surrounding whitespace.
#    astype(str) alone would turn missing genres into the literal text 'nan',
#    so the final where() restores the original missing values.
df['Genres'] = (
    df['Genres'].astype(str)
    .str.replace(r'\n', ', ', regex=True)
    .str.strip()
    .where(df['Genres'].notna())
)

In [None]:
# Discard columns no longer needed: the unnamed leading column from the CSV,
# the combined field that has been split above, and the review flag.
df = df.drop(columns=['Unnamed: 0', 'Format_and_Page', 'Review_Flag'])

In [None]:
# Convert the rating to a numeric column and the page count to a nullable
# integer column; values that cannot be parsed become missing.
df = df.assign(
    Avg_Rating=pd.to_numeric(df['Avg_Rating'], errors='coerce'),
    Page_Number=pd.to_numeric(df['Page_Number'], errors='coerce').astype('Int64'),
)

In [None]:
df.info()

# 2. Cleaning Google API Data

In [None]:
df_api.head()

In [None]:
# 1. Mark Google API rows with a null in any key field ('Review_Flag' is True
#    for those rows), keep them aside for inspection, and continue with the
#    complete rows only.
key_columns = ['Book Title', 'Authors', 'Genres']
df_api['Review_Flag'] = df_api[key_columns].isna().any(axis='columns')

missing_key_col = df_api.loc[df_api['Review_Flag']]

df_api = df_api.loc[~df_api['Review_Flag']]


In [None]:
# 2. Keep only books with at least 20 pages.
#    Rows whose page count is missing fail the comparison and are removed too.
df_api = df_api.loc[df_api['Number of Pages'].ge(20)]


In [None]:
df_api.info()

In [None]:
# Remove the 'ASIN' column and the review flag, which has served its purpose.
df_api = df_api.drop(columns=['ASIN', 'Review_Flag'])

In [None]:
df_api.info()

In [None]:
# Remove rows that are exact duplicates, i.e. identical in every column.
df_api = df_api.drop_duplicates()

In [None]:
# Keep only the first row for each 'Book Title'.
# NOTE(review): this keys on the title alone, so different books that share a
# title (different authors) are collapsed into one row - confirm that is intended.
df_api = df_api.loc[~df_api.duplicated(subset=['Book Title'], keep='first')]

In [None]:
df_api

In [None]:
# Align the Google API column names with the GoodReads frame so the two can
# be stacked later. Columns not listed here keep their names.
df_api = df_api.rename(
    columns={
        'Book Title': 'Title',
        'Authors': 'Author',
        'Rating': 'Avg_Rating',
        'Number of Pages': 'Page_Number',
    }
)

df_api

In [None]:
# Give df_api every column the GoodReads frame has (absent ones are filled
# with empty strings), then select the columns in df's order so both frames
# share the same layout. Columns that exist only in df_api are dropped.
absent_cols = [name for name in df.columns if name not in df_api.columns]
df_api = df_api.assign(**{name: '' for name in absent_cols})[df.columns]

In [None]:
df_api.info()

In [None]:
# Parse ISBNs as nullable integers; anything that is not a plain number
# becomes missing.
# NOTE(review): numeric parsing drops leading zeros and cannot represent
# ISBN-10 values ending in 'X' - confirm this is acceptable for matching.
df_api['ISBN'] = (
    pd.to_numeric(df_api['ISBN'], errors='coerce')
    .astype('Int64')
)

In [None]:
# Cast rank, rating count and page count to nullable integers (unparseable
# values, including the empty-string placeholders, become missing) and store
# ISBN as a generic object column.
for int_col in ('Rank', 'Num_Ratings', 'Page_Number'):
    df_api[int_col] = pd.to_numeric(df_api[int_col], errors='coerce').astype('Int64')
df_api['ISBN'] = df_api['ISBN'].astype(object)

In [None]:
# Save the cleaned datasets separately before merging.
# index=False keeps the DataFrame index out of the files; otherwise it is
# written as an unnamed first column that shows up as 'Unnamed: 0' when the
# CSV is read back. This also matches the combined_books.csv export below.
df_api.to_csv("../Data/Clean/Google_API_Clean.csv", index=False)
df.to_csv("../Data/Clean/GoodReads_Clean.csv", index=False)

In [None]:
# Stack the GoodReads rows on top of the Google API rows, renumbering the
# index from zero.
frames_to_stack = [df, df_api]
combined_df = pd.concat(frames_to_stack, ignore_index=True)

# Write the stacked (not yet de-duplicated) data next to the notebook,
# without the index column.
combined_df.to_csv('combined_books.csv', index=False)

In [None]:
combined_df

In [None]:
# Drop rows that repeat an ISBN already seen, keeping the first occurrence.
# duplicated() treats missing values as equal to each other, so a plain
# drop_duplicates(subset=['ISBN']) would also discard every row after the
# first whose ISBN is missing. Rows without an ISBN are therefore always kept.
isbn_missing = combined_df['ISBN'].isna()
isbn_repeat = combined_df.duplicated(subset=['ISBN'], keep='first')
combined_df = combined_df[isbn_missing | ~isbn_repeat]

In [None]:
# Give the rank column a name that says which source it comes from.
combined_df = combined_df.rename(
    columns={'Rank': 'Good_Reads_Popularity_Rank'}
)

In [None]:
# Renumber the rows from zero after de-duplication, discarding the old index.
combined_df = combined_df.reset_index(drop=True)

In [None]:
# Save the final dataset.
# index=False keeps the DataFrame index out of the file (it would otherwise be
# written as an unnamed first column), matching the combined_books.csv export.
combined_df.to_csv("../Data/Clean/Combined_Book_Data.csv", index=False)