In [None]:
import pandas as pd
!pip install gcsfs -q

In [None]:
# Locations of the two raw hotel-review exports in Google Cloud Storage.
bucket_name = 'wanderlust-recommender-system'
file_path1 = f'gs://{bucket_name}/Datafiniti_Hotel_Reviews.csv'
file_path2 = f'gs://{bucket_name}/Datafiniti_Hotel_Reviews_Jun19.csv'

# Load each CSV file into its own pandas DataFrame.
# If either read fails, report the error and re-raise it: every later step
# needs both df1 and df2, and continuing would either hit a NameError or,
# in a re-run notebook, silently reuse stale frames from an earlier run.
try:
    df1 = pd.read_csv(file_path1)
    df2 = pd.read_csv(file_path2)
except Exception as e:
    print(f"Error loading files: {e}")
    raise
else:
    print("Successfully loaded both CSV files.")

# Stack the two source frames vertically; ignore_index=True gives the result
# a fresh 0..n-1 index instead of repeating each frame's own row labels.
source_frames = [df1, df2]
combined_df = pd.concat(source_frames, ignore_index=True)

# Report the row counts before and after the merge as a quick sanity check.
rows_df1 = len(df1)
rows_df2 = len(df2)
rows_combined = len(combined_df)
print(f"\nSuccessfully combined the datasets.")
print(f"Original size of df1: {rows_df1} rows")
print(f"Original size of df2: {rows_df2} rows")
print(f"Size of combined dataset: {rows_combined} rows")

# --- Cleaning & Preprocessing ---

# A row is kept only when all of these columns are populated.
required_columns = ['reviews.text', 'reviews.rating', 'name', 'reviews.username']

# Step 1: discard rows with a missing value in any required column,
# and report how many rows that removed.
rows_before_dropna = len(combined_df)
combined_df = combined_df.dropna(subset=required_columns)
missing_dropped = rows_before_dropna - len(combined_df)
print(f"\nDropped {missing_dropped} rows with missing required values.")

# Step 2: treat rows sharing the same review text and username as duplicates;
# the first occurrence is kept (the drop_duplicates default).
rows_before_dedup = len(combined_df)
combined_df = combined_df.drop_duplicates(subset=['reviews.text', 'reviews.username'])
duplicates_dropped = rows_before_dedup - len(combined_df)
print(f"Dropped {duplicates_dropped} duplicate reviews.")


# --- Creating Numerical IDs ---
# Map each distinct hotel name and each distinct username to an integer code.
# pd.factorize returns (codes, uniques); only the codes are stored.
for id_column, source_column in (('hotel_id', 'name'), ('user_id', 'reviews.username')):
    combined_df[id_column] = pd.factorize(combined_df[source_column])[0]


# --- Final Verification ---

print("\n--- Final Cleaned DataFrame ---")
# info() prints the column dtypes and non-null counts itself.
combined_df.info()
# Print the preview explicitly: a bare `combined_df.head(20)` in the middle of
# a cell is evaluated and thrown away, since only a cell's last expression is
# displayed automatically.
print(combined_df.head(20))

# --- Save the Cleaned DataFrame ---

print("Saving the final DataFrame to GCS...")

# The cleaned data goes under the bucket's processed/ prefix as one Parquet file.
destination_path = f'gs://{bucket_name}/processed/combined_hotel_reviews.parquet'

# Write the frame out using the default Parquet engine and settings.
combined_df.to_parquet(path=destination_path)

print(f"Successfully saved to {destination_path}")