In [1]:
import pandas as pd
!pip install gcsfs -q

In [2]:

# Load two Datafiniti hotel-review CSV exports from Google Cloud Storage,
# stack them into one frame, drop incomplete and duplicate reviews, and add
# integer hotel/user IDs. The cleaned result is left in `combined_df`.

# Define the paths to your data in Google Cloud Storage
bucket_name = 'wanderlust-recommender-system'
file_path1 = f'gs://{bucket_name}/Datafiniti_Hotel_Reviews.csv'
file_path2 = f'gs://{bucket_name}/Datafiniti_Hotel_Reviews_Jun19.csv'

# Load each CSV file into its own pandas DataFrame.
# A failed load must stop the cell: continuing would either hit a confusing
# NameError on df1/df2 below or silently reuse stale frames left in the kernel
# by an earlier run. The error is still printed, then re-raised unchanged.
try:
    df1 = pd.read_csv(file_path1)
    df2 = pd.read_csv(file_path2)
except Exception as e:
    print(f"Error loading files: {e}")
    raise
else:
    print("Successfully loaded both CSV files.")

# Combine the two DataFrames into one (fresh 0..n-1 index)
combined_df = pd.concat([df1, df2], ignore_index=True)

print("\nSuccessfully combined the datasets.")
print(f"Original size of df1: {len(df1)} rows")
print(f"Original size of df2: {len(df2)} rows")
print(f"Size of combined dataset: {len(combined_df)} rows")

# --- Cleaning & Preprocessing ---

# Define the columns that are essential for reviews
required_columns = ['reviews.text', 'reviews.rating', 'name', 'reviews.username']

# Drop rows that are missing any of these critical values.
# Rebinding (rather than inplace=True) keeps each step an explicit new frame.
initial_rows = len(combined_df)
combined_df = combined_df.dropna(subset=required_columns)
print(f"\nDropped {initial_rows - len(combined_df)} rows with missing required values.")

# Check for and remove duplicate reviews: same text by the same username,
# keeping the first occurrence.
initial_rows = len(combined_df)
combined_df = combined_df.drop_duplicates(subset=['reviews.text', 'reviews.username'])
print(f"Dropped {initial_rows - len(combined_df)} duplicate reviews.")

# NOTE: the index is intentionally not reset, so it keeps gaps where rows were
# dropped; use .iloc (or reset_index) for positional access downstream.


# --- Creating Numerical IDs ---
# factorize assigns 0-based integer codes in order of first appearance.
# assign() writes the new columns onto a fresh copy, which avoids
# chained-assignment warnings on the filtered frame.
# NOTE(review): IDs are keyed on `name` / `reviews.username` alone, so distinct
# hotels sharing a name (or distinct reviewers sharing a username) get the same
# ID — confirm this is intended, or key hotels on the dataset's `id` column.
combined_df = combined_df.assign(
    hotel_id=pd.factorize(combined_df['name'])[0],
    user_id=pd.factorize(combined_df['reviews.username'])[0],
)


# --- Final Verification ---

print("\n--- Final Cleaned DataFrame ---")
combined_df.info()
combined_df.head(20)

Successfully loaded both CSV files.

Successfully combined the datasets.
Original size of df1: 10000 rows
Original size of df2: 10000 rows
Size of combined dataset: 20000 rows

Dropped 1 rows with missing required values.
Dropped 306 duplicate reviews.

--- Final Cleaned DataFrame ---
<class 'pandas.core.frame.DataFrame'>
Index: 19693 entries, 0 to 19999
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    19693 non-null  object 
 1   dateAdded             19693 non-null  object 
 2   dateUpdated           19693 non-null  object 
 3   address               19693 non-null  object 
 4   categories            19693 non-null  object 
 5   primaryCategories     19693 non-null  object 
 6   city                  19693 non-null  object 
 7   country               19693 non-null  object 
 8   keys                  19693 non-null  object 
 9   latitude              19693 non-null  float6

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,primaryCategories,city,country,keys,latitude,...,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sourceURLs,websites,reviews.dateAdded,hotel_id,user_id
0,AVwc252WIN2L1WUfpqLP,2016-10-30T21:42:42Z,2018-09-10T21:06:27Z,5921 Valencia Cir,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,Rancho Santa Fe,US,us/ca/ranchosantafe/5921valenciacir/359754519,32.990959,...,Our experience at Rancho Valencia was absolute...,Best romantic vacation ever!!!!,,,Paula,http://www.hotels.com/ho125419/%25252525253Flo...,http://www.ranchovalencia.com,,0,0
1,AVwc252WIN2L1WUfpqLP,2016-10-30T21:42:42Z,2018-09-10T21:06:27Z,5921 Valencia Cir,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,Rancho Santa Fe,US,us/ca/ranchosantafe/5921valenciacir/359754519,32.990959,...,Amazing place. Everyone was extremely warm and...,Sweet sweet serenity,,,D,http://www.hotels.com/ho125419/%25252525253Flo...,http://www.ranchovalencia.com,,0,1
2,AVwc252WIN2L1WUfpqLP,2016-10-30T21:42:42Z,2018-09-10T21:06:27Z,5921 Valencia Cir,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,Rancho Santa Fe,US,us/ca/ranchosantafe/5921valenciacir/359754519,32.990959,...,We booked a 3 night stay at Rancho Valencia to...,Amazing Property and Experience,,,Ron,http://www.hotels.com/ho125419/%25252525253Flo...,http://www.ranchovalencia.com,,0,2
3,AVwdOclqIN2L1WUfti38,2015-11-28T19:19:35Z,2018-09-10T21:06:16Z,7520 Teague Rd,"Hotels,Hotels and motels,Travel agencies and b...",Accommodation & Food Services,Hanover,US,us/md/hanover/7520teaguerd/-2043779672,39.155929,...,Currently in bed writing this for the past hr ...,"Never again...beware, if you want sleep.",Richmond,VA,jaeem2016,http://www.yellowbook.com/profile/aloft-arunde...,http://www.starwoodhotels.com/alofthotels/prop...,,1,3
4,AVwdOclqIN2L1WUfti38,2015-11-28T19:19:35Z,2018-09-10T21:06:16Z,7520 Teague Rd,"Hotels,Hotels and motels,Travel agencies and b...",Accommodation & Food Services,Hanover,US,us/md/hanover/7520teaguerd/-2043779672,39.155929,...,I live in Md and the Aloft is my Home away fro...,ALWAYS GREAT STAY...,Laurel,MD,MamaNiaOne,http://www.yellowbook.com/profile/aloft-arunde...,http://www.starwoodhotels.com/alofthotels/prop...,,1,4
5,AVwdOclqIN2L1WUfti38,2015-11-28T19:19:35Z,2018-09-10T21:06:16Z,7520 Teague Rd,"Hotels,Hotels and motels,Travel agencies and b...",Accommodation & Food Services,Hanover,US,us/md/hanover/7520teaguerd/-2043779672,39.155929,...,I stayed here with my family for my daughters ...,Wonderful stay,Laurel,MD,kevan777,http://www.yellowbook.com/profile/aloft-arunde...,http://www.starwoodhotels.com/alofthotels/prop...,,1,5
6,AVwdOclqIN2L1WUfti38,2015-11-28T19:19:35Z,2018-09-10T21:06:16Z,7520 Teague Rd,"Hotels,Hotels and motels,Travel agencies and b...",Accommodation & Food Services,Hanover,US,us/md/hanover/7520teaguerd/-2043779672,39.155929,...,Beautiful rooms and the nicest people working ...,Worth the money,,,Princess F,http://www.yellowbook.com/profile/aloft-arunde...,http://www.starwoodhotels.com/alofthotels/prop...,,1,6
7,AVwdOclqIN2L1WUfti38,2015-11-28T19:19:35Z,2018-09-10T21:06:16Z,7520 Teague Rd,"Hotels,Hotels and motels,Travel agencies and b...",Accommodation & Food Services,Hanover,US,us/md/hanover/7520teaguerd/-2043779672,39.155929,...,We stayed here while visiting Maryland Live!. ...,Great Hotel Experiece!,Clayton,NC,DebMurphy57,http://www.yellowbook.com/profile/aloft-arunde...,http://www.starwoodhotels.com/alofthotels/prop...,,1,7
8,AVwdOclqIN2L1WUfti38,2015-11-28T19:19:35Z,2018-09-10T21:06:16Z,7520 Teague Rd,"Hotels,Hotels and motels,Travel agencies and b...",Accommodation & Food Services,Hanover,US,us/md/hanover/7520teaguerd/-2043779672,39.155929,...,"I travel a lot with my job, so I'm constantly ...",Short stay for business.,Boston,MA,kayleighwillswim1224,http://www.yellowbook.com/profile/aloft-arunde...,http://www.starwoodhotels.com/alofthotels/prop...,,1,8
9,AVwePiAX_7pvs4fzBSAl,2016-03-23T04:22:41Z,2018-09-10T21:06:09Z,315 SE Olympia Dr,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,Vancouver,US,us/wa/vancouver/315seolympiadr/1818657156,45.619212,...,"In my line of work, I use meeting space in hot...",Amazing employees and facilities,Portland,,KristyWM,https://www.yellowpages.com/vancouver-wa/mip/h...,http://hamptoninn3.hilton.com/en/hotels/washin...,,2,9
