In [None]:
# --- Install Libraries ---

!pip install sentence-transformers pyarrow gcsfs datasets accelerate>=0.26.0 -q

# --- Import Libraries ---
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# --- Load the Processed Data ---
# The processed hotel reviews are read from a Parquet file in the project's GCS bucket.
bucket_name = 'wanderlust-recommender-system'
file_path = f'gs://{bucket_name}/processed/combined_hotel_reviews.parquet'
df = pd.read_parquet(path=file_path)

# --- Verification ---
print("--- Successfully loaded processed data ---")

# Preview the first five rows, then summarise the columns and their dtypes.
display(df.head(5))
df.info()

In [None]:
import random
from tqdm.notebook import tqdm

# --- Separate reviews by sentiment ---
# Ratings above 3 are treated as positive; ratings of 3 or below as negative.
# Each subset is then indexed by hotel as {hotel_id: [review text, ...]} so the
# triplet sampling below can look reviews up directly.
ratings = df["reviews.rating"]
positive_reviews_df = df[ratings > 3].copy()
negative_reviews_df = df[ratings <= 3].copy()


def _reviews_by_hotel(reviews_df):
    """Return a dict mapping each hotel_id in `reviews_df` to the list of its review texts."""
    grouped_texts = reviews_df.groupby("hotel_id")["reviews.text"]
    return grouped_texts.apply(list).to_dict()


pos_reviews_by_hotel = _reviews_by_hotel(positive_reviews_df)
neg_reviews_by_hotel = _reviews_by_hotel(negative_reviews_df)

# --- Generate the training triplets ---
train_examples = []
# Number of triplets generated for EACH of the two triplet types (intra-hotel and
# inter-hotel), so the final training set holds 2 * n_examples_per_type triplets.
n_examples_per_type = 200

# --- Part A: Create Intra-Hotel (Sentiment-based) Triplets ---
print(f"Generating {n_examples_per_type} Intra-Hotel triplets ...")

# Eligible hotels have at least two positive reviews (anchor + positive) AND at
# least one negative review (the negative of the triplet).
valid_hotels_intra = [
    hid
    for hid, pos_reviews in pos_reviews_by_hotel.items()
    if len(pos_reviews) > 1 and hid in neg_reviews_by_hotel
]

# Fail early with a clear message rather than an opaque IndexError from random.choice.
if n_examples_per_type > 0 and not valid_hotels_intra:
    raise ValueError(
        "No hotel has both >= 2 positive reviews and >= 1 negative review; "
        "cannot build intra-hotel triplets."
    )

for _ in tqdm(range(n_examples_per_type)):
    # Select a hotel that meets the criteria above.
    hotel_id = random.choice(valid_hotels_intra)

    # Anchor and positive: two distinct entries from the hotel's positive reviews.
    anchor_review, positive_review = random.sample(pos_reviews_by_hotel[hotel_id], 2)

    # Negative: one negative review from the SAME hotel.
    negative_review = random.choice(neg_reviews_by_hotel[hotel_id])

    train_examples.append(InputExample(texts=[anchor_review, positive_review, negative_review]))

# --- Part B: Create Inter-Hotel (Hotel-based) Triplets ---
print(f"Generating {n_examples_per_type} Inter-Hotel triplets ...")

# Candidate hotels need at least two positive reviews, so that any of them can
# act as the anchor hotel (anchor and positive come from the same hotel).
valid_hotels_inter = [
    hid for hid, pos_reviews in pos_reviews_by_hotel.items() if len(pos_reviews) > 1
]

# An inter-hotel triplet needs two DIFFERENT hotels; fail early if that is impossible.
if n_examples_per_type > 0 and len(valid_hotels_inter) < 2:
    raise ValueError(
        "At least two hotels with >= 2 positive reviews are required "
        "to build inter-hotel triplets."
    )

for _ in tqdm(range(n_examples_per_type)):
    # Draw the anchor hotel and the negative hotel without replacement. The ids
    # are unique dict keys, so the two hotels are guaranteed to differ.
    # (The previous single retry assigned to an unused variable, so the negative
    # could silently come from the anchor's own hotel.)
    anchor_hotel_id, negative_hotel_id = random.sample(valid_hotels_inter, 2)

    # Anchor and positive: two distinct entries from the anchor hotel's positive reviews.
    anchor_review, positive_review = random.sample(pos_reviews_by_hotel[anchor_hotel_id], 2)

    # Negative: a positive review from the DIFFERENT hotel.
    negative_review = random.choice(pos_reviews_by_hotel[negative_hotel_id])

    train_examples.append(InputExample(texts=[anchor_review, positive_review, negative_review]))
    
# --- Shuffle and Verify ---
# Mix the two triplet types in place, then report how many triplets were built.
random.shuffle(train_examples)
n_triplets = len(train_examples)
print(f"Successfully created and shuffled total of {n_triplets} triplets.")

In [None]:
# --- Define the Model ---
# Start from a pre-trained checkpoint of the sentence-transformers library.
model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# --- Define the DataLoader ---
# Serves the triplets in shuffled mini-batches.
train_batch_size = 8
train_dataloader = DataLoader(
    train_examples,
    batch_size=train_batch_size,
    shuffle=True,
)

# --- Define the Loss Function ---
# Triplet loss over cosine distance with a margin of 1.0.
train_loss = losses.TripletLoss(
    model=model,
    distance_metric=losses.TripletDistanceMetric.COSINE,
    triplet_margin=1.0,
)

# --- Set Training Parameters ---
# One or two epochs is usually enough and limits forgetting of the base model.
num_epochs = 1

# Warm up over the first 10% of all training steps (a common choice).
steps_per_epoch = len(train_dataloader)
warmup_steps = int(steps_per_epoch * num_epochs * 0.1)

# Summarise the configuration before training starts.
config_summary = (
    "--- Model and Trainer Configuration ---",
    f"Base Model: {model_name}",
    f"Training with {len(train_examples)} triplets",
    f"Number of epochs: {num_epochs}",
    f"Batch size: {train_batch_size}",
    f"Warmup steps: {warmup_steps}",
    f"Loss function: TripletLoss with Cosine Distance and margin=1.0",
)
for summary_line in config_summary:
    print(summary_line)



In [None]:
# --- Start Training ---

output_path = 'finetuned_hotel_recommender'
print(f"Fine-tunining now begins and the new model will be saved in {output_path}.")

model.fit(train_objectives=[(train_dataloader,train_loss)], epochs=num_epochs,\
         warmup_steps=warmup_steps, output_path=output_path, show_progress_bar=True)

# Copy local trained model to Google storage bucket
!gsutil -m mv {output_path} gs://{bucket_name}/processed/

print("\n--- Training Completed ---")
print(f"New fine-tuned model is saved in {output_path} folder.")