In [None]:
import pandas as pd

# Upper bound on the number of reviews used for the keyword exploration below.
SAMPLE_SIZE = 1_000_000

# Only the review text is needed here; rows without text are dropped up front.
df = pd.read_csv("/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/data/processed/Train.csv", usecols=["text"]).dropna()

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at what is available.
# The fixed random_state keeps the sample reproducible across runs.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# Lower-case so that word counts are case-insensitive.
df["text"] = df["text"].str.lower()

In [12]:

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk


# Number of vocabulary terms to keep; also drives the label printed below.
TOP_N_WORDS = 100

nltk.download("stopwords")
stop_words = list(stopwords.words("english"))

# Extract the TOP_N_WORDS most frequent non-stopword terms across the sample.
vectorizer = CountVectorizer(stop_words=stop_words, max_features=TOP_N_WORDS)
X = vectorizer.fit_transform(df["text"])

# Get the top keywords. Note: the recorded output lists them alphabetically,
# i.e. this is the retained vocabulary, not a frequency ranking.
top_words = vectorizer.get_feature_names_out()

# Report the actual vocabulary size instead of a hard-coded (and wrong) count.
print(f"Top {len(top_words)} words:\n", top_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/018171153/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 300 words:
 ['also' 'always' 'amazing' 'area' 'around' 'away' 'back' 'bar' 'bathroom'
 'beach' 'beautiful' 'bed' 'beds' 'best' 'better' 'bit' 'booked'
 'breakfast' 'check' 'clean' 'close' 'comfortable' 'could' 'day'
 'definitely' 'desk' 'even' 'every' 'everything' 'excellent' 'experience'
 'family' 'first' 'floor' 'food' 'found' 'free' 'friendly' 'front' 'get'
 'go' 'good' 'got' 'great' 'helpful' 'hot' 'hotel' 'large' 'like' 'little'
 'location' 'lovely' 'made' 'make' 'many' 'morning' 'much' 'need' 'next'
 'nice' 'night' 'nights' 'old' 'one' 'people' 'perfect' 'place' 'pool'
 'price' 'quiet' 'quite' 'really' 'recommend' 'resort' 'restaurant'
 'restaurants' 'right' 'room' 'rooms' 'see' 'service' 'shower' 'small'
 'staff' 'stay' 'stayed' 'time' 'town' 'trip' 'two' 'us' 'view' 'walk'
 'want' 'water' 'way' 'well' 'went' 'wonderful' 'would']


In [15]:
!pip3 install -q sentence-transformers transformers scikit-multilearn

In [None]:
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import numpy as np

# Candidate topics. Each phrase is embedded once and every review is compared
# against all of them; the phrase (spaces -> underscores) also names the
# "has_<topic>" output column.
topics = [
    "room quality", "location", "staff service",
    "cleanliness", "amenities", "value for money",
    "food and restaurant", "noise level", "comfort",
]

# A review is tagged with a topic when its cosine similarity to the topic
# embedding is strictly greater than this value.
THRESHOLD = 0.4
# Number of reviews encoded per model call.
BATCH_SIZE = 256

# Load the sentence-embedding model from the local checkpoint directory and
# precompute the topic embeddings so they are not recomputed for every batch.
MODEL_DIR = "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/NLP_model/model/"
model = SentenceTransformer(MODEL_DIR)
topic_embeddings = model.encode(topics, convert_to_tensor=True)


def process_and_save_topics(input_csv_path, output_csv_path):
    """Tag each review in a CSV with binary topic flags and save the result.

    For every entry in the module-level ``topics`` list a ``has_<topic>``
    column (spaces replaced by underscores) is ensured to exist. Rows whose
    ``text`` is non-empty after stripping whitespace are embedded with
    ``model`` in batches of ``BATCH_SIZE``; a flag is set to 1 when the cosine
    similarity between the review and the topic embedding is strictly greater
    than ``THRESHOLD``, otherwise 0. Rows with missing or blank text keep
    whatever value the column already had (0 for newly created columns).

    Args:
        input_csv_path: Path of the CSV to read; it must contain a ``text``
            column.
        output_csv_path: Path the augmented CSV is written to (without the
            index).

    Returns:
        None. The result is written to ``output_csv_path``.

    Raises:
        KeyError: If the input CSV has no ``text`` column.
    """
    print(f"\nProcessing: {input_csv_path}")
    df = pd.read_csv(input_csv_path)

    # Derive the output column names once so creation and assignment agree.
    topic_cols = ["has_" + topic.replace(" ", "_") for topic in topics]
    for col_name in topic_cols:
        if col_name not in df.columns:
            df[col_name] = 0

    df["text"] = df["text"].fillna("")
    # Boolean mask of rows that actually carry text; blank rows are never
    # sent to the model.
    has_text = df["text"].str.strip() != ""
    texts = df.loc[has_text, "text"].tolist()

    all_preds = []
    for start_idx in tqdm(range(0, len(texts), BATCH_SIZE)):
        batch_texts = texts[start_idx:start_idx + BATCH_SIZE]
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
        # Similarity matrix: one row per review in the batch, one column per topic.
        cosine_scores = util.cos_sim(batch_embeddings, topic_embeddings)
        all_preds.append((cosine_scores > THRESHOLD).cpu().numpy())

    # np.vstack raises on an empty list, so only assign when at least one
    # batch was processed (i.e. the file had some non-blank text).
    if all_preds:
        # Single masked assignment instead of a Python loop over every
        # (row, topic) cell; rows of the stacked array are in the same order
        # as the rows selected by ``has_text``.
        df.loc[has_text, topic_cols] = np.vstack(all_preds).astype(int)

    df.to_csv(output_csv_path, index=False)


In [None]:

# Directory holding the processed splits. Defined once so that relocating the
# data means changing a single line instead of six copies of the same path.
PROCESSED_DIR = "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/data/processed"

# Map each split name to its input CSV and the topic-annotated output CSV
# written next to it. Insertion order (Train, Validation, Test) is the
# processing order.
file_paths = {
    split: {
        "input": f"{PROCESSED_DIR}/{split}.csv",
        "output": f"{PROCESSED_DIR}/{split}_with_topics.csv",
    }
    for split in ("Train", "Validation", "Test")
}

# Annotate every split with the topic flags.
for split_name, paths in file_paths.items():
    process_and_save_topics(paths["input"], paths["output"])
