In [None]:
import pandas as pd

# Load only the review text column and drop rows where it is missing.
df = pd.read_csv("/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/data/processed/Train.csv", usecols=["text"])
df = df.dropna()
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population and replace is False.
df = df.sample(n=min(1000000, len(df)), random_state=42)
# Normalise case before counting words.
df["text"] = df["text"].str.lower()

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk


# Make sure the NLTK English stop-word corpus is available.
nltk.download("stopwords")
stop_words = list(stopwords.words("english"))

# Extract top 100 meaningful words: stop words are excluded and the vocabulary
# is limited to 100 entries via max_features.
vectorizer = CountVectorizer(stop_words=stop_words, max_features=100)
X = vectorizer.fit_transform(df["text"])

# Get the top keywords
top_words = vectorizer.get_feature_names_out()


In [15]:
!pip3 install -q sentence-transformers transformers scikit-multilearn

In [None]:
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import numpy as np

# Tunables for topic tagging: a review is flagged for a topic when its cosine
# similarity to the topic phrase exceeds THRESHOLD; reviews are encoded
# BATCH_SIZE at a time.
BATCH_SIZE = 256
THRESHOLD = 0.4

# Topic phrases matched against each review. Each phrase also names an output
# column ("has_" + phrase with spaces replaced by underscores).
topics = [
    "room quality", "location", "staff service",
    "cleanliness", "amenities", "value for money",
    "food and restaurant", "noise level", "comfort",
]

# Load the sentence-embedding model from its local directory and embed the
# topic phrases once so they can be reused for every batch of reviews.
model = SentenceTransformer(
    "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/models/NLP/Topic-modelling/"
)
topic_embeddings = model.encode(topics, convert_to_tensor=True)


def process_and_save_topics(input_csv_path, output_csv_path, threshold=None, batch_size=None):
    """Tag each review in a CSV with binary topic flags and write the result.

    Reads ``input_csv_path``, embeds the ``text`` column with the module-level
    ``model`` and compares every review against the precomputed
    ``topic_embeddings``. For each entry of ``topics`` a column named
    ``has_<topic with underscores>`` is set to 1 when the cosine similarity is
    strictly greater than the threshold, otherwise 0. The full frame is then
    written to ``output_csv_path`` without the index.

    Args:
        input_csv_path: CSV file to read; must contain a ``text`` column.
        output_csv_path: Destination CSV path.
        threshold: Similarity cut-off. Defaults to the module-level ``THRESHOLD``.
        batch_size: Number of reviews encoded per call. Defaults to the
            module-level ``BATCH_SIZE``.

    Notes:
        Rows whose text is missing or blank are not encoded. Their topic
        columns keep the value they already had (0 for columns created here).

    Raises:
        KeyError: If the input file has no ``text`` column.
    """
    # Resolve defaults at call time so the module-level constants stay the
    # single source of truth unless the caller overrides them.
    threshold = THRESHOLD if threshold is None else threshold
    batch_size = BATCH_SIZE if batch_size is None else batch_size

    print(f"\nProcessing: {input_csv_path}")
    df = pd.read_csv(input_csv_path)

    # Add topic columns (existing ones are left untouched).
    topic_cols = ["has_" + topic.replace(" ", "_") for topic in topics]
    for col_name in topic_cols:
        if col_name not in df.columns:
            df[col_name] = 0

    # Coerce to str after filling NaN so non-string cells cannot reach the encoder.
    df["text"] = df["text"].fillna("").astype(str)
    valid_mask = df["text"].str.strip() != ""
    texts = df.loc[valid_mask, "text"].tolist()

    # np.vstack raises on an empty list, so skip encoding when nothing is valid.
    if texts:
        all_preds = []
        for start_idx in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[start_idx:start_idx + batch_size]
            batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
            cosine_scores = util.cos_sim(batch_embeddings, topic_embeddings)
            all_preds.append((cosine_scores > threshold).cpu().numpy())

        # One vectorised assignment: rows follow valid_mask order (the same
        # order as `texts`), columns follow `topics` order.
        df.loc[valid_mask, topic_cols] = np.vstack(all_preds).astype(int)

    df.to_csv(output_csv_path, index=False)


In [None]:

# Single root for the processed splits; change this one line to relocate the data.
PROCESSED_DIR = "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/data/processed"

# Input and output CSV per split: <split>.csv -> <split>_with_topics.csv
file_paths = {
    split: {
        "input": f"{PROCESSED_DIR}/{split}.csv",
        "output": f"{PROCESSED_DIR}/{split}_with_topics.csv",
    }
    for split in ("Train", "Validation", "Test")
}

# Tag every split with topic flags and write the augmented file.
for split_name, paths in file_paths.items():
    process_and_save_topics(paths["input"], paths["output"])


In [None]:

import torch
from transformers import pipeline
import os

# Set paths
# NOTE(review): the inputs are the *_small.csv files, while the outputs are named
# *_with_topics_sentiment.csv -- confirm the *_small files already carry the topic columns.
paths = {
    "Train": "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/data/processed/Train_small.csv",
    "Validation": "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/data/processed/Validation_small.csv",
    "Test": "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/data/processed/Test_small.csv"
}

# The Test entry previously contained an extra "data/" segment
# (".../Hotel_Recommendation_System/data/scripts/...") that no other path has;
# it now points at the same processed directory as Train and Validation.
save_paths = {
    "Train": "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/data/processed/Train_with_topics_sentiment.csv",
    "Validation": "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/data/processed/Validation_with_topics_sentiment.csv",
    "Test": "/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/data/processed/Test_with_topics_sentiment.csv"
}

# Load Sentiment Pipeline from the local model directory; run on GPU 0 when
# CUDA is available, otherwise on CPU (-1).
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="/home/018171153/Hotel_Recommendation_System/Hotel_Recommendation_System/scripts/src/models/NLP/Sentiment-Analysis/",
    device=0 if torch.cuda.is_available() else -1
)

# Number of rows scored and flushed to disk per iteration.
# NOTE: this rebinds the notebook-wide BATCH_SIZE (256 in the topic-tagging cell),
# so anything reading that global after this cell runs will see 1024.
BATCH_SIZE = 1024

# Function to compute sentiment with saving batches
def compute_sentiment_scores_in_batches(df, save_path, batch_size=None):
    """Score each row's ``text`` for sentiment and stream the result to a CSV.

    The frame is processed ``batch_size`` rows at a time. Each batch gets a
    ``sentiment_score`` column and is appended to ``save_path`` immediately, so
    completed batches are already on disk if a later batch fails. Any existing
    file at ``save_path`` is removed first.

    The score is the pipeline's ``score`` when the predicted label is
    ``"POSITIVE"`` and ``1 - score`` otherwise.
    NOTE(review): this assumes the loaded model emits the literal label
    "POSITIVE" -- confirm against the model's label mapping.

    Args:
        df: DataFrame with a ``text`` column. It is not modified; missing text
            is replaced by "" and values are cast to ``str`` on a per-batch copy.
        save_path: Destination CSV path.
        batch_size: Rows per batch. Defaults to the module-level ``BATCH_SIZE``.

    Returns:
        None. The output is the file written at ``save_path``.
    """
    batch_size = BATCH_SIZE if batch_size is None else batch_size

    # Start from a clean file because batches are written in append mode.
    if os.path.exists(save_path):
        os.remove(save_path)

    # With no rows the loop below would never write anything; emit a
    # header-only file so the output always exists.
    if len(df) == 0:
        df.assign(sentiment_score=pd.Series(dtype=float)).to_csv(save_path, index=False)
        return

    for start_idx in tqdm(range(0, len(df), batch_size)):
        # Work on a copy of the slice so the caller's frame is left untouched.
        batch_df = df.iloc[start_idx:start_idx + batch_size].copy()
        batch_df["text"] = batch_df["text"].fillna("").astype(str)
        texts = batch_df["text"].tolist()

        preds = sentiment_pipeline(texts, truncation=True)

        batch_df["sentiment_score"] = [
            pred["score"] if pred["label"] == "POSITIVE" else (1 - pred["score"])
            for pred in preds
        ]

        # Only the first batch carries the header row.
        batch_df.to_csv(save_path, mode='a', header=(start_idx == 0), index=False)

# Score every split in turn: read its CSV and stream the sentiment-scored
# rows to the matching entry in save_paths.
for set_name in paths:
    path = paths[set_name]
    print(f"\nProcessing {set_name} set...")
    df = pd.read_csv(path)
    compute_sentiment_scores_in_batches(df, save_path=save_paths[set_name])


