In [45]:
import os
import re
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import logging
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

In [46]:
# Choose the compute device: CUDA when torch reports a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tokenizer and sequence-classification model are loaded from the same checkpoint.
absa_tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
absa_model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")

# Kept as the cell's last expression: .to(device) returns the model, so the
# notebook displays its module structure.
absa_model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [47]:
def process_batch(batch_reviews, aspect):
    """Return the model's class probabilities for `aspect` on each review.

    Every review is encoded together with the aspect as a sentence pair
    (review first, aspect second), following the usage shown on the
    checkpoint's model card, and then run through `absa_model` without
    gradient tracking.

    Args:
        batch_reviews: iterable of reviews; each item is converted with str().
        aspect: aspect term to evaluate, e.g. 'room'.

    Returns:
        numpy array with one row per review, holding the softmax of the
        model's logits over the last axis. An empty batch yields an array of
        shape (0, absa_model.config.num_labels) without calling the model.
    """
    # NOTE(review): process_batch is defined again in later cells of this
    # notebook; the definition executed last is the one in effect.
    reviews = [str(review) for review in batch_reviews]
    if not reviews:
        # Nothing to encode: skip the tokenizer and the forward pass.
        return np.empty((0, absa_model.config.num_labels), dtype=np.float32)

    # The aspect goes in as the second segment of each pair; with
    # truncation=True the pair is shortened from its longer member, so the
    # short aspect text is the part that survives for long reviews.
    inputs = absa_tokenizer(
        reviews,
        [aspect] * len(reviews),
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        outputs = absa_model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)
    return probs.cpu().numpy()

In [48]:
# Keywords whose presence in a review marks the aspect as "mentioned".
# Matching is by substring (see is_aspect_mentioned), so a short keyword such
# as 'far' or 'bus' also matches inside longer words.
aspect_keywords = {
    'cleanliness': ['clean', 'dirty', 'smell', 'stink', 'stunk', 'filthy'],
    'room': ['room', 'bed', 'suite', 'large'],
    'service': ['service', 'staff', 'help', 'support'],
    'location': ['location', 'close', 'area', 'far'],
    'value': ['value', 'worth', 'price', 'cost'],
    'safety': ['safe', 'safety', 'secure', 'danger', 'dangerous', 'security'],
    'comfort': ['comfort', 'comfortable', 'uncomfortable'],
    'transportation': ['bus', 'metro', 'station', 'close', 'walk', 'transport', 'transportation'],
    'noise': ['sound', 'volume', 'noisy', 'noise', 'silent']
}

# Multipliers applied to the class probabilities to obtain a single score.
weights = {'negative': -1, 'neutral': 0, 'positive': 1}

def is_aspect_mentioned(review, aspect):
    """Tell whether `review` contains any keyword registered for `aspect`.

    Args:
        review: review text. Anything that is not a str (for example the
            float NaN that pandas.read_csv produces for an empty cell, or
            None) is treated as not mentioning the aspect.
        aspect: key of `aspect_keywords`.

    Returns:
        True if at least one keyword occurs as a substring of the
        lower-cased review, otherwise False.

    Raises:
        KeyError: if `aspect` is not a key of `aspect_keywords`.
    """
    # Look the aspect up first so an unknown aspect always raises KeyError.
    keywords = aspect_keywords[aspect]
    if not isinstance(review, str):
        # `keyword in review` would raise TypeError on a float or None.
        return False
    # All keywords are lower-case, so compare against a lower-cased review.
    text = review.lower()
    return any(keyword in text for keyword in keywords)

In [49]:
def process_batch(batch_reviews, aspect):
    """Return the model's class probabilities for `aspect` on each review.

    Every review is encoded together with the aspect as a sentence pair
    (review first, aspect second), following the usage shown on the
    checkpoint's model card, and then run through `absa_model` without
    gradient tracking.

    Args:
        batch_reviews: iterable of reviews; each item is converted with str().
        aspect: aspect term to evaluate, e.g. 'room'.

    Returns:
        numpy array with one row per review, holding the softmax of the
        model's logits over the last axis. An empty batch yields an array of
        shape (0, absa_model.config.num_labels) without calling the model.
    """
    # NOTE(review): process_batch is also defined in other cells of this
    # notebook; the definition executed last is the one in effect.
    reviews = [str(review) for review in batch_reviews]
    if not reviews:
        # Nothing to encode: skip the tokenizer and the forward pass.
        return np.empty((0, absa_model.config.num_labels), dtype=np.float32)

    # The aspect goes in as the second segment of each pair; with
    # truncation=True the pair is shortened from its longer member, so the
    # short aspect text is the part that survives for long reviews.
    inputs = absa_tokenizer(
        reviews,
        [aspect] * len(reviews),
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        outputs = absa_model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)
    return probs.cpu().numpy()

def is_aspect_mentioned(review, aspect):
    """Tell whether `review` contains any keyword registered for `aspect`.

    Args:
        review: review text. Anything that is not a str (for example the
            float NaN that pandas.read_csv produces for an empty cell, or
            None) is treated as not mentioning the aspect.
        aspect: key of `aspect_keywords`.

    Returns:
        True if at least one keyword occurs as a substring of the
        lower-cased review, otherwise False. Because this is substring
        matching, a keyword like 'far' also matches inside 'farm'.

    Raises:
        KeyError: if `aspect` is not a key of `aspect_keywords`.
    """
    # Look the aspect up first so an unknown aspect always raises KeyError.
    keywords = aspect_keywords[aspect]
    if not isinstance(review, str):
        # `keyword in review` would raise TypeError on a float or None.
        return False
    # The keyword lists are lower-case, so compare against a lower-cased review.
    text = review.lower()
    return any(keyword in text for keyword in keywords)

In [50]:
def process_reviews_for_aspect(df_chunk, aspect, batch_size=16):
    """Score every review in `df_chunk` for one aspect and store the result.

    Reviews are read from the 'processed_review' column in batches of
    `batch_size`. Within a batch only the reviews for which
    is_aspect_mentioned() is true are sent to process_batch(); their score is
    the sum of `weights` multiplied by the class probabilities, taken in the
    order negative, neutral, positive. Reviews that do not mention the aspect
    get 0.

    Args:
        df_chunk: DataFrame with a 'processed_review' column. It is modified
            in place: the column f'{aspect}_score' is added or overwritten.
        aspect: key of `aspect_keywords`; also passed to process_batch().
        batch_size: number of reviews per batch; must be at least 1.

    Returns:
        The same `df_chunk` object, with the f'{aspect}_score' column set.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    chunk_len = len(df_chunk)
    # Ceiling division, so a chunk whose length is an exact multiple of
    # batch_size is not reported as having one extra batch.
    total_batches = -(-chunk_len // batch_size)
    # NOTE(review): this order is assumed to match the column order of the
    # model's logits - confirm against absa_model.config.id2label.
    labels = ['negative', 'neutral', 'positive']
    reviews = df_chunk['processed_review']

    logging.info(f"Processing {aspect} for chunk with size {chunk_len}")
    scores = []
    for start in range(0, chunk_len, batch_size):
        batch_number = start // batch_size + 1
        # .iloc keeps the batching positional whatever index the chunk carries.
        batch_reviews = reviews.iloc[start:start + batch_size]
        logging.info(f"Processing batch {batch_number}/{total_batches} for aspect: {aspect}")
        aspect_mentioned = [is_aspect_mentioned(review, aspect) for review in batch_reviews]

        logging.info(f"Aspect mentioned in batch: {aspect_mentioned.count(True)}")
        logging.info(f"Aspect not mentioned in batch: {aspect_mentioned.count(False)}")

        if not any(aspect_mentioned):
            logging.info(f"Aspect {aspect} not mentioned in batch {batch_number}")
            scores.extend([0] * len(batch_reviews))
            continue

        logging.info(f"Aspect {aspect} mentioned in batch {batch_number}")
        filtered_reviews = [review for review, mentioned in zip(batch_reviews, aspect_mentioned) if mentioned]
        probs = process_batch(filtered_reviews, aspect)
        # One weighted score per row of `probs`, i.e. per mentioned review.
        batch_scores = iter([
            sum(weights[label] * row[k] for k, label in enumerate(labels))
            for row in probs
        ])
        # Mentioned reviews consume the model scores in order; the rest get 0,
        # so the output lines up with the batch row by row.
        scores.extend(next(batch_scores) if mentioned else 0 for mentioned in aspect_mentioned)

    logging.info(f"Total processed scores length: {len(scores)}, chunk length: {chunk_len}")

    df_chunk[f'{aspect}_score'] = scores
    logging.info(f"Finished processing {aspect} for chunk")
    return df_chunk

In [51]:
def parallel_process(df, aspects, batch_size=16):
    """Score `df` for every aspect using a pool of worker processes.

    The rows are split into up to cpu_count() chunks and one
    process_reviews_for_aspect task is submitted per (chunk, aspect) pair.
    For each chunk, the f'{aspect}_score' column of every task result is
    copied back onto that chunk, so the returned frame has exactly one row
    per input row with all aspect columns filled. Concatenating the raw task
    results instead would repeat every row once per aspect, each copy
    carrying only one computed score.

    Args:
        df: DataFrame holding the reviews; it is not modified.
        aspects: iterable of aspect names to score.
        batch_size: forwarded to process_reviews_for_aspect.

    Returns:
        A new DataFrame with the rows of `df` in their original order and one
        f'{aspect}_score' column per aspect.
    """
    # NOTE(review): the workers call process_batch, which uses the
    # module-level model and tokenizer. Whether that works in child processes
    # depends on the start method and device (CUDA with fork is a known
    # problem) - verify on the target machine.
    worker_count = cpu_count()
    logging.info(f"Starting parallel processing with {worker_count} CPUs")

    aspect_list = list(aspects)
    # Split row positions instead of the DataFrame itself: same chunk sizes as
    # np.array_split(df, n), without going through numpy's DataFrame handling
    # (which warns in recent pandas releases). Empty chunks are dropped.
    row_positions = np.array_split(np.arange(len(df)), worker_count)
    chunks = [df.iloc[positions].copy() for positions in row_positions if len(positions) > 0]

    if not chunks or not aspect_list:
        # Nothing to score; also avoids pd.concat() failing on an empty list.
        logging.info("Finished parallel processing")
        return df.copy()

    # The context manager terminates the pool even if submission fails.
    with Pool(worker_count) as pool:
        submitted = []
        for chunk in chunks:
            tasks = []
            for aspect in aspect_list:
                task = pool.apply_async(process_reviews_for_aspect, args=(chunk, aspect, batch_size))
                tasks.append((aspect, task))
            submitted.append((chunk, tasks))
        pool.close()
        pool.join()

        for chunk, tasks in submitted:
            for aspect, task in tasks:
                column = f'{aspect}_score'
                # .get() re-raises any exception raised inside the worker.
                # to_numpy() assigns by position, independent of index labels.
                chunk[column] = task.get()[column].to_numpy()

    combined_results = pd.concat(chunks)
    logging.info("Finished parallel processing")
    return combined_results

In [None]:
import os
import re
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import logging
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

In [None]:
# Select the device up front: the GPU when torch can see one, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the tokenizer and the classification model from the same checkpoint.
absa_tokenizer = AutoTokenizer.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")
absa_model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1")

# Move the model to the selected device; as the last expression of the cell
# the returned module is what the notebook displays.
absa_model.to(device)

In [None]:
# Aspect name -> keywords that mark a review as talking about that aspect.
aspect_keywords = {
    "cleanliness": ["clean", "dirty", "smell", "stink", "stunk", "filthy"],
    "room": ["room", "bed", "suite", "large"],
    "service": ["service", "staff", "help", "support"],
    "location": ["location", "close", "area", "far"],
    "value": ["value", "worth", "price", "cost"],
    "safety": ["safe", "safety", "secure", "danger", "dangerous", "security"],
    "comfort": ["comfort", "comfortable", "uncomfortable"],
    "transportation": [
        "bus", "metro", "station", "close", "walk", "transport", "transportation",
    ],
    "noise": ["sound", "volume", "noisy", "noise", "silent"],
}

# Sentiment label -> multiplier used when folding class probabilities into one score.
weights = {
    "negative": -1,
    "neutral": 0,
    "positive": 1,
}

In [None]:
def process_batch(batch_reviews, aspect):
    """Return the model's class probabilities for `aspect` on each review.

    Every review is encoded together with the aspect as a sentence pair
    (review first, aspect second), following the usage shown on the
    checkpoint's model card, and then run through `absa_model` without
    gradient tracking.

    Args:
        batch_reviews: iterable of reviews; each item is converted with str().
        aspect: aspect term to evaluate, e.g. 'room'.

    Returns:
        numpy array with one row per review, holding the softmax of the
        model's logits over the last axis. An empty batch yields an array of
        shape (0, absa_model.config.num_labels) without calling the model.
    """
    # NOTE(review): process_batch is also defined in earlier cells of this
    # notebook; the definition executed last is the one in effect.
    reviews = [str(review) for review in batch_reviews]
    if not reviews:
        # Nothing to encode: skip the tokenizer and the forward pass.
        return np.empty((0, absa_model.config.num_labels), dtype=np.float32)

    # The aspect goes in as the second segment of each pair; with
    # truncation=True the pair is shortened from its longer member, so the
    # short aspect text is the part that survives for long reviews.
    inputs = absa_tokenizer(
        reviews,
        [aspect] * len(reviews),
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        outputs = absa_model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)
    return probs.cpu().numpy()

def is_aspect_mentioned(review, aspect):
    """Tell whether `review` contains any keyword registered for `aspect`.

    Args:
        review: review text. Anything that is not a str (for example the
            float NaN that pandas.read_csv produces for an empty cell, or
            None) is treated as not mentioning the aspect.
        aspect: key of `aspect_keywords`.

    Returns:
        True if at least one keyword occurs as a substring of the
        lower-cased review, otherwise False. Because this is substring
        matching, a keyword like 'far' also matches inside 'farm'.

    Raises:
        KeyError: if `aspect` is not a key of `aspect_keywords`.
    """
    # Look the aspect up first so an unknown aspect always raises KeyError.
    keywords = aspect_keywords[aspect]
    if not isinstance(review, str):
        # `keyword in review` would raise TypeError on a float or None.
        return False
    # The keyword lists are lower-case, so compare against a lower-cased review.
    text = review.lower()
    return any(keyword in text for keyword in keywords)

def process_reviews_for_aspect(df_chunk, aspect, batch_size=16):
    """Score every review in `df_chunk` for one aspect and store the result.

    Reviews are read from the 'processed_review' column in batches of
    `batch_size`. Within a batch only the reviews for which
    is_aspect_mentioned() is true are sent to process_batch(); their score is
    the sum of `weights` multiplied by the class probabilities, taken in the
    order negative, neutral, positive. Reviews that do not mention the aspect
    get 0.

    Args:
        df_chunk: DataFrame with a 'processed_review' column. It is modified
            in place: the column f'{aspect}_score' is added or overwritten.
        aspect: key of `aspect_keywords`; also passed to process_batch().
        batch_size: number of reviews per batch; must be at least 1.

    Returns:
        The same `df_chunk` object, with the f'{aspect}_score' column set.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    chunk_len = len(df_chunk)
    # Ceiling division, so a chunk whose length is an exact multiple of
    # batch_size is not reported as having one extra batch.
    total_batches = -(-chunk_len // batch_size)
    # NOTE(review): this order is assumed to match the column order of the
    # model's logits - confirm against absa_model.config.id2label.
    labels = ['negative', 'neutral', 'positive']
    reviews = df_chunk['processed_review']

    logging.info(f"Processing {aspect} for chunk with size {chunk_len}")
    scores = []
    for start in range(0, chunk_len, batch_size):
        batch_number = start // batch_size + 1
        # .iloc keeps the batching positional whatever index the chunk carries.
        batch_reviews = reviews.iloc[start:start + batch_size]
        logging.info(f"Processing batch {batch_number}/{total_batches} for aspect: {aspect}")
        aspect_mentioned = [is_aspect_mentioned(review, aspect) for review in batch_reviews]

        logging.info(f"Aspect mentioned in batch: {aspect_mentioned.count(True)}")
        logging.info(f"Aspect not mentioned in batch: {aspect_mentioned.count(False)}")

        if not any(aspect_mentioned):
            logging.info(f"Aspect {aspect} not mentioned in batch {batch_number}")
            scores.extend([0] * len(batch_reviews))
            continue

        logging.info(f"Aspect {aspect} mentioned in batch {batch_number}")
        filtered_reviews = [review for review, mentioned in zip(batch_reviews, aspect_mentioned) if mentioned]
        probs = process_batch(filtered_reviews, aspect)
        # One weighted score per row of `probs`, i.e. per mentioned review.
        batch_scores = iter([
            sum(weights[label] * row[k] for k, label in enumerate(labels))
            for row in probs
        ])
        # Mentioned reviews consume the model scores in order; the rest get 0,
        # so the output lines up with the batch row by row.
        scores.extend(next(batch_scores) if mentioned else 0 for mentioned in aspect_mentioned)

    logging.info(f"Total processed scores length: {len(scores)}, chunk length: {chunk_len}")

    df_chunk[f'{aspect}_score'] = scores
    logging.info(f"Finished processing {aspect} for chunk")
    return df_chunk

def parallel_process(df, aspects, batch_size=16):
    """Score `df` for every aspect using a pool of worker processes.

    The rows are split into up to cpu_count() chunks and one
    process_reviews_for_aspect task is submitted per (chunk, aspect) pair.
    For each chunk, the f'{aspect}_score' column of every task result is
    copied back onto that chunk, so the returned frame has exactly one row
    per input row with all aspect columns filled. Concatenating the raw task
    results instead would repeat every row once per aspect, each copy
    carrying only one computed score.

    Args:
        df: DataFrame holding the reviews; it is not modified.
        aspects: iterable of aspect names to score.
        batch_size: forwarded to process_reviews_for_aspect.

    Returns:
        A new DataFrame with the rows of `df` in their original order and one
        f'{aspect}_score' column per aspect.
    """
    # NOTE(review): the workers call process_batch, which uses the
    # module-level model and tokenizer. Whether that works in child processes
    # depends on the start method and device (CUDA with fork is a known
    # problem) - verify on the target machine.
    worker_count = cpu_count()
    logging.info(f"Starting parallel processing with {worker_count} CPUs")

    aspect_list = list(aspects)
    # Split row positions instead of the DataFrame itself: same chunk sizes as
    # np.array_split(df, n), without going through numpy's DataFrame handling
    # (which warns in recent pandas releases). Empty chunks are dropped.
    row_positions = np.array_split(np.arange(len(df)), worker_count)
    chunks = [df.iloc[positions].copy() for positions in row_positions if len(positions) > 0]

    if not chunks or not aspect_list:
        # Nothing to score; also avoids pd.concat() failing on an empty list.
        logging.info("Finished parallel processing")
        return df.copy()

    # The context manager terminates the pool even if submission fails.
    with Pool(worker_count) as pool:
        submitted = []
        for chunk in chunks:
            tasks = []
            for aspect in aspect_list:
                logging.info(f"Submitting task for chunk size {len(chunk)} and aspect {aspect}")
                task = pool.apply_async(process_reviews_for_aspect, args=(chunk, aspect, batch_size))
                tasks.append((aspect, task))
            submitted.append((chunk, tasks))
        pool.close()
        pool.join()
        logging.info("All tasks completed, combining results")

        for chunk, tasks in submitted:
            for aspect, task in tasks:
                column = f'{aspect}_score'
                # .get() re-raises any exception raised inside the worker.
                # to_numpy() assigns by position, independent of index labels.
                chunk[column] = task.get()[column].to_numpy()

    combined_results = pd.concat(chunks)
    logging.info("Finished parallel processing")
    return combined_results

In [1]:
if __name__ == "__main__":
    # Driver: load the reviews, score them per aspect, then write the
    # per-review scores and the per-hotel averages under csv3/.
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
    logging.info("Loading data")

    review_df = pd.read_csv('csv3/processed_reviews2.csv')

    aspects = [
        'cleanliness', 'room', 'service', 'location', 'value',
        'safety', 'comfort', 'transportation', 'noise',
    ]

    # Every aspect column starts at 0 before any scoring happens.
    for aspect in aspects:
        review_df[f'{aspect}_score'] = 0

    logging.info("Starting parallel processing of reviews")
    review_df = parallel_process(review_df, aspects)

    # Per-review scores.
    logging.info("Saving overall sentiment scores")
    review_df.to_csv('csv3/overall_sentiment_scores.csv', index=False)

    # Per-hotel averages: one 'mean' aggregation per aspect score column,
    # grouped by hotel name and city.
    logging.info("Aggregating scores by hotel")
    mean_per_score_column = {f'{name}_score': 'mean' for name in aspects}
    aggregated_scores = (
        review_df
        .groupby(['hotel_name', 'hotel_city'])
        .agg(mean_per_score_column)
        .reset_index()
    )

    logging.info(f"Aggregated scores: {aggregated_scores.head()}")

    logging.info("Saving aggregated hotel scores")
    aggregated_scores.to_csv('csv3/aggregated_hotel_scores.csv', index=False)

    logging.info("Displaying aggregated hotel scores")
    print(aggregated_scores.head())


  from .autonotebook import tqdm as notebook_tqdm
2024-05-15 16:58:20,597 - INFO - Loading data
2024-05-15 16:58:20,604 - INFO - Starting parallel processing of reviews
2024-05-15 16:58:20,604 - INFO - Starting parallel processing with 12 CPUs
  return bound(*args, **kwds)
2024-05-15 16:58:20,699 - INFO - Submitting task for chunk size 8 and aspect cleanliness
2024-05-15 16:58:20,700 - INFO - Submitting task for chunk size 8 and aspect room
2024-05-15 16:58:20,702 - INFO - Submitting task for chunk size 8 and aspect service
2024-05-15 16:58:20,703 - INFO - Submitting task for chunk size 8 and aspect location
2024-05-15 16:58:20,704 - INFO - Submitting task for chunk size 8 and aspect value
2024-05-15 16:58:20,705 - INFO - Submitting task for chunk size 8 and aspect safety
2024-05-15 16:58:20,706 - INFO - Submitting task for chunk size 8 and aspect comfort
2024-05-15 16:58:20,707 - INFO - Submitting task for chunk size 8 and aspect transportation
2024-05-15 16:58:20,708 - INFO - Submitt