Link to the Hugging Face model: https://huggingface.co/lihuicham/airbnb-reviews-helpfulness-classifier-roberta-base

Code from `finetuning.ipynb` notebook

Team Members (S001 - Synthetic Expert Team E) :

Li Hui Cham, Isaac Sparrow,  Christopher Arraya, Nicholas Wong, Lei Zhang, Leonard Yang

# Scraping

### WARNING: These cells will error out unless you first set the environment variables and provide the initial datasets

In [None]:
%pip install airbnb python-dotenv pandas numpy

Collecting airbnb
  Downloading airbnb-2.3.2.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Building wheels for collected packages: airbnb
  Building wheel for airbnb (setup.py) ... [?25l[?25hdone
  Created wheel for airbnb: filename=airbnb-2.3.2-py3-none-any.whl size=6597 sha256=e02906dbdb5e378d2d4c11e42489bffa01652fac2443975156c5bf9a45a6ab48
  Stored in directory: /root/.cache/pip/wheels/e2/58/ac/a7aedd3db2d6c106d8a9f8a9c6caaeae181b46d3925710d34c
Successfully built airbnb
Installing collected packages: python-dotenv, airbnb
Successfully installed airbnb-2.3.2 python-dotenv-1.0.1


In [None]:
# mount to wherever your data is stored

In [None]:
import airbnb
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import requests
import time

### Set Environment Variables

In [None]:
# Supply the Airbnb access token: either paste it after "=" on the %env line, or
# uncomment load_dotenv() to load it from a dotenv file instead. The value is read
# later through os.getenv("AIRBNB_ACCESS_TOKEN") when the API client is created.
# Do not commit a notebook with a real token filled in.
# load_dotenv()
%env AIRBNB_ACCESS_TOKEN=

env: AIRBNB_ACCESS_TOKEN=


### Load Data

In [None]:
listings = pd.read_csv("Listings.csv", encoding='latin1') # Kaggle dataset

# Load the progress tracker from a previous (possibly interrupted) run, or start
# a fresh one. Listings that already appear in the tracker are dropped from
# `listings` so they are not fetched again.
try:
    progress_tracker = pd.read_csv("Progress_Tracker2.csv")
    # A tracker without 'listing_id' cannot be used to resume; treat it as missing.
    if 'listing_id' not in progress_tracker.columns:
        raise KeyError("Column 'listing_id' not found in Progress_Tracker2.csv")
    processed_listings_count = len(progress_tracker['listing_id'].unique())  # Count of unique processed listings
    listings = listings[~listings['listing_id'].isin(progress_tracker['listing_id'])]
    # Seed the running counter with the listings already done so counting continues from there
    global_num = processed_listings_count
except (FileNotFoundError, KeyError, pd.errors.EmptyDataError):
    # No usable tracker: Progress_Tracker2.csv is missing, has no 'listing_id'
    # column, or is an empty/blank file (read_csv raises EmptyDataError for that).
    progress_tracker = pd.DataFrame(columns=['listing_id', 'author_id', 'author_first_name', 'comments', 'created_at', 'rating'])
    global_num = 0
    processed_listings_count = 0

### Fetch Reviews from Listing

In [None]:
api = airbnb.Api(os.getenv("AIRBNB_ACCESS_TOKEN"))

# Rows fetched since the last checkpoint. They are buffered so that a checkpoint
# persists every listing in the batch, not only the listing that triggered the save.
_pending_reviews = []


def get_reviews_and_log(x):
    """Fetch the reviews of one listing, checkpoint progress, and return the rows.

    Args:
        x: Listing ID passed to ``api.get_reviews``.

    Returns:
        A list of dicts with the keys ``listing_id``, ``author_id``,
        ``author_first_name``, ``comments``, ``created_at`` and ``rating``.
        When the request fails with an HTTP error, the list holds a single
        placeholder row whose ``comments`` is ``'Error fetching reviews'``.

    Side effects:
        Increments the module-level ``global_num``. On every 10th listing the
        buffered rows are appended to ``Progress_Tracker2.csv`` and the batch
        timing state (``start_time``, ``batch_times``) is updated.

    Note:
        Returned rows are also written to the checkpoint file. A caller that
        persists them to the same file again should overwrite or de-duplicate
        rather than blindly append.
    """
    global global_num, start_time

    global_num += 1
    # global_num is seeded with the number of already-processed listings when the
    # progress tracker is loaded, so it already is the overall position; adding
    # processed_listings_count again would double-count on a resumed run.
    total_processed = global_num
    total_listings = len(listings) + processed_listings_count
    print(f"Fetching reviews for listing ID: {x} ({total_processed}/{total_listings})")

    try:
        reviews = api.get_reviews(x)
        # The response is already a dictionary, so no JSON decoding is needed
        reviews_extracted = [
            {
                'listing_id': x,
                'author_id': review['author_id'],
                'author_first_name': review['author']['first_name'],
                'comments': review['comments'],
                'created_at': review['created_at'],
                'rating': review['rating']
            }
            for review in reviews['reviews']
        ]
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred while fetching reviews for listing ID: {x} - {http_err}")
        reviews_extracted = [{'listing_id': x, 'author_id': None, 'author_first_name': None, 'comments': 'Error fetching reviews', 'created_at': None, 'rating': None}]

    # Buffer placeholder rows too, so failed listings are checkpointed like the rest
    _pending_reviews.extend(reviews_extracted)

    # Save progress intermittently
    if total_processed % 10 == 0:  # Save every 10 listings
        if _pending_reviews:
            tracker_path = "Progress_Tracker2.csv"
            # Write the header only when the file is new or empty, so resumed
            # runs never inject a second header row into the middle of the CSV.
            needs_header = not os.path.exists(tracker_path) or os.path.getsize(tracker_path) == 0
            pd.DataFrame(
                _pending_reviews,
                columns=['listing_id', 'author_id', 'author_first_name', 'comments', 'created_at', 'rating'],
            ).to_csv(tracker_path, mode='a', header=needs_header, index=False)
            _pending_reviews.clear()

        # Calculate and print the time taken for each batch of 10
        end_time = time.time()
        batch_time = end_time - start_time
        batch_times.append(batch_time)
        average_batch_time = sum(batch_times) / len(batch_times)
        print(f"Time taken for this batch of 10: {batch_time} seconds")
        # Estimate the remaining time from the average batch duration
        remaining_batches = (total_listings - total_processed) / 10
        estimated_total_time = remaining_batches * average_batch_time
        print(f"Estimated total time remaining: {estimated_total_time} seconds")
        # Reset start time for the next batch
        start_time = time.time()

    return reviews_extracted

### Run Review Scraper

In [None]:
# Initialize start time and batch_times list before processing listings
start_time = time.time()
batch_times = []  # Duration of each completed batch, used for the time estimate

all_reviews = []
for listing_id in listings['listing_id']:
    all_reviews.extend(get_reviews_and_log(listing_id))

# Rewrite the tracker once, from the rows loaded at start-up plus everything
# fetched in this session. Appending here instead would duplicate every row that
# get_reviews_and_log already checkpointed and could add a second header row in
# the middle of the file, which breaks the next resume.
# NOTE: rows from an earlier run are written back through pandas, so a numeric
# column that contains blanks is re-serialised as floats.
reviews_df = pd.DataFrame(all_reviews)
tracker_frames = [frame for frame in (progress_tracker, reviews_df) if not frame.empty]
if tracker_frames:
    final_tracker = pd.concat(tracker_frames, ignore_index=True)
else:
    final_tracker = progress_tracker  # nothing scraped yet: header-only file

# Write to a temporary file first so a failure mid-write cannot corrupt the tracker
tracker_tmp_path = "Progress_Tracker2.csv.tmp"
final_tracker.to_csv(tracker_tmp_path, index=False)
os.replace(tracker_tmp_path, "Progress_Tracker2.csv")

# NOTE(review): `listings` holds only the listings handled in this session (those
# already in the tracker were filtered out at load time) and has no review
# columns — confirm this export is what downstream steps expect.
listings.to_csv("Listings_with_Reviews_Info.csv", index=False, encoding='utf-8')

# Synthetic Labeling

In [None]:
# NOTE(review): versions are unpinned. The code below relies on instructor.from_openai
# and create_with_completion, which presumably need a recent instructor release —
# confirm the versions in use and pin them here.
%pip install openai instructor

In [None]:
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field
import instructor
import enum

In [None]:
# Supply the OpenAI API key: either paste it after "=" on the %env line, or
# uncomment load_dotenv() to load it from a dotenv file instead. Presumably
# OpenAI() picks this variable up when the client is constructed — confirm.
# Do not commit a notebook with a real key filled in.
# load_dotenv()
%env OPENAI_API_KEY=

env: OPENAI_API_KEY=


### Response Model

Pydantic classes that coerce the LLM's output into a fixed, validated structure

In [None]:
class KeyAspectsLabel(enum.IntEnum):
  # Five-level score (1-5) for the "key aspects" criterion. The meaning of each
  # level is given in the description of Prediction.key_aspects_rating.
  ONE, TWO, THREE, FOUR, FIVE = range(1, 6)

class DecisionMakingAdviceLabel(enum.IntEnum):
  # Five-level score (1-5) for the "decision-making advice" criterion. The meaning
  # of each level is given in the description of
  # Prediction.decision_making_advice_rating.
  ONE, TWO, THREE, FOUR, FIVE = range(1, 6)

class ExpertiseClaimsLabel(enum.IntEnum):
  # Five-level score (1-5) for the "expertise claims" criterion. The meaning of
  # each level is given in the description of Prediction.expertise_claims_rating.
  ONE, TWO, THREE, FOUR, FIVE = range(1, 6)


class Prediction(BaseModel):
  # Structured result for one scored review: one 1-5 rating per criterion.
  # NOTE: documented with comments rather than a docstring on purpose — a class
  # docstring may be surfaced in the schema sent to the model, which would change
  # the request. Confirm before adding one.
  # The Field descriptions below are the rubric for each score; they are runtime
  # text, so keep them exactly as written.

  # How many specific aspects of the listing the review mentions.
  key_aspects_rating: KeyAspectsLabel = Field(
    ...,
    description="1 indicates the review contains no specific aspect of the AirBnB listing, 2 indicates one specific aspect of the listing is mentioned, ..., 5 indicates four or more specific aspects of the listing are mentioned.",
  )

  # How explicitly the review advises on whether to rent the listing.
  decision_making_advice_rating: DecisionMakingAdviceLabel = Field(
    ...,
    description="1 indicates the review describes personal experiences vaguely without advising on renting decision, 2 indicates the review describes personal experiences clearly without advising on renting decisions, 3 indicates the review offers an implicit advice on whether to rent the listing, 4 indicates the review offers explicit advice on whether to rent the listing, and 5 indicates the review explicitly advises who should and should not rent the listing.",
  )

  # How strongly the reviewer claims familiarity or expertise.
  expertise_claims_rating: ExpertiseClaimsLabel = Field(
    ...,
    description="1 indicates the review makes no claim of the reviewer's expertise, 2 indicates the review suggests familiarity of the reviewer with listings similar to the one being reviewed, 3 indicates the review suggests familiarity with the listing under review, 4 indicates the review makes claims of the reviewer's expertise without justification, 5 is 4 but with a justification provided.",
  )

### Initialize OpenAI Client with Instructor Patch

In [None]:
# Wrap the OpenAI client with instructor so chat completions accept a
# `response_model` (classify passes the Prediction model through this client).
client = instructor.from_openai(OpenAI())

### System Prompt

In [None]:
# System message sent with every classification request: it states the annotator
# role, names the three scoring criteria, and gives three scored example reviews.
# The text is part of the request, so any edit changes the labels the model produces.
SYSTEM_PROMPT = """
You are a data annotator. Your job is to score reviews of AirBnB listings based on three criteria: Key aspects, Decision-making advice, and Expertise claims. Here are a few examples of scored reviews:

Review 1: ‘The apartment is really beautiful. The location is really good. We did not have a chance to meet Dominique but her answer is quick and helpful! We definitely recommend this place! We will be back in the future! Thank you Dominique!’

Key Aspects Score: 3
Decision-making Advice Score: 4
Expertise Claims Score: 3

Review 2: ‘We were very well received by Gilles, who proved to be kind and attentive.The apartment corresponds to what is described in the ad, is clean, well decorated and equipped with basic items.Our stay there was very good, I recommend for the AP, the host and its location, in the center of Marais, charming neighborhood with museums, galleries, shops, bars and restaurants, easy access to bus and metro.’

Key Aspects Score: 5
Decision-making Advice Score: 3
Expertise Claims Score: 4

Review 3: ‘it was great... good location, nice place and Cyril is very helpful’

Key Aspects Score: 2
Decision-making Advice Score: 1
Expertise Claims Score: 1
"""

### Classification Function

In [None]:
def classify(data: str) -> Prediction:
  """Score one review with the LLM and return the structured result.

  Args:
    data: Text of the review to score.

  Returns:
    The ``Prediction`` parsed from the model's response, holding the three
    criterion ratings.
  """
  conversation = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": f"Score this review: {data}"},
  ]

  # The call yields a pair: the parsed response model and the raw completion.
  # Only the parsed model is handed back to the caller.
  prediction, _raw_completion = client.chat.completions.create_with_completion(
    model="gpt-4-turbo-2024-04-09",
    response_model=Prediction,
    messages=conversation,
  )

  return prediction

### Run Classifier

In [None]:
import pandas as pd
import logging
import os

# Set up logging
logging.basicConfig(filename='classification.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the reviews from the CSV file
df = pd.read_csv('training_raw_numbered.csv')
print(df.head())

# Resume support: if the output file already holds classified rows, skip exactly
# those row numbers. Filtering by membership (rather than "greater than the last
# saved row number") means rows that failed earlier are retried, and a file that
# contains only the header does not filter out every review.
output_file = 'classified_reviews.csv'
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
    existing_df = pd.read_csv(output_file)
    done_row_numbers = existing_df['row_number'].dropna()
    df = df[~df['row_number'].isin(done_row_numbers)]
    if done_row_numbers.empty:
        logging.info("Output file has no classified rows yet; starting from the first review.")
    else:
        logging.info(f"Resuming from row number {done_row_numbers.max()}")
    logging.info(f"{len(df)} reviews left to classify")
else:
    # Create a new file with headers if not resuming (also replaces a zero-byte file)
    pd.DataFrame(columns=['row_number', 'review', 'listing_id', 'key_aspects_rating', 'decision_making_advice_rating', 'expertise_claims_rating']).to_csv(output_file, index=False)
    logging.info("Starting new classification file.")

# Progress counters read and updated by process_reviews
total_reviews = len(df)
processed_count = 0

# Function to process and classify reviews in batches and save to new CSV
def process_reviews(df):
    """Classify every review in ``df`` and append the results to ``output_file``.

    Args:
        df: DataFrame with the columns ``row_number``, ``review`` and
            ``listing_id``; one row per review to classify.

    Side effects:
        Appends classified rows (without header) to ``output_file`` after every
        10 processed reviews and once more at the end for any remainder, logs
        each success or failure, and increments the module-level
        ``processed_count`` while printing progress.

    Reviews whose classification raises are logged and skipped, so one bad
    review does not stop the run.
    """
    global processed_count

    output_columns = ['row_number', 'review', 'listing_id', 'key_aspects_rating', 'decision_making_advice_rating', 'expertise_claims_rating']
    batch_size = 10

    def _append_rows(rows):
        # Append one batch of classified rows to the output CSV.
        pd.DataFrame(rows, columns=output_columns).to_csv(output_file, mode='a', header=False, index=False)

    results = []
    # Count positions with enumerate: the labels yielded by iterrows() are the
    # frame's index values, which stop matching row positions once the frame has
    # been filtered for a resumed run.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        try:
            prediction = classify(row['review'])
            # Store plain ints rather than enum members so the CSV holds numeric scores
            results.append([
                row['row_number'],
                row['review'],
                row['listing_id'],
                int(prediction.key_aspects_rating),
                int(prediction.decision_making_advice_rating),
                int(prediction.expertise_claims_rating),
            ])
            logging.info(f"Processed review row number {row['row_number']}")
        except Exception as e:
            logging.error(f"Error processing review row number {row['row_number']}: {str(e)}")

        # Save every 10 reviews (skipped when every review in the batch failed)
        if position % batch_size == 0 and results:
            _append_rows(results)
            results = []  # Reset the results list for the next batch
            logging.info("Saved batch of reviews to classified_reviews.csv")

        # Print progress
        processed_count += 1
        print(f"Processed {processed_count}/{total_reviews} reviews.")

    # Save any remaining reviews not yet saved
    if results:
        _append_rows(results)
        logging.info("Saved final batch of reviews to classified_reviews.csv")

# Run the processing function
process_reviews(df)