# Initialization

In [2]:
#import packages
import time
import pandas as pd
import numpy as np
import json
import csv
from collections import defaultdict

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [8]:
# Initialize the container that stores the reviews and related content.
# Run once: re-running this cell rebinds `data` to a fresh, empty container,
# discarding everything collected so far.
data = defaultdict(list)

In [5]:
# Start a Chrome session. Per the original setup note, a Chrome driver
# executable matching the installed browser version is expected to be in the
# current folder.
driver = webdriver.Chrome()
driver.set_window_size(width=1120, height=1000)

# Open the Rate Your Music landing page first
url = 'https://rateyourmusic.com'
driver.get(url)

In [6]:
# Accept the ads-consent banner automatically. According to the original note
# this can lead to some reviews being collected more than once, which is
# handled afterwards. Alternative: skip this cell and decline the ads manually
# in the browser window.
element = WebDriverWait(driver, 5).until(
    EC.element_to_be_clickable(
        (By.XPATH, './/button[@class="fc-button fc-cta-consent fc-primary-button"]')
    )
)
# The click is issued through JavaScript on the located button
driver.execute_script("arguments[0].click();", element)

In [7]:
# Load the file containing the list of album URLs to scrape.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of relying on the locale default.
with open('list_urls.json', 'r', encoding='utf-8') as file:
    # Parse the JSON document; `urls` is sliced and iterated further down,
    # so the file is expected to hold a JSON array of URL strings
    urls = json.load(file)

# Scraping Function

In [10]:
def _find_or_none(xpath):
    """Return the first element matching *xpath* on the current page, or None if there is none."""
    try:
        return driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return None


def get_rym_data(url):
    """
    Scrape all the reviews of one album page into the module-level `data` dictionary.

    INPUT: RYM URL corresponding to an album (expected to contain "/album/")
    OUTPUT: Updated dictionary containing as keys all the album ids and url_not_working.
            Each album id maps to a list of [review_id, date, content, rating] rows;
            date, content and rating are np.nan when the page does not provide them.
            URLs that show the error page, or that do not contain "/album/", are
            appended to data["url_not_working"].
    """
    # Imported locally because the notebook's import cell only brings in
    # NoSuchElementException.
    from selenium.common.exceptions import WebDriverException

    # Building a unique identifier for the album (everything after "/album/").
    # A URL without that marker cannot be scraped: record it instead of crashing
    # the whole batch with an IndexError.
    _, marker, album_id = url.partition("/album/")
    if not marker:
        data["url_not_working"].append(url)
        return data

    # Album already scraped: skip it before spending a page load on it
    if album_id in data:
        return data

    # Open the album page in the browser
    driver.get(url)

    # Detect wrong url (the site shows a dedicated error block)
    if _find_or_none('.//div[@class="page_error_content"]') is not None:
        data["url_not_working"].append(url)
        time.sleep(5)
        return data

    # Ids of the reviews already stored for this album
    seen_ids = set()

    # One iteration per page of reviews; the loop ends when no "next" link
    # can be clicked.
    while True:

        # Locate all the reviews for the current page
        reviews = driver.find_elements(By.XPATH,'.//div[@id="column_container_right"]//div[contains(@id,"reviews")]//div[contains(@id, "std")]')

        for review in reviews:

            # Skip only the review that was already stored; the other reviews
            # on the page still have to be examined.
            review_id = review.get_attribute("id")
            if review_id in seen_ids:
                continue
            seen_ids.add(review_id)

            # Date (if any): structured `datePublished` attribute first,
            # visible `review_date` text as a fallback
            date_element = _find_or_none(f".//div[contains(@id,'{review_id}')]//span[contains(@itemprop, 'datePublished')]")
            if date_element is not None:
                date = date_element.get_attribute('content')
            else:
                date_element = _find_or_none(f".//div[contains(@id,'{review_id}')]//span[contains(@class, 'review_date')]")
                date = date_element.text if date_element is not None else np.nan

            # Text of the review; np.nan when missing so that one incomplete
            # review does not abort the scrape
            content_element = _find_or_none(f".//div[contains(@id,'{review_id}')]//span[@class='rendered_text']")
            content = content_element.text if content_element is not None else np.nan

            # Rating (if any), read from the title of the stars image
            rating_element = _find_or_none(f".//div[contains(@id,'{review_id}')]//img[contains(@alt, 'stars')]")
            rating = rating_element.get_attribute('title') if rating_element is not None else np.nan

            # Appending the data to the dictionary
            data[album_id].append([review_id, date, content, rating])

        # Click next page
        try:
            # Wait for the element to be clickable
            element = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, './/div[@class="review_list"]//a[@class="navlinknext"]')))
            # Click on the element
            driver.execute_script("arguments[0].click();", element)
        except WebDriverException:
            # No clickable "next" link (or the click failed): treat the current
            # page as the last one. Only Selenium errors are caught, so that
            # Ctrl-C still interrupts a long scrape.
            return data

        # Give the next page time to load before reading it
        time.sleep(5)

In [12]:
# Scrape the first 100 URLs of the list, printing the position of each URL as
# a progress indicator
for count, url in enumerate(urls[:100]):
    print(count)
    # The reviews accumulate in the global `data`; the returned dictionary is
    # that same object, so it is not needed here
    get_rym_data(url)

In [22]:
# Creating a (shallow) copy of the batch without the failed-URL entry
reviews = data.copy()
# The "url_not_working" key only exists once at least one URL has failed,
# hence the default value to avoid a KeyError on a fully successful batch
reviews.pop("url_not_working", None)
# Number of keys before / after removing the failed-URL entry
len(data), len(reviews)

In [16]:
# Flatten the batch into a DataFrame: one row per review, prefixed with the
# id of the album it belongs to
df = pd.DataFrame(
    [
        (album, *review_row)
        for album, album_reviews in reviews.items()
        for review_row in album_reviews
    ],
    columns=['Album', 'Review ID', 'Date', 'Text', 'Rating'],
)

In [17]:
# Saving the reviews of this batch to a csv file in the current folder
# (pandas default options are used)
df.to_csv("batch1.csv")

In [18]:
# Write the URLs that could not be scraped to a JSON file
# (an empty list is written when every URL worked)
with open("url_not_working1.json", "w") as json_file:
    json.dump(data["url_not_working"], json_file)