In [15]:
#retrieve and store in a list of url_ending. For example: [egcu.org,libertyfirstcu.com, etc]
#loop through this list to have a consolidated "soup" and get 2 separated files: details & reviews of all companies
#connect to Postgre using Psycopg and store as tables there
#set up cron job & automated scraping for new reviews daily, then append them to the table. 

In [85]:
import requests 
from bs4 import BeautifulSoup 

atm_url = 'https://www.trustpilot.com/categories/atm'

BASE_URL = "https://www.trustpilot.com"

In [86]:
#function for html parser
def get_soup(url):
    response = requests.get(url)
    return BeautifulSoup(response.content, "html.parser")

In [87]:
soup = get_soup(atm_url)

In [88]:
#function to scrap all the URLs of business page

def get_company_urls(soup_response):
    company_urls = []
    for a in soup.select("a[name='business-unit-card']"):
        url_subdirectory = a.attrs.get("href")
        company_urls.append(BASE_URL+url_subdirectory)
    return company_urls

In [89]:
#function to get the link of the next page button and scrap content on next page
def get_next_page_url(soup_response):
    return soup.select("a[name='pagination-button-next']")[0].attrs.get("href")

In [90]:
#scrap the list of company URLs
company_urls = []

while soup:
    company_urls.extend(get_company_urls(soup))
    next_page = get_next_page_url(soup)
    if next_page:
        soup = get_soup(BASE_URL+next_page)
    else:
        soup = None

In [110]:
#remove duplicates in the URL list if any

deduplicated_company_urls = set(company_urls)
deduplicated_company_urls

{'https://www.trustpilot.com/review/acmeatm.cash',
 'https://www.trustpilot.com/review/asicminersrig.com',
 'https://www.trustpilot.com/review/asicminertech.com',
 'https://www.trustpilot.com/review/cashexpressllc.com',
 'https://www.trustpilot.com/review/coinhubatm.com',
 'https://www.trustpilot.com/review/covaultbtm.com',
 'https://www.trustpilot.com/review/cryptobaseatm.com',
 'https://www.trustpilot.com/review/cryptodispensers.com',
 'https://www.trustpilot.com/review/egcu.org',
 'https://www.trustpilot.com/review/heritagevalleyfcu.org',
 'https://www.trustpilot.com/review/koinkryptatm.com',
 'https://www.trustpilot.com/review/kryptominerstech.com',
 'https://www.trustpilot.com/review/libertyfirstcu.com',
 'https://www.trustpilot.com/review/meriwest.com',
 'https://www.trustpilot.com/review/northone.com',
 'https://www.trustpilot.com/review/pnc.com',
 'https://www.trustpilot.com/review/slide2thrive.com',
 'https://www.trustpilot.com/review/swadesh.co',
 'https://www.trustpilot.com/

In [123]:
#Establish connection with PostgreSQL using psycopg2

import psycopg2
import numpy as np
import psycopg2.extras as extras

#Function to insert values into existing table
def execute_values(conn, df, table):
  
    tuples = [tuple(x) for x in df.to_numpy()]
  
    col = ','.join(list(df.columns))
    # SQL query to execute
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, col)
    
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print("the dataframe is inserted")
    cursor.close()
  
  
conn = psycopg2.connect(
    database="atm_scraping", user='postgres', password='postgres', host='127.0.0.1', port='5432'
)

In [124]:
import pandas as pd

for url in deduplicated_company_urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    data = []
    name = soup.find('span', attrs={'class': 'typography_display-s__qOjh6 typography_appearance-default__AAY17 title_displayName__TtDDM'}).text.strip()
    reviews = soup.find_all('div', attrs={'class': 'styles_cardWrapper__LcCPA styles_show__HUXRb styles_reviewCard__9HxJJ'})
    for review in reviews:
        review_stars = review.find_all('div', attrs={'class': 'star-rating_starRating__4rrcf star-rating_medium__iN6Ty'})
        stars = [stars.find('img')['alt'].replace('Rated ', '').replace(' stars', '') for stars in review_stars]
        review_dates = review.find('time', attrs={'class': '', 'data-service-review-date-time-ago': 'true'})
        review_title = review.find('h2', attrs={'class': 'typography_heading-s__f7029 typography_appearance-default__AAY17'})
        reviewer_name = review.find('span', attrs={'class': 'typography_heading-xxs__QKBS8 typography_appearance-default__AAY17'})
        review_text = review.find('p', attrs={'class': 'typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn'})
        experience_date = review.find('p', attrs={'class': 'typography_body-m__xgxZ_ typography_appearance-default__AAY17'})
        review_reply_text = review.find('p', attrs={'class': 'typography_body-m__xgxZ_ typography_appearance-default__AAY17 styles_message__shHhX'})
        reply_date_ = review.find('time', attrs={'class': 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_replyDate__Iem0_'})
        star = stars[0] if stars else None
        title = review_title.text.strip() if review_title else None
        reviewer = reviewer_name.text.strip() if reviewer_name else None
        text = review_text.text.strip() if review_text else None
        experience = experience_date.text.split(':')[-1].strip() if experience_date else None
        review_date = review_dates.get('datetime').split('T')[0].strip() if review_dates else None
        reply_date = reply_date_.text.strip() if reply_date_ else None
        reply_text = review_reply_text.text.strip() if review_reply_text else None
        data.append([name, star, title, reviewer, text, experience, review_date, reply_date, reply_text])
    columns = ['company_name','review_star', 'review_title', 'reviewer_name', 'review_text', 'experience_date', 'review_date', 'reply_date', 'reply_text']
    df_reviews = pd.DataFrame(data, columns=columns)
    execute_values(conn, df_reviews, 'reviews')
    

the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted
the dataframe is inserted


In [93]:
#Reviews

import pandas as pd

reviews = soup.find_all('div', attrs={'class': 'styles_cardWrapper__LcCPA styles_show__HUXRb styles_reviewCard__9HxJJ'})

# Initialize lists to store the data
data = []

# Extract the data from each review container
for review in reviews:
    # Extract review stars
    review_stars = review.find_all('div', attrs={'class': 'star-rating_starRating__4rrcf star-rating_medium__iN6Ty'})
    stars = [stars.find('img')['alt'].replace('Rated ', '').replace(' stars', '') for stars in review_stars]

    # Extract other review details
    review_dates = review.find('time', attrs={'class': '', 'data-service-review-date-time-ago': 'true'})
    review_title = review.find('h2', attrs={'class': 'typography_heading-s__f7029 typography_appearance-default__AAY17'})
    reviewer_name = review.find('span', attrs={'class': 'typography_heading-xxs__QKBS8 typography_appearance-default__AAY17'})
    review_text = review.find('p', attrs={'class': 'typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn'})
    experience_date = review.find('p', attrs={'class': 'typography_body-m__xgxZ_ typography_appearance-default__AAY17'})
    review_reply_text = review.find('p', attrs={'class': 'typography_body-m__xgxZ_ typography_appearance-default__AAY17 styles_message__shHhX'})
    reply_date_ = review.find('time', attrs={'class': 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_replyDate__Iem0_'})

    # Extract the values or set them as None if not found
    star = stars[0] if stars else None
    title = review_title.text.strip() if review_title else None
    reviewer = reviewer_name.text.strip() if reviewer_name else None
    text = review_text.text.strip() if review_text else None
    experience = experience_date.text.split(':')[-1].strip() if experience_date else None
    review_date = review_dates.get('datetime').split('T')[0].strip() if review_dates else None
    reply_date = reply_date_.text.strip() if reply_date_ else None
    reply_text = review_reply_text.text.strip() if review_reply_text else None

    data.append([star, title, reviewer, text, experience, review_date, reply_date, reply_text])

# Define column names for the DataFrame
columns = ['review_star', 'review_title', 'reviewer_name', 'review_text', 'experience_date', 'review_date', 'reply_date', 'review_reply_text']

# Create a DataFrame from the data
df_reviews = pd.DataFrame(data, columns=columns)

df_reviews.head(20)

Unnamed: 0,review_star,review_title,reviewer_name,review_text,experience_date,review_date,reply_date,review_reply_text
0,5 out of 5,Payment HQ is the best Merchant…,Aqua Fix Water Store,Payment HQ is the best Merchant Services out t...,"January 02, 2023",2023-07-07,"Jul 8, 2023",Thank you for sharing your positive feedback! ...
1,5 out of 5,I am lucky to work with The Payment HQ!,Austin Kruck,I am lucky to have had an exceptional experien...,"June 28, 2023",2023-06-29,"Jun 30, 2023","Hi Austin, We appreciate you sharing your posi..."


In [72]:
reviews = get_soup('https://www.trustpilot.com/review/libertyfirstcu.com')
review_rating = reviews.find('time', attrs={'class': '', 'data-service-review-date-time-ago': 'true'}).text
review_rating

'Jul 15, 2023'

In [83]:
#function to scrap reviews: 
def parse_reviews(sub_soup):
    data = []
    reviews = sub_soup.find_all('div', attrs={'class': 'styles_reviewCardInner__EwDq2'})
    for review in reviews: 
        rating = review.select("div[class='styles_reviewHeader__iU9Px']")[0].attrs.get("data-service-review-rating")
        review_date = review.find('time', attrs={'class': '', 'data-service-review-date-time-ago': 'true'}).text
        title = review.find('h2', attrs={'class': 'typography_heading-s__f7029 typography_appearance-default__AAY17'}).text
        reviewer = review.find('span', attrs={'class': 'typography_heading-xxs__QKBS8 typography_appearance-default__AAY17'}).text
        review_text = review.find('p', attrs={'class': 'typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn'})
        experience_date = review.find('p', attrs={'class': 'typography_body-m__xgxZ_ typography_appearance-default__AAY17'})
        reply_text = review.find('p', attrs={'class': 'typography_body-m__xgxZ_ typography_appearance-default__AAY17 styles_message__shHhX'})
        reply_date = review.find('time', attrs={'class': '', 'data-service-review-business-reply-date-time-ago': 'true'})
        experience = experience_date.text.split(':')[-1].strip() if experience_date else None
    return data.append([rating, review_date, title, reviewer, review_text, experience, reply_text, reply_date])

In [84]:
reviews_data = []
for company_url in deduplicated_company_urls:
    company_page = get_soup(company_url)
    company_name = company_page.find('span', attrs={'class': 'typography_display-s__qOjh6 typography_appearance-default__AAY17 title_displayName__TtDDM'}).text.strip()
    review_temp = parse_reviews(company_page)
    reviews_data.append([company_name, review_temp])

UnboundLocalError: local variable 'rating' referenced before assignment

In [26]:
print(len(reviews_data))

22


In [27]:
import pandas as pd
columns = ['company_name','review_star', 'review_title', 'reviewer_name', 'review_text', 'experience_date', 'review_date', 'reply_date', 'review_reply_text']
df_reviews = pd.DataFrame(data=reviews_data, columns=columns)

df_reviews.head(20)

ValueError: 9 columns passed, passed data had 1 columns

In [None]:
#Establish connection with PostgreSQL using psycopg2

import psycopg2
import numpy as np
import psycopg2.extras as extras

#Function to insert values into existing table
def execute_values(conn, df, table):
  
    tuples = [tuple(x) for x in df.to_numpy()]
  
    col = ','.join(list(df.columns))
    # SQL query to execute
    query = "DELETE FROM %s; INSERT INTO %s(%s) VALUES %%s" % (table, table, col)
    
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print("the dataframe is inserted")
    cursor.close()
  
  
conn = psycopg2.connect(
    database="atm_scraping", user='postgres', password='postgres', host='127.0.0.1', port='5432'
)

In [None]:
execute_values(conn, df_reviews, 'reviews')

the dataframe is inserted
