In [18]:
#retrieve and store in a list of url_ending. For example: [egcu.org,libertyfirstcu.com, etc]
#loop through this list to have a consolidated "soup" and get 2 separated files: details & reviews of all companies
#connect to Postgre using Psycopg and store as tables there
#set up cron job & automated scraping for new reviews daily, then append them to the table. 

In [19]:
import requests 
from bs4 import BeautifulSoup 

atm_url = 'https://www.trustpilot.com/categories/atm'

BASE_URL = "https://www.trustpilot.com"

In [20]:
#function for html parser
def get_soup(url):
    response = requests.get(url)
    return BeautifulSoup(response.content, "html.parser")

In [21]:
soup = get_soup(atm_url)

In [22]:
#function to scrap all the URLs of business page

def get_company_urls(soup_response):
    company_urls = []
    for a in soup.select("a[name='business-unit-card']"):
        url_subdirectory = a.attrs.get("href")
        company_urls.append(BASE_URL+url_subdirectory)
    return company_urls

In [23]:
#function to get the link of the next page button and scrap content on next page
def get_next_page_url(soup_response):
    return soup.select("a[name='pagination-button-next']")[0].attrs.get("href")

In [24]:
#scrap the list of company URLs
company_urls = []

while soup:
    company_urls.extend(get_company_urls(soup))
    next_page = get_next_page_url(soup)
    if next_page:
        soup = get_soup(BASE_URL+next_page)
    else:
        soup = None

In [25]:
print(len(company_urls))

22


In [26]:
#remove duplicates in the URL list if any

deduplicated_company_urls = set(company_urls)

print(len(deduplicated_company_urls))

deduplicated_company_urls

21


{'https://www.trustpilot.com/review/acmeatm.cash',
 'https://www.trustpilot.com/review/asicminersrig.com',
 'https://www.trustpilot.com/review/asicminertech.com',
 'https://www.trustpilot.com/review/cashexpressllc.com',
 'https://www.trustpilot.com/review/coinhubatm.com',
 'https://www.trustpilot.com/review/cryptobaseatm.com',
 'https://www.trustpilot.com/review/cryptodispensers.com',
 'https://www.trustpilot.com/review/egcu.org',
 'https://www.trustpilot.com/review/heritagevalleyfcu.org',
 'https://www.trustpilot.com/review/koinkryptatm.com',
 'https://www.trustpilot.com/review/kryptominerstech.com',
 'https://www.trustpilot.com/review/libertyfirstcu.com',
 'https://www.trustpilot.com/review/meriwest.com',
 'https://www.trustpilot.com/review/northone.com',
 'https://www.trustpilot.com/review/pnc.com',
 'https://www.trustpilot.com/review/slide2thrive.com',
 'https://www.trustpilot.com/review/swadesh.co',
 'https://www.trustpilot.com/review/thepaymenthq.com',
 'https://www.trustpilot.co

In [27]:
def parse_company_data(sub_soup):
    name = sub_soup.find('span', attrs={'class': 'typography_display-s__qOjh6 typography_appearance-default__AAY17 title_displayName__TtDDM'}).text.strip()
    ratings = sub_soup.find('span', attrs={'class': 'typography_body-l__KUYFJ typography_appearance-subtle__8_H2l styles_text__W4hWi'}).text
    star_elements = sub_soup.find_all('p', attrs={'class': 'typography_body-m__xgxZ_ typography_appearance-default__AAY17 styles_cell__qnPHy styles_percentageCell__cHAnb'})
    stars = [star_element.text.strip() for star_element in star_elements]
    return [name, ratings] + stars

In [28]:
company_data = []
for company_url in deduplicated_company_urls:
    subpage = get_soup(company_url)
    company_data.append(parse_company_data(subpage))

In [29]:
import pandas as pd

columns = ['company_name', 'rating_class', 'star_5', 'star_4', 'star_3', 'star_2', 'star_1']

df_details = pd.DataFrame(data=company_data, columns=columns)

In [30]:
#cleaning the data in dataframe
df_details['total_reviews'] = df_details['rating_class'].apply(lambda x: x.split(' ')[0])
df_details['rating_class'] = df_details['rating_class'].apply(lambda x: x.split(' ')[-1])
df_details.drop(df_details[df_details['total_reviews'] == '0'].index, inplace = True)
df_details

Unnamed: 0,company_name,rating_class,star_5,star_4,star_3,star_2,star_1,total_reviews
0,Meriwest Credit Union,Average,62%,13%,0%,0%,25%,8
1,Evergreen Credit Union,Excellent,97%,2%,<1%,<1%,<1%,224
2,ASDFTEST,Average,100%,0%,0%,0%,0%,1
3,Swadesh,Excellent,100%,0%,0%,0%,0%,9
5,Liberty First Credit Union,Excellent,92%,4%,2%,1%,1%,198
6,Coinhub Bitcoin ATM,Average,0%,0%,0%,0%,100%,2
9,Coin Cloud,Poor,14%,4%,6%,8%,68%,104
10,Asicminertech,Poor,14%,7%,0%,4%,75%,28
11,Kryptominerstech,Average,100%,0%,0%,0%,0%,1
12,PNC Bank,Bad,5%,1%,2%,4%,88%,842


In [31]:
df_details.to_csv('company_details.csv', index=False)

In [32]:
#Establish connection with PostgreSQL using psycopg2

import psycopg2
import numpy as np
import psycopg2.extras as extras

#Function to insert values into existing table
def execute_values(conn, df, table):
  
    tuples = [tuple(x) for x in df.to_numpy()]
  
    col = ','.join(list(df.columns))
    # SQL query to execute
    query = "DELETE FROM %s; INSERT INTO %s(%s) VALUES %%s" % (table, table, col)
    
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print("the dataframe is inserted")
    cursor.close()
  
  
conn = psycopg2.connect(
    database="atm_scraping", user='postgres', password='postgres', host='127.0.0.1', port='5432'
)

In [33]:
execute_values(conn, df_details, 'company_details')

the dataframe is inserted
