In [2]:
import os
from urllib.parse import quote_plus

import bs4 as bs
import numpy as np
import pandas as pd
import requests
import sqlalchemy

## Scraping a single page

### HTTP Request

In [3]:
# Listing page for New York City lawyers on Justia
website = 'https://www.justia.com/lawyers/new-york/new-york-city'
# Fetch the page. requests applies no timeout by default, so without one a
# stalled server would leave this cell hanging indefinitely.
response = requests.get(website, timeout=30)

In [4]:
# Display the response object; its repr shows the HTTP status code
# (200 means the request succeeded)
response

<Response [200]>

In [5]:
# Parse the downloaded bytes into a BeautifulSoup tree using 'html.parser'
# (soup.prettify() returns an indented dump of the tree if you need to inspect it)
soup = bs.BeautifulSoup(markup=response.content, features='html.parser')

In [64]:
# Collect every <div> whose data-vars-action attribute is 'OrganicListing';
# each one holds a lawyer's profile
results = soup.find_all(name='div', attrs={'data-vars-action': 'OrganicListing'})

In [65]:
# Probe the first listing: lawyer's name, surrounding whitespace trimmed
(
    results[0]
    .find('strong', class_='lawyer-name')
    .get_text()
    .strip()
)

'Adam Leitman Bailey'

In [118]:
# Probe the first listing: short description line (stored later as `short_bias`)
(
    results[0]
    .find('span', class_='-hide-landscape-phone')
    .get_text()
)

'New York, NY Attorney with 26 years of experience'

In [72]:
# Probe the first listing: practice areas, surrounding whitespace trimmed
(
    results[0]
    .find('span', class_='-practices iconed-line-small')
    .get_text()
    .strip()
)

'Business, Foreclosure Defense, Landlord Tenant and Real Estate'

In [75]:
# Probe the first listing: law school, surrounding whitespace trimmed
(
    results[0]
    .find('span', class_='-hide-tablet -law-schools iconed-line-small')
    .get_text()
    .strip()
)

'Syracuse University College of Law'

In [86]:
# Probe the first listing: address, with tabs removed and line breaks
# turned into commas so it fits on one line
(
    results[0]
    .find('span', class_='-address -hide-landscape-tablet')
    .get_text()
    .strip()
    .replace("\t", "")
    .replace("\n", ",")
)

'One Battery Park Plaza,Eighteenth Floor,New York,NY 10004'

In [94]:
# Probe the first listing: phone number with leading/trailing newlines removed
(
    results[0]
    .find('strong', class_='-phone')
    .get_text()
    .strip('\n')
)

'(212) 825-0365 '

In [97]:
# Probe the first listing: href of the e-mail/contact button
(
    results[0]
    .find('a', class_='-group-button -email')
    .get('href')
)

'https://lawyers.justia.com/lawyer/adam-leitman-bailey-1236057/contact'

In [119]:
# Accumulators for the extraction loop: seven independent empty lists,
# one per output column
name, short_bias, specialization, university, address, phone, mail = (
    [] for _ in range(7)
)

In [120]:
# Walk every listing and append one value per field to the lists created in
# the previous cell (re-run that cell first, otherwise rows are appended twice).
# A field whose element is absent from the listing is recorded as NaN. The
# absence is tested explicitly instead of with a bare `except:`, which would
# also hide unrelated errors and swallow KeyboardInterrupt.
for result in results:
    # Name
    name_tag = result.find('strong', {'class':'lawyer-name'})
    name.append(name_tag.get_text().strip() if name_tag is not None else np.nan)

    # Short description line (stored as `short_bias`)
    bias_tag = result.find('span', {'class':'-hide-landscape-phone'})
    short_bias.append(bias_tag.get_text() if bias_tag is not None else np.nan)

    # Specialization
    practice_tag = result.find('span', {'class':'-practices iconed-line-small'})
    specialization.append(practice_tag.get_text().strip() if practice_tag is not None else np.nan)

    # University
    school_tag = result.find('span', {'class':'-hide-tablet -law-schools iconed-line-small'})
    university.append(school_tag.get_text().strip() if school_tag is not None else np.nan)

    # Address: drop tabs and turn line breaks into commas so it fits on one line
    address_tag = result.find('span', {'class':'-address -hide-landscape-tablet'})
    if address_tag is not None:
        address.append(address_tag.get_text().strip().replace("\t", "").replace("\n", ","))
    else:
        address.append(np.nan)

    # Phone: strip all surrounding whitespace, not only newlines — the raw
    # text carries a trailing space
    phone_tag = result.find('strong', {'class':'-phone'})
    phone.append(phone_tag.get_text().strip() if phone_tag is not None else np.nan)

    # Mail: href of the e-mail/contact button
    mail_tag = result.find('a', {'class':'-group-button -email'})
    mail.append(mail_tag.get('href') if mail_tag is not None else np.nan)

In [121]:
# Create a dataframe from the per-field lists: one row per lawyer, one column per field
justia_df = pd.DataFrame(
    data=list(zip(name, short_bias, specialization, university, address, phone, mail)),
    columns=['name', 'short_bias', 'specialization', 'university', 'address', 'phone', 'mail'],
)

In [125]:
# Preview the first two rows of the single-page results
justia_df.head(2)

Unnamed: 0,name,short_bias,specialization,university,address,phone,mail
0,Adam Leitman Bailey,"New York, NY Attorney with 26 years of experience","Business, Foreclosure Defense, Landlord Tenant...",Syracuse University College of Law,"One Battery Park Plaza,Eighteenth Floor,New Yo...",(212) 825-0365,https://lawyers.justia.com/lawyer/adam-leitman...
1,Russel Morgan,"New York, NY Lawyer","Elder, Estate Planning, Probate and Real Estate",New York Law School,"299 Broadway, 17th Floor,New York,NY 10007",(212) 561-4299,https://lawyers.justia.com/lawyer/russel-morga...


In [None]:
# Export the single-page results to an Excel workbook, leaving out the index column
justia_df.to_excel(excel_writer='justia_df.xlsx', index=False)

## Pagination - 20 pages

In [126]:
N_PAGES = 20          # number of result pages to scrape
REQUEST_TIMEOUT = 30  # seconds; requests applies no timeout unless one is given


def _parse_listing(listing):
    """Extract the scraped fields from one listing element.

    Parameters
    ----------
    listing : bs4 Tag
        One 'OrganicListing' <div> taken from the results page.

    Returns
    -------
    dict
        Keys 'name', 'short_bias', 'specialization', 'university',
        'address', 'phone' and 'mail'. A field whose element is absent
        from the listing maps to np.nan.
    """
    def find_text(tag, css_class, clean=None):
        # Absence is tested explicitly instead of with a bare `except:`,
        # which would also hide unrelated errors and KeyboardInterrupt.
        node = listing.find(tag, {'class': css_class})
        if node is None:
            return np.nan
        text = node.get_text()
        return clean(text) if clean is not None else text

    def clean_address(text):
        # Drop tabs and turn line breaks into commas so the address fits on one line
        return text.strip().replace("\t", "").replace("\n", ",")

    mail_tag = listing.find('a', {'class': '-group-button -email'})
    return {
        'name': find_text('strong', 'lawyer-name', str.strip),
        'short_bias': find_text('span', '-hide-landscape-phone'),
        'specialization': find_text('span', '-practices iconed-line-small', str.strip),
        'university': find_text('span', '-hide-tablet -law-schools iconed-line-small', str.strip),
        'address': find_text('span', '-address -hide-landscape-tablet', clean_address),
        # strip() rather than strip('\n'): the raw phone text carries a trailing space
        'phone': find_text('strong', '-phone', str.strip),
        'mail': mail_tag.get('href') if mail_tag is not None else np.nan,
    }


# Fresh accumulators, so re-running this cell does not duplicate rows
name = []
short_bias = []
specialization = []
university = []
address = []
phone = []
mail = []

# Maps each parsed field to the list that collects it
field_lists = {
    'name': name,
    'short_bias': short_bias,
    'specialization': specialization,
    'university': university,
    'address': address,
    'phone': phone,
    'mail': mail,
}

for i in range(1, N_PAGES + 1):
    # Store website in a variable
    website = f'https://www.justia.com/lawyers/new-york/new-york-city?page={i}'

    # Get request; fail loudly on an HTTP error instead of silently
    # contributing zero rows for that page
    response = requests.get(website, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # Soup object
    soup = bs.BeautifulSoup(response.content, 'html.parser')

    # results with lawyer's profiles
    results = soup.find_all('div', {'data-vars-action':'OrganicListing'})

    for result in results:
        for field, value in _parse_listing(result).items():
            field_lists[field].append(value)

In [130]:
# Combine the lists filled by the pagination loop into one table, one row per lawyer
justia_df_2 = pd.DataFrame(
    data=list(zip(name, short_bias, specialization, university, address, phone, mail)),
    columns=['name', 'short_bias', 'specialization', 'university', 'address', 'phone', 'mail'],
)

In [147]:
# Postgres
# The password is read from the PGPASSWORD environment variable instead of
# being committed in the notebook; a KeyError here means it is not set.
pg_password = os.environ['PGPASSWORD']
# 'postgresql://' is the dialect name SQLAlchemy documents; the 'postgres://'
# alias is rejected by SQLAlchemy 1.4 and later. quote_plus() escapes any
# characters in the password that are special inside a URL.
engine = sqlalchemy.create_engine(
    f'postgresql://postgres:{quote_plus(pg_password)}@localhost:5432'
)

# Write the scraped table. With pandas' default if_exists='fail' this raises
# a ValueError when `lawyers_ny` already exists.
justia_df_2.to_sql('lawyers_ny', engine, index = False)

In [144]:
# Open the connection used by the query cell that follows
con = engine.connect()
# List the tables in the database. Engine.table_names() is deprecated since
# SQLAlchemy 1.4 and removed in 2.0; the inspector API replaces it.
print(sqlalchemy.inspect(engine).get_table_names())

['lawyers_ny', 'retornos_fci']


In [145]:
# Convert query into DataFrame
try:
    # SQLAlchemy 2.0 no longer accepts a plain string in execute();
    # wrapping the SQL in text() is accepted by older and newer releases alike.
    rs = con.execute(sqlalchemy.text('SELECT * FROM lawyers_ny LIMIT 20'))
    lay_20_df = pd.DataFrame(rs.fetchall(), columns = list(rs.keys()))
finally:
    # Release the connection even if the query fails
    con.close()

In [146]:
# Preview the first two rows read back from the `lawyers_ny` table
lay_20_df.head(2)

Unnamed: 0,name,short_bias,specialization,university,address,phone,mail
0,V. Jonas Urba,"New York, NY Attorney with 33 years of experience",Employment,Valparaiso University School of Law,"200 Park Ave. Ste 1700,New York,NY 10166-0005",(212) 731-4776,https://lawyers.justia.com/lawyer/v-jonas-urba...
1,Michael Brevda,"New York City, NY Lawyer with 12 years of expe...",Nursing Home,University of Florida Levin College of Law,"325 W 38th Street,#1101B,New York City,NY 10018",(646) 969-5855,https://lawyers.justia.com/lawyer/michael-brev...
