## JavaScript Dynamic Scrapes

Example sites:
- <a href="https://a816-health.nyc.gov/ABCEatsRestaurants/#!/Search">Restaurant Inspections</a>
- <a href="https://www1.nyc.gov/site/doh/health/health-topics/school-cafeteria-inspection-results.page">School cafeteria Inspections</a>
- <a href="https://restructuring.primeclerk.com/pge/Home-ClaimInfo">PG&E bankruptcy claims</a>
- <a href="https://eportal.miteco.gob.es/BoleHWeb/">Ministry for the Ecological Transition and the Demographic Challenge</a>

Before we even scrape anything, let's find the sources of the sites above.

## Scrape check cashing site

We want all the check cashing locations in New York City.

<a href="http://www.fscny.org/?controller=licensedlocations">Financial services check cashing</a>




In [1]:
## import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
from random import randint

In [None]:
# school cafeteria inspections scrape
# Jupyter can't run this cell as Python: cURL is a command-line tool for sending HTTP requests,
# so the command below is kept only as a reference for the request the page makes.

curl 'https://a816-repportal.nyc.gov/ReportingPortal/Resource/Results' \
  -H 'Accept: application/json, text/javascript, */*; q=0.01' \
  -H 'Accept-Language: en-US,en;q=0.9' \
  -H 'Connection: keep-alive' \
  -H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' \
  -H 'Origin: https://www.nyc.gov' \
  -H 'Referer: https://www.nyc.gov/' \
  -H 'Sec-Fetch-Dest: empty' \
  -H 'Sec-Fetch-Mode: cors' \
  -H 'Sec-Fetch-Site: same-site' \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36' \
  -H 'sec-ch-ua: "Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"' \
  -H 'sec-ch-ua-mobile: ?0' \
  -H 'sec-ch-ua-platform: "macOS"' \
  --data-raw 'json=%7B%22AppID%22%3A%223236%22%2C%22FuncAlias%22%3A%22schoolsmeta%22%7D' \
  --compressed

In [2]:
# cURL command converted to Python using curlconverter.com

import requests

# Request headers copied from the browser request so the call looks like
# the one the school-cafeteria page itself makes.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'https://www.nyc.gov',
    'Referer': 'https://www.nyc.gov/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
}

# The form has a single field, 'json', whose value is itself a JSON string
# naming the report to run (AppID / FuncAlias).
data = {
    'json': '{"AppID":"3236","FuncAlias":"schoolsmeta"}',
}

# timeout: without it a stalled connection would hang the kernel indefinitely
# (30 seconds is an arbitrary but generous ceiling).
response = requests.post(
    'https://a816-repportal.nyc.gov/ReportingPortal/Resource/Results',
    headers=headers,
    data=data,
    timeout=30,
)
# Fail here on a 4xx/5xx reply instead of hitting a confusing JSON decode error later.
response.raise_for_status()

In [5]:
# Load the decoded JSON response into a DataFrame and display it.
# NOTE(review): the single-letter column names (I, N, A, Z, L) come straight from the API;
# judging by the values they look like id, name, address, zip and a date — confirm before renaming.
df = pd.DataFrame(response.json())
df

Unnamed: 0,I,N,A,Z,L
0,54722,"4- P.S. 041 FRANCIS WHITE, FIRST STEP NYC @ PS...","411 THATFORD AVENUE, Brooklyn, NY 11212",11212,08/04/2023
1,53876,47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECO...,"223 EAST 23 STREET, New York, NY 10010",10010,07/17/2023
2,318998,A. FANTIS PAROCHIAL SCHOOL,"195 STATE STREET, Brooklyn, NY 11201",11201,12/13/2022
3,54130,A. PHILIP RANDOLPH CAMPUS HIGH SCHOOL,"135 CONVENT AVENUE, New York, NY 10031",10031,10/26/2023
4,324475,ABRAHAM JOSHUA HESCHEL SCHOOL,"30 WEST END AVENUE, New York, NY 10023",10023,05/15/2023
...,...,...,...,...,...
2188,377436,Zeta Charter School,"1325 JEROME AVE, BRONX, NY 10452",10452,06/05/2023
2189,379950,Zeta Charter School -South Bronx,"425 WESTCHESTER AVENUE, Bronx, NY 10455",10455,03/28/2023
2190,358890,ZETA CHARTER SCHOOL INWOOD I,"652 WEST 187 STREET, New York, NY 10033",10033,02/17/2023
2191,377434,ZETA CHARTER SCHOOLS,"1910 ARTHUR AVENUE, Bronx, NY 10457",10457,05/24/2023


In [12]:
# PG&E Claims

# Endpoint the claims table calls to load its data.
url = "https://restructuring.ra.kroll.com/pge/Home-LoadClaimData"

# URL-encoded form body sent with the POST. The fields that control paging and ordering are
# at the end: rows=200 (results per page), page=1, sidx=CreditorName, sord=asc.
payload = "ClaimNumber=&ScheduleNumber=&CreditorName=&ConfirmationID=&TotalCurrentClaimAmount=Select+an+Option%7CSelect+an+Option%7C&Dates=%7C&ScopeValue=Claims&QuickSearch=&Deptors=0%C3%AA4025%C3%AA4026%C3%AA4093%C3%AA&fl=0&_search=false&nd=1699369770577&rows=200&page=1&sidx=CreditorName&sord=asc"
headers = {
  'authority': 'restructuring.ra.kroll.com',
  'accept': 'application/json, text/javascript, */*; q=0.01',
  'accept-language': 'en-US,en;q=0.9',
  'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
  'cookie': 'psi=76128135-aac0-421e-826a-85cb5feb4c2d; _ga=GA1.1.347398889.1698156310; LPVID=YyZTMwMzA5Mjc5MWFkMWE1; LPSID-16393053=kEEhpuMTRzeoHLLccTtPFQ; __cf_bm=7saE055HJCGExK_qlIvWaHVCVZrIv60.9aAIww7CY2E-1699369642-0-AWdqor1qLIEUrjrXj7li04Za7PCAbLmK3SYwZ5xHa3NrH3AEJIOCQXABXhVTgArGKzPMjHAXq2rSJDTiQmsZ0Vs=; aws-waf-token=535ea384-6fe0-43aa-8a48-65e469e7a162:EQoAiU1qMxYRAAAA:C5jHTHR0TuYMTvNJTFVAoCFTTB9cnVcci+bV86kB/5WIgIsPPKivl16eV72N11w1La+nPpXdJAkL1mO9MY5QbFrO5Th9ftrweAgT63wRypht/fYbj0IdPSs2exr78YZp5rnQLmAiLlGx8WcD6ZpmVEE0g9MJP2c+x9eeeHk3hg9MQEc31Uvp6c1ufFoRz5hr6keqm70HYDd/HwZtZwugIHCI0wP09vmVQdcWYE6ltr0UR5mT+emLFU4R5KcdHykIxhB5; AWSALBTG=DKLI3Oz1mINcT6S9xBIQAWwyOLh+R2cgiN3IU9+2FwgdJMgp1VMOM+ieD+jMjx8v1+D1XnLRPXQTY8sKd18Qm1d9NyFRrmue+aDtB1435lSYdlZOVhvLI8dMo/BSMgbekXJQ0CDgFgyNEFzTtmWCKSs7QFr9HFVN3bhjEFO/hYff; AWSALBTGCORS=DKLI3Oz1mINcT6S9xBIQAWwyOLh+R2cgiN3IU9+2FwgdJMgp1VMOM+ieD+jMjx8v1+D1XnLRPXQTY8sKd18Qm1d9NyFRrmue+aDtB1435lSYdlZOVhvLI8dMo/BSMgbekXJQ0CDgFgyNEFzTtmWCKSs7QFr9HFVN3bhjEFO/hYff; AWSALB=C+if3AULQ1FFuyhRT6E6sNlKBASGVuK4mwPSW/Tl0YemDQdm7tE9xRtaVNfpZLEJo8SM8+EEtbXuoN26OFkHxHsURIfY9Q3hR7OmZxnX4Jvi6DWNct/ISpWZhdhA; AWSALBCORS=C+if3AULQ1FFuyhRT6E6sNlKBASGVuK4mwPSW/Tl0YemDQdm7tE9xRtaVNfpZLEJo8SM8+EEtbXuoN26OFkHxHsURIfY9Q3hR7OmZxnX4Jvi6DWNct/ISpWZhdhA; _ga_D5N0JEWGPF=GS1.1.1699369447.2.1.1699369770.0.0.0; AWSALB=cJKBZpzzGzhl7bCorNUM62xtUMpCdgl6BON35gCFgigyq6OPMsCsq0sE9pgu06ek/yt5VkugUWkq9GbLtqUjO+HFee1b0IJhRTS0kmtVowKT8F4tFJ6d9oKStfJg; AWSALBCORS=cJKBZpzzGzhl7bCorNUM62xtUMpCdgl6BON35gCFgigyq6OPMsCsq0sE9pgu06ek/yt5VkugUWkq9GbLtqUjO+HFee1b0IJhRTS0kmtVowKT8F4tFJ6d9oKStfJg; AWSALBTG=6g6jWje7zC3k0tP6dOvRrFShy0YWDN0MsMCEZR96SgTdlmXE+SLWm5IpMlbKe/pnUGrvrf6Kny2uCxy683cXYudBC+kBgfm9MoxbKUVzjUVXC9R20xuEUABGcj2kRukuLcuyPRWAblp7F2YAmuGGlNdsupGVsacwprPYpxtagMvV; AWSALBTGCORS=6g6jWje7zC3k0tP6dOvRrFShy0YWDN0MsMCEZR96SgTdlmXE+SLWm5IpMlbKe/pnUGrvrf6Kny2uCxy683cXYudBC+kBgfm9MoxbKUVzjUVXC9R20xuEUABGcj2kRukuLcuyPRWAblp7F2YAmuGGlNdsupGVsacwprPYpxtagMvV',
  'origin': 'https://restructuring.ra.kroll.com',
  'referer': 'https://restructuring.ra.kroll.com/pge/Home-ClaimInfo',
  'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"macOS"',
  'sec-fetch-dest': 'empty',
  'sec-fetch-mode': 'cors',
  'sec-fetch-site': 'same-origin',
  'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
  'x-requested-with': 'XMLHttpRequest'
}

# POST the search form. The timeout keeps a stalled connection from hanging the kernel,
# and raise_for_status() surfaces HTTP errors here rather than as a JSON decode error later.
response = requests.post(url, headers=headers, data=payload, timeout=30)
response.raise_for_status()


In [13]:
# Inspect the decoded JSON: paging metadata ('total', 'page', 'records') plus the 'rows' list,
# whose text fields arrive wrapped in HTML markup.
response.json()

{'total': 551,
 'page': 1,
 'records': 110159,
 'rows': [{'ClaimID': 1094637,
   'ScheduleNumber': "<b class='tablesaw-cell-label'>Schedule</b> <span class='tablesaw-cell-content'> </span>",
   'ClaimNumber': "<b class='tablesaw-cell-label'>Claim #</b><span class='tablesaw-cell-content'><a  onclick=ShowClaims('MTA5NDYzNw==') id='claim-form-open-4' class='link-claim' data-toggle='modal' data-target='modal-claim-form'>32910</a></span>",
   'DateFiled': "<b class='tablesaw-cell-label'>Filed Date</b><span class='tablesaw-cell-content'>10/17/2019</span>",
   'CreditorName': "<b class='tablesaw-cell-label'>Creditor Name</b><span class='tablesaw-cell-content'>'OHANA HEALTH, LLC</span>",
   'TotalCurrentClaimAmount': "<b class='tablesaw-cell-label'> Claim Value</b><span class='tablesaw-cell-content'> $0.00</span>",
   'DebtorName': "<b class='tablesaw-cell-label'>Debtor Name</b><span class='tablesaw-cell-content'>PG&E Corporation and Pacific Gas and Electric Company</span>"},
  {'ClaimID': 115

In [10]:
# PG&E Claims curl converter

import requests


cookies = {
    'psi': '76128135-aac0-421e-826a-85cb5feb4c2d',
    '_ga': 'GA1.1.347398889.1698156310',
    'LPVID': 'YyZTMwMzA5Mjc5MWFkMWE1',
    'LPSID-16393053': 'kEEhpuMTRzeoHLLccTtPFQ',
    '__cf_bm': '7saE055HJCGExK_qlIvWaHVCVZrIv60.9aAIww7CY2E-1699369642-0-AWdqor1qLIEUrjrXj7li04Za7PCAbLmK3SYwZ5xHa3NrH3AEJIOCQXABXhVTgArGKzPMjHAXq2rSJDTiQmsZ0Vs=',
    'aws-waf-token': '535ea384-6fe0-43aa-8a48-65e469e7a162:EQoAiU1qMxYRAAAA:C5jHTHR0TuYMTvNJTFVAoCFTTB9cnVcci+bV86kB/5WIgIsPPKivl16eV72N11w1La+nPpXdJAkL1mO9MY5QbFrO5Th9ftrweAgT63wRypht/fYbj0IdPSs2exr78YZp5rnQLmAiLlGx8WcD6ZpmVEE0g9MJP2c+x9eeeHk3hg9MQEc31Uvp6c1ufFoRz5hr6keqm70HYDd/HwZtZwugIHCI0wP09vmVQdcWYE6ltr0UR5mT+emLFU4R5KcdHykIxhB5',
    'AWSALBTG': 'DKLI3Oz1mINcT6S9xBIQAWwyOLh+R2cgiN3IU9+2FwgdJMgp1VMOM+ieD+jMjx8v1+D1XnLRPXQTY8sKd18Qm1d9NyFRrmue+aDtB1435lSYdlZOVhvLI8dMo/BSMgbekXJQ0CDgFgyNEFzTtmWCKSs7QFr9HFVN3bhjEFO/hYff',
    'AWSALBTGCORS': 'DKLI3Oz1mINcT6S9xBIQAWwyOLh+R2cgiN3IU9+2FwgdJMgp1VMOM+ieD+jMjx8v1+D1XnLRPXQTY8sKd18Qm1d9NyFRrmue+aDtB1435lSYdlZOVhvLI8dMo/BSMgbekXJQ0CDgFgyNEFzTtmWCKSs7QFr9HFVN3bhjEFO/hYff',
    'AWSALB': 'C+if3AULQ1FFuyhRT6E6sNlKBASGVuK4mwPSW/Tl0YemDQdm7tE9xRtaVNfpZLEJo8SM8+EEtbXuoN26OFkHxHsURIfY9Q3hR7OmZxnX4Jvi6DWNct/ISpWZhdhA',
    'AWSALBCORS': 'C+if3AULQ1FFuyhRT6E6sNlKBASGVuK4mwPSW/Tl0YemDQdm7tE9xRtaVNfpZLEJo8SM8+EEtbXuoN26OFkHxHsURIfY9Q3hR7OmZxnX4Jvi6DWNct/ISpWZhdhA',
    '_ga_D5N0JEWGPF': 'GS1.1.1699369447.2.1.1699369770.0.0.0',
}

headers = {
    'authority': 'restructuring.ra.kroll.com',
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-language': 'en-US,en;q=0.9',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    # 'cookie': 'psi=76128135-aac0-421e-826a-85cb5feb4c2d; _ga=GA1.1.347398889.1698156310; LPVID=YyZTMwMzA5Mjc5MWFkMWE1; LPSID-16393053=kEEhpuMTRzeoHLLccTtPFQ; __cf_bm=7saE055HJCGExK_qlIvWaHVCVZrIv60.9aAIww7CY2E-1699369642-0-AWdqor1qLIEUrjrXj7li04Za7PCAbLmK3SYwZ5xHa3NrH3AEJIOCQXABXhVTgArGKzPMjHAXq2rSJDTiQmsZ0Vs=; aws-waf-token=535ea384-6fe0-43aa-8a48-65e469e7a162:EQoAiU1qMxYRAAAA:C5jHTHR0TuYMTvNJTFVAoCFTTB9cnVcci+bV86kB/5WIgIsPPKivl16eV72N11w1La+nPpXdJAkL1mO9MY5QbFrO5Th9ftrweAgT63wRypht/fYbj0IdPSs2exr78YZp5rnQLmAiLlGx8WcD6ZpmVEE0g9MJP2c+x9eeeHk3hg9MQEc31Uvp6c1ufFoRz5hr6keqm70HYDd/HwZtZwugIHCI0wP09vmVQdcWYE6ltr0UR5mT+emLFU4R5KcdHykIxhB5; AWSALBTG=DKLI3Oz1mINcT6S9xBIQAWwyOLh+R2cgiN3IU9+2FwgdJMgp1VMOM+ieD+jMjx8v1+D1XnLRPXQTY8sKd18Qm1d9NyFRrmue+aDtB1435lSYdlZOVhvLI8dMo/BSMgbekXJQ0CDgFgyNEFzTtmWCKSs7QFr9HFVN3bhjEFO/hYff; AWSALBTGCORS=DKLI3Oz1mINcT6S9xBIQAWwyOLh+R2cgiN3IU9+2FwgdJMgp1VMOM+ieD+jMjx8v1+D1XnLRPXQTY8sKd18Qm1d9NyFRrmue+aDtB1435lSYdlZOVhvLI8dMo/BSMgbekXJQ0CDgFgyNEFzTtmWCKSs7QFr9HFVN3bhjEFO/hYff; AWSALB=C+if3AULQ1FFuyhRT6E6sNlKBASGVuK4mwPSW/Tl0YemDQdm7tE9xRtaVNfpZLEJo8SM8+EEtbXuoN26OFkHxHsURIfY9Q3hR7OmZxnX4Jvi6DWNct/ISpWZhdhA; AWSALBCORS=C+if3AULQ1FFuyhRT6E6sNlKBASGVuK4mwPSW/Tl0YemDQdm7tE9xRtaVNfpZLEJo8SM8+EEtbXuoN26OFkHxHsURIfY9Q3hR7OmZxnX4Jvi6DWNct/ISpWZhdhA; _ga_D5N0JEWGPF=GS1.1.1699369447.2.1.1699369770.0.0.0',
    'origin': 'https://restructuring.ra.kroll.com',
    'referer': 'https://restructuring.ra.kroll.com/pge/Home-ClaimInfo',
    'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

# Results page to request. It has to be defined in this cell so the cell runs on a
# fresh kernel; 2 matches the 'page' value echoed back in the recorded output.
page_number = 2

# Form fields for the claims search; 'rows' is the page size and 'page' the page to fetch.
data = {
    'ClaimNumber': '',
    'ScheduleNumber': '',
    'CreditorName': '',
    'ConfirmationID': '',
    'TotalCurrentClaimAmount': 'Select an Option|Select an Option|',
    'Dates': '|',
    'ScopeValue': 'Claims',
    'QuickSearch': '',
    'Deptors': '0ê4025ê4026ê4093ê',
    'fl': '0',
    '_search': 'false',
    'nd': '1699369770577',
    'rows': '20',
    'page': page_number,
    'sidx': 'CreditorName',
    'sord': 'asc',
}

# timeout: avoid hanging the kernel on a stalled connection (30 s is an arbitrary ceiling).
response = requests.post(
    'https://restructuring.ra.kroll.com/pge/Home-LoadClaimData',
    cookies=cookies,
    headers=headers,
    data=data,
    timeout=30,
)
# Surface HTTP errors (e.g. rejected or expired cookies) immediately.
response.raise_for_status()

In [11]:
# Inspect the decoded JSON; the 'page' field echoes which results page was returned.
response.json()

{'total': 5508,
 'page': 2,
 'records': 110159,
 'rows': [{'ClaimID': 1138576,
   'ScheduleNumber': "<b class='tablesaw-cell-label'>Schedule</b> <span class='tablesaw-cell-content'> </span>",
   'ClaimNumber': "<b class='tablesaw-cell-label'>Claim #</b><span class='tablesaw-cell-content'><a  onclick=ShowClaims('MTEzODU3Ng==') id='claim-form-open-4' class='link-claim' data-toggle='modal' data-target='modal-claim-form'>73166</a></span>",
   'DateFiled': "<b class='tablesaw-cell-label'>Filed Date</b><span class='tablesaw-cell-content'>10/18/2019</span>",
   'CreditorName': "<b class='tablesaw-cell-label'>Creditor Name</b><span class='tablesaw-cell-content'>1st Class Limos LLC</span>",
   'TotalCurrentClaimAmount': "<b class='tablesaw-cell-label'> Claim Value</b><span class='tablesaw-cell-content'> $0.00</span>",
   'DebtorName': "<b class='tablesaw-cell-label'>Debtor Name</b><span class='tablesaw-cell-content'>PG&E Corporation and Pacific Gas and Electric Company</span>"},
  {'ClaimID': 1

In [21]:
# restaurant inspection
# The endpoint takes the borough name as the last path segment (already URL-encoded,
# hence "Staten%20Island") and replies with a JSON list of restaurants.
url_placeholder = 'https://a816-health.nyc.gov/ABCEatsRestaurants/App/GetEntitiesByBoro/'
boros = ["Staten%20Island", "Manhattan", "Brooklyn", "Queens", "Bronx"]

# Only the first borough is requested for now; drop the [:1] slice to fetch all five.
for boro in boros[:1]:
    url = f"{url_placeholder}{boro}"
    print(url)
    # timeout: avoid hanging the kernel on a stalled connection (30 s is an arbitrary ceiling).
    response = requests.get(url, timeout=30)
    # Stop on a 4xx/5xx reply instead of failing inside .json() on an error page.
    response.raise_for_status()
    print(response.json())

https://a816-health.nyc.gov/ABCEatsRestaurants/App/GetEntitiesByBoro/Staten%20Island
[{'EntityName': '1001 NIGHTS CAFE', 'Cuisine': 'American', 'CurrentDecalNumber': '41456477', 'MostRecentVendingBoro': 'Staten Island', 'MostRecentZipCode': '10314', 'MostRecent_Longitude': -74.162543418612, 'MostRecent_Latitude': 40.598416036854, 'Grade': 'A', 'MostRecentVendingLocation': '2025 RICHMOND AVENUE'}, {'EntityName': '120 BAY CAFE/CARGO', 'Cuisine': 'American', 'CurrentDecalNumber': '41646201', 'MostRecentVendingBoro': 'Staten Island', 'MostRecentZipCode': '10301', 'MostRecent_Longitude': -74.075648310405, 'MostRecent_Latitude': 40.639156709708, 'Grade': 'A', 'MostRecentVendingLocation': '120 BAY STREET'}, {'EntityName': '1847 INTERNATIONAL', 'Cuisine': 'African', 'CurrentDecalNumber': '50071182', 'MostRecentVendingBoro': 'Staten Island', 'MostRecentZipCode': '10304', 'MostRecent_Longitude': -74.075493332566, 'MostRecent_Latitude': 40.626429249769, 'Grade': 'A', 'MostRecentVendingLocation': 

In [22]:
# Build a DataFrame from the most recent borough response. Because the loop above only
# requests boros[:1], `response` here holds the Staten Island results.
df_si = pd.DataFrame(response.json())
df_si

Unnamed: 0,EntityName,Cuisine,CurrentDecalNumber,MostRecentVendingBoro,MostRecentZipCode,MostRecent_Longitude,MostRecent_Latitude,Grade,MostRecentVendingLocation
0,1001 NIGHTS CAFE,American,41456477,Staten Island,10314,-74.162543,40.598416,A,2025 RICHMOND AVENUE
1,120 BAY CAFE/CARGO,American,41646201,Staten Island,10301,-74.075648,40.639157,A,120 BAY STREET
2,1847 INTERNATIONAL,African,50071182,Staten Island,10304,-74.075493,40.626429,A,639 BAY STREET
3,286 SOUTH RESTAURANT & LOUNGE,Other,50133395,Staten Island,10309,-74.238708,40.521222,Not Yet Graded,286 RICHMOND VALLEY ROAD
4,290 WILD BAR,American,50067554,Staten Island,10314,-74.191358,40.587008,A,290 WILD AVENUE
...,...,...,...,...,...,...,...,...,...
1026,ZARA CAFE GRILL,Turkish,50070120,Staten Island,10306,-74.098266,40.581384,A,1995 HYLAN BOULEVARD
1027,ZARA FOREST,Turkish,50133239,Staten Island,10303,-74.147129,40.624954,A,1745 FOREST AVENUE
1028,ZINGER HALAL EXPRESS,Chicken,50141303,Staten Island,10314,-74.129482,40.612539,A,1970 VICTORY BOULEVARD
1029,ZIOTOTO RESTAURANTE & BAR,Italian,41645548,Staten Island,10309,-74.235009,40.522059,A,86 PAGE AVENUE


In [31]:
import requests

# Endpoint the claims table calls to load its data.
url = "https://restructuring.ra.kroll.com/pge/Home-LoadClaimData"

# URL-encoded form body sent with the POST. Paging and ordering are controlled by the
# trailing fields: rows=20 (results per page), page=1, sidx=CreditorName, sord=asc.
payload = "ClaimNumber=&ScheduleNumber=&CreditorName=&ConfirmationID=&TotalCurrentClaimAmount=Select+an+Option%7CSelect+an+Option%7C&Dates=%7C&ScopeValue=Claims&QuickSearch=&Deptors=0%C3%AA4025%C3%AA4026%C3%AA4093%C3%AA&fl=0&_search=false&nd=1699375344382&rows=20&page=1&sidx=CreditorName&sord=asc"
headers = {
  'authority': 'restructuring.ra.kroll.com',
  'accept': 'application/json, text/javascript, */*; q=0.01',
  'accept-language': 'en-US,en;q=0.9',
  'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
  'cookie': 'psi=76128135-aac0-421e-826a-85cb5feb4c2d; _ga=GA1.1.347398889.1698156310; LPVID=YyZTMwMzA5Mjc5MWFkMWE1; LPSID-16393053=WUz8dlviTX2aEaxUkq__VQ; aws-waf-token=535ea384-6fe0-43aa-8a48-65e469e7a162:EQoAb8R026wXAAAA:1oLi31BaqFSB3nnlCMyM8UZ4eccjLXMSOGnVer2WHB93SxM2DBKrj8LHNOo0YbpOa3zNkP7uKU5FGdy2jIzswU07/1ptc5vD0KEpF+rABLL/xFdyRhGJiUhT/78p3UbUFDyh/eYqvHKfHfsYIogStBu5+kgU6FUikFYr8g1HbaOiTHeWrGZhcQwrB4ULRbC8dL/NaPwQgI4c4x0hMrfV8EJ2BFugrYBVl/xXptXzhVOmFOsnJfi+RFmfuP/Qxb9cg3uW; AWSALBTG=uQJBvNpvi2DOUA7o1MjPlztyUSvSg+qY4jPDAcWBbq7tGvyJIyMEwjpozc3QzIw2tPCzaBHWrkt43WJuvza3zrvMCGXkcm5vfyRDO/dl6O2cUL7ctTUMhgluO8iplVOTQb7gJQd4t/hLogyAeNNV7UiI+eNpqeIC1k6pG/zPxkem; AWSALBTGCORS=uQJBvNpvi2DOUA7o1MjPlztyUSvSg+qY4jPDAcWBbq7tGvyJIyMEwjpozc3QzIw2tPCzaBHWrkt43WJuvza3zrvMCGXkcm5vfyRDO/dl6O2cUL7ctTUMhgluO8iplVOTQb7gJQd4t/hLogyAeNNV7UiI+eNpqeIC1k6pG/zPxkem; AWSALB=61pBNmJp3GpQZHOTyBiwsjWJ0uKJc0j9okSPCGals9IgFM3trVoX7IHSGXQdRqrG8LzUVjHeaSb3VH6QXO6fmbzzdYGojuS9CDYhNK5Qw2Wd1WshcepcxjgEYH0M; AWSALBCORS=61pBNmJp3GpQZHOTyBiwsjWJ0uKJc0j9okSPCGals9IgFM3trVoX7IHSGXQdRqrG8LzUVjHeaSb3VH6QXO6fmbzzdYGojuS9CDYhNK5Qw2Wd1WshcepcxjgEYH0M; _ga_D5N0JEWGPF=GS1.1.1699374027.3.1.1699375344.0.0.0; AWSALB=nbGGLKdGe5z2rhOeoOAsL5Xb4Mwup5Hh61I0XOC4hKfdy/CCJMZxZzFpFuluVCJ4xrJVzVM6z0gpISYLbw53+gbJ4BHKfl8qigq4QXw9s5fR+cdeX1/Up9mDUlpn; AWSALBCORS=nbGGLKdGe5z2rhOeoOAsL5Xb4Mwup5Hh61I0XOC4hKfdy/CCJMZxZzFpFuluVCJ4xrJVzVM6z0gpISYLbw53+gbJ4BHKfl8qigq4QXw9s5fR+cdeX1/Up9mDUlpn; AWSALBTG=eqhPfPMXNbhPV69zGqi1YM1Hg3zki9Y/VCOruPFnjs8vNi/Mb7xCF6JHrV7oL3ft/ZR/0SaL0Ebkb/8gtDwEKsMK5UhmTK6wog3P8Ib7s5r50CKddveSfq51ccBa0V+QBjOUIRkmfBjt+65gAWLAOvrsUph3G67GQflaLL6WhXoZ; AWSALBTGCORS=eqhPfPMXNbhPV69zGqi1YM1Hg3zki9Y/VCOruPFnjs8vNi/Mb7xCF6JHrV7oL3ft/ZR/0SaL0Ebkb/8gtDwEKsMK5UhmTK6wog3P8Ib7s5r50CKddveSfq51ccBa0V+QBjOUIRkmfBjt+65gAWLAOvrsUph3G67GQflaLL6WhXoZ',
  'origin': 'https://restructuring.ra.kroll.com',
  'referer': 'https://restructuring.ra.kroll.com/pge/Home-ClaimInfo',
  'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"macOS"',
  'sec-fetch-dest': 'empty',
  'sec-fetch-mode': 'cors',
  'sec-fetch-site': 'same-origin',
  'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
  'x-requested-with': 'XMLHttpRequest'
}

# POST the search form. The timeout keeps a stalled connection from hanging the kernel,
# and raise_for_status() surfaces HTTP errors here rather than as a JSON decode error later.
response = requests.post(url, headers=headers, data=payload, timeout=30)
response.raise_for_status()


In [32]:
# Inspect the decoded JSON; the claim records themselves live under the 'rows' key.
response.json()

{'total': 5508,
 'page': 1,
 'records': 110159,
 'rows': [{'ClaimID': 1094637,
   'ScheduleNumber': "<b class='tablesaw-cell-label'>Schedule</b> <span class='tablesaw-cell-content'> </span>",
   'ClaimNumber': "<b class='tablesaw-cell-label'>Claim #</b><span class='tablesaw-cell-content'><a  onclick=ShowClaims('MTA5NDYzNw==') id='claim-form-open-4' class='link-claim' data-toggle='modal' data-target='modal-claim-form'>32910</a></span>",
   'DateFiled': "<b class='tablesaw-cell-label'>Filed Date</b><span class='tablesaw-cell-content'>10/17/2019</span>",
   'CreditorName': "<b class='tablesaw-cell-label'>Creditor Name</b><span class='tablesaw-cell-content'>'OHANA HEALTH, LLC</span>",
   'TotalCurrentClaimAmount': "<b class='tablesaw-cell-label'> Claim Value</b><span class='tablesaw-cell-content'> $0.00</span>",
   'DebtorName': "<b class='tablesaw-cell-label'>Debtor Name</b><span class='tablesaw-cell-content'>PG&E Corporation and Pacific Gas and Electric Company</span>"},
  {'ClaimID': 11

In [34]:
# Keep only the 'rows' list from the JSON payload. Every column except ClaimID still holds
# the raw HTML markup returned by the server at this point.
df = pd.DataFrame(response.json().get('rows'))
df

Unnamed: 0,ClaimID,ScheduleNumber,ClaimNumber,DateFiled,CreditorName,TotalCurrentClaimAmount,DebtorName
0,1094637,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...
1,1150032,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...
2,1117014,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...
3,1130194,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...
4,1147535,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...
5,1116047,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...
6,1163389,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...
7,3210322,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...
8,1128353,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...
9,1125096,<b class='tablesaw-cell-label'>Schedule</b> <s...,<b class='tablesaw-cell-label'>Claim #</b><spa...,<b class='tablesaw-cell-label'>Filed Date</b><...,<b class='tablesaw-cell-label'>Creditor Name</...,<b class='tablesaw-cell-label'> Claim Value</b...,<b class='tablesaw-cell-label'>Debtor Name</b>...


In [None]:
# Show complete cell contents: lift pandas' display limits so that long values,
# rows and columns are no longer truncated with "...".
for display_option in (
    'display.max_colwidth',
    'display.max_rows',
    'display.max_columns',
    'display.width',
):
    pd.set_option(display_option, None)

In [35]:
def strip_html(target):
    """Return the visible text of one HTML-wrapped table cell.

    The value is converted to ``str`` and parsed as HTML. If it contains a
    ``<span class="tablesaw-cell-content">`` element, that span's text is
    returned (as-is, without trimming whitespace). If no such span exists,
    for example because the value is already plain text, the text of the
    whole parsed value is returned instead of raising ``AttributeError``.

    Parameters
    ----------
    target : any
        Cell value; anything that can be passed to ``str()``.

    Returns
    -------
    str
        The extracted text.
    """
    soup = BeautifulSoup(str(target), "html.parser")
    content = soup.find("span", class_="tablesaw-cell-content")
    if content is None:
        # find() returns None when the wrapper span is missing; fall back to
        # whatever text the value contains so re-running the cleanup is harmless.
        return soup.get_text()
    return content.get_text()

In [38]:
# Every column after the first (ClaimID) holds HTML-wrapped text that needs cleaning.
target_cols = df.columns.tolist()[1:]
target_cols


['ScheduleNumber',
 'ClaimNumber',
 'DateFiled',
 'CreditorName',
 'TotalCurrentClaimAmount',
 'DebtorName']

In [43]:
# Clean the HTML out of every column listed in target_cols:
# take one column name at a time, run strip_html on each cell value in that column,
# and write the cleaned column back into the same place in the table.

for column in target_cols:
    df[column] = df[column].apply(strip_html)
df

Unnamed: 0,ClaimID,ScheduleNumber,ClaimNumber,DateFiled,CreditorName,TotalCurrentClaimAmount,DebtorName
0,1094637,,32910,10/17/2019,"'OHANA HEALTH, LLC",$0.00,PG&E Corporation and Pacific Gas and Electric ...
1,1150032,,84461,10/21/2019,"(Allen) Smith, Julia",$0.00,PG&E Corporation and Pacific Gas and Electric ...
2,1117014,,55274,10/10/2019,"(Davis) Gerspacher, Abigail N.",$0.00,PG&E Corporation and Pacific Gas and Electric ...
3,1130194,,64849,10/14/2019,"(Gleeson) Augusto, Maureen T.",$0.00,PG&E Corporation and Pacific Gas and Electric ...
4,1147535,,82079,10/21/2019,"(Summer Burns and Kelly Burns), R.K.B., a Minor",$0.00,PG&E Corporation and Pacific Gas and Electric ...
5,1116047,,54315,10/20/2019,"0 Quietwater Ridge, LLC",$0.00,PG&E Corporation and Pacific Gas and Electric ...
6,1163389,,93858,12/31/2019,"1-800-Radiator of Fairfield, Inc","$17,540.00",PG&E Corporation and Pacific Gas and Electric ...
7,3210322,,106354,08/13/2020,1169 Euclid Avenue LLC,$0.00,PG&E Corporation and Pacific Gas and Electric ...
8,1128353,,63039,10/14/2019,1368 Garnet LLC,$0.00,PG&E Corporation and Pacific Gas and Electric ...
9,1125096,,59790,10/16/2019,14341 Tomki Trust,$0.00,PG&E Corporation and Pacific Gas and Electric ...


In [45]:
# multi-page scrape
# Cookies and headers are the same for every page, so they are built once up front;
# only the 'page' form field changes from one request to the next.
first_page, last_page = 1, 3

# Cookies exported by curlconverter from a browser session.
# NOTE(review): these look like short-lived session tokens — presumably they need to be
# re-captured from the browser when requests start failing.
cookies = {
    'psi': '76128135-aac0-421e-826a-85cb5feb4c2d',
    '_ga': 'GA1.1.347398889.1698156310',
    'LPVID': 'YyZTMwMzA5Mjc5MWFkMWE1',
    'LPSID-16393053': 'WUz8dlviTX2aEaxUkq__VQ',
    'aws-waf-token': '535ea384-6fe0-43aa-8a48-65e469e7a162:EQoAb8R026wXAAAA:1oLi31BaqFSB3nnlCMyM8UZ4eccjLXMSOGnVer2WHB93SxM2DBKrj8LHNOo0YbpOa3zNkP7uKU5FGdy2jIzswU07/1ptc5vD0KEpF+rABLL/xFdyRhGJiUhT/78p3UbUFDyh/eYqvHKfHfsYIogStBu5+kgU6FUikFYr8g1HbaOiTHeWrGZhcQwrB4ULRbC8dL/NaPwQgI4c4x0hMrfV8EJ2BFugrYBVl/xXptXzhVOmFOsnJfi+RFmfuP/Qxb9cg3uW',
    'AWSALBTG': 'uQJBvNpvi2DOUA7o1MjPlztyUSvSg+qY4jPDAcWBbq7tGvyJIyMEwjpozc3QzIw2tPCzaBHWrkt43WJuvza3zrvMCGXkcm5vfyRDO/dl6O2cUL7ctTUMhgluO8iplVOTQb7gJQd4t/hLogyAeNNV7UiI+eNpqeIC1k6pG/zPxkem',
    'AWSALBTGCORS': 'uQJBvNpvi2DOUA7o1MjPlztyUSvSg+qY4jPDAcWBbq7tGvyJIyMEwjpozc3QzIw2tPCzaBHWrkt43WJuvza3zrvMCGXkcm5vfyRDO/dl6O2cUL7ctTUMhgluO8iplVOTQb7gJQd4t/hLogyAeNNV7UiI+eNpqeIC1k6pG/zPxkem',
    'AWSALB': '61pBNmJp3GpQZHOTyBiwsjWJ0uKJc0j9okSPCGals9IgFM3trVoX7IHSGXQdRqrG8LzUVjHeaSb3VH6QXO6fmbzzdYGojuS9CDYhNK5Qw2Wd1WshcepcxjgEYH0M',
    'AWSALBCORS': '61pBNmJp3GpQZHOTyBiwsjWJ0uKJc0j9okSPCGals9IgFM3trVoX7IHSGXQdRqrG8LzUVjHeaSb3VH6QXO6fmbzzdYGojuS9CDYhNK5Qw2Wd1WshcepcxjgEYH0M',
    '_ga_D5N0JEWGPF': 'GS1.1.1699374027.3.1.1699375344.0.0.0',
}

# No 'cookie' header here: the cookies are sent through the `cookies=` argument instead.
headers = {
    'authority': 'restructuring.ra.kroll.com',
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-language': 'en-US,en;q=0.9',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'origin': 'https://restructuring.ra.kroll.com',
    'referer': 'https://restructuring.ra.kroll.com/pge/Home-ClaimInfo',
    'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

# One cleaned DataFrame per scraped page, in page order.
df_list = []

for page in range(first_page, last_page + 1):
    # Same search form as the single-page request; only 'page' varies.
    data = {
        'ClaimNumber': '',
        'ScheduleNumber': '',
        'CreditorName': '',
        'ConfirmationID': '',
        'TotalCurrentClaimAmount': 'Select an Option|Select an Option|',
        'Dates': '|',
        'ScopeValue': 'Claims',
        'QuickSearch': '',
        'Deptors': '0ê4025ê4026ê4093ê',
        'fl': '0',
        '_search': 'false',
        'nd': '1699375344382',
        'rows': '20',
        'page': page,
        'sidx': 'CreditorName',
        'sord': 'asc',
    }

    # timeout: avoid hanging the kernel on a stalled connection (30 s is an arbitrary ceiling).
    response = requests.post(
        'https://restructuring.ra.kroll.com/pge/Home-LoadClaimData',
        cookies=cookies,
        headers=headers,
        data=data,
        timeout=30,
    )
    # Stop on an HTTP error instead of appending a broken or empty page.
    response.raise_for_status()

    df = pd.DataFrame(response.json().get('rows'))
    # Every column after the first (ClaimID) is HTML-wrapped and needs stripping.
    target_cols = list(df.columns)[1:]
    for col in target_cols:
        df[col] = df[col].apply(strip_html)
    df_list.append(df)

    # Pause a random 5-15 seconds between requests to go easy on the server;
    # there is nothing left to wait for after the last page.
    if page < last_page:
        snooze = randint(5, 15)
        print(f"Snoozing for {snooze} seconds.")
        time.sleep(snooze)

print("DONE SCRAPING!")

Snoozing for 9 seconds.
Snoozing for 10 seconds.
Snoozing for 12 seconds.
DONE SCRAPING!


In [49]:
# Stack the per-page frames into a single table with a fresh 0..n-1 index.
df_final = pd.concat(df_list, ignore_index=True)
df_final

Unnamed: 0,ClaimID,ScheduleNumber,ClaimNumber,DateFiled,CreditorName,TotalCurrentClaimAmount,DebtorName
0,1094637,,32910,10/17/2019,"'OHANA HEALTH, LLC",$0.00,PG&E Corporation and Pacific Gas and Electric ...
1,1150032,,84461,10/21/2019,"(Allen) Smith, Julia",$0.00,PG&E Corporation and Pacific Gas and Electric ...
2,1117014,,55274,10/10/2019,"(Davis) Gerspacher, Abigail N.",$0.00,PG&E Corporation and Pacific Gas and Electric ...
3,1130194,,64849,10/14/2019,"(Gleeson) Augusto, Maureen T.",$0.00,PG&E Corporation and Pacific Gas and Electric ...
4,1147535,,82079,10/21/2019,"(Summer Burns and Kelly Burns), R.K.B., a Minor",$0.00,PG&E Corporation and Pacific Gas and Electric ...
5,1116047,,54315,10/20/2019,"0 Quietwater Ridge, LLC",$0.00,PG&E Corporation and Pacific Gas and Electric ...
6,1163389,,93858,12/31/2019,"1-800-Radiator of Fairfield, Inc","$17,540.00",PG&E Corporation and Pacific Gas and Electric ...
7,3210322,,106354,08/13/2020,1169 Euclid Avenue LLC,$0.00,PG&E Corporation and Pacific Gas and Electric ...
8,1128353,,63039,10/14/2019,1368 Garnet LLC,$0.00,PG&E Corporation and Pacific Gas and Electric ...
9,1125096,,59790,10/16/2019,14341 Tomki Trust,$0.00,PG&E Corporation and Pacific Gas and Electric ...
