In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import re

In [2]:
def to_snakecase(string, regex):
    """Turn ``string`` into a lower-case, underscore-separated identifier.

    Every match of ``regex`` (a compiled pattern) is replaced by a single
    underscore, leading and trailing underscores are trimmed, and the result
    is lower-cased. Interior underscores already present in ``string`` are
    left untouched, so adjacent underscores are not collapsed.
    """
    underscored = regex.sub('_', string)
    trimmed = underscored.strip('_')
    return trimmed.lower()

In [3]:
def get_cargo(soup):
    """Extract the cargo checklist from a parsed carrier registration page.

    Looks up the first ``<ul class="cargo">`` in ``soup`` and reads each of
    its ``<li>`` items.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the page.

    Returns
    -------
    dict
        Maps the snake_case label of each list item to ``True`` when the
        item's ``class`` attribute contains ``"checked"`` and ``False``
        otherwise. Empty when the page has no cargo list.
    """
    non_word = re.compile(r'\W+')
    cargo_list = soup.find("ul", attrs={"class": "cargo"})
    if cargo_list is None:
        # find() returns None when nothing matches; report an empty checklist
        # instead of failing with AttributeError on the chained call.
        return {}

    data = {}
    for item in cargo_list.find_all('li'):
        # Tag.get() returns None for an <li> that has no class attribute.
        active = "checked" in (item.get('class') or [])
        label = item.text
        if active and label.startswith('X'):
            # Checked labels are expected to start with an 'X' marker
            # (presumably the rendered check mark - confirm against the page).
            # Drop only that one character so a label that itself begins with
            # 'X' is not eaten, which lstrip('X') would do.
            label = label[1:]
        data[to_snakecase(label, non_word)] = active
    return data

In [4]:
def get_vehicle_type(soup):
    """Read the first ``<table>`` of a parsed page into a list of row dicts.

    Header names come from the ``<th>`` cells of the table's ``<thead>`` and
    are converted to snake_case. Each ``<tr>`` of the ``<tbody>`` becomes one
    dict keyed by those header names; cells beyond the number of headers are
    ignored.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the page.

    Returns
    -------
    list of dict
        One dict per body row. Cell text that parses as an integer
        (thousands separators such as ``"98,785"`` are allowed) is returned
        as ``int``; any other text is returned as a snake_case string.
        Empty when the page has no table, or the table lacks a ``<thead>``
        or ``<tbody>``.
    """
    non_word = re.compile(r'\W+')
    table = soup.find('table')
    if table is None or table.thead is None or table.tbody is None:
        # Nothing to read; avoid AttributeError on a page without the table.
        return []

    headers = [to_snakecase(th.text, non_word) for th in table.thead.find_all('th')]

    data = []
    for row in table.tbody.find_all('tr'):
        # Walk the cells themselves rather than splitting the row's text on
        # newlines, so column alignment does not depend on how the HTML
        # source happens to be line-wrapped.
        cells = row.find_all(['th', 'td'])
        row_data = {}
        for cell, header in zip(cells, headers):
            text = cell.get_text().strip()
            try:
                # Parse the raw text (minus thousands separators). Converting
                # to snake_case first would turn "12.5" into 125 and drop the
                # sign of "-5".
                value = int(text.replace(',', ''))
            except ValueError:
                value = to_snakecase(text, non_word)
            row_data[header] = value
        data.append(row_data)
    return data

In [5]:
def get_carrier_registration(carrier_id, timeout=30):
    """Download and parse the FMCSA SMS registration page for one carrier.

    Parameters
    ----------
    carrier_id : int or str
        Identifier inserted into the ``CarrierRegistration.aspx`` URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        ``'carrier_id'`` (the argument as given), ``'cargo'`` (result of
        ``get_cargo``) and ``'types'`` (result of ``get_vehicle_type``).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    url = 'https://ai.fmcsa.dot.gov/SMS/Carrier/{}/CarrierRegistration.aspx'.format(carrier_id)
    # requests applies no timeout by default, so a stalled server would hang
    # the kernel indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of scraping an error page.
    response.raise_for_status()
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(response.text, 'html.parser')
    return {
        'carrier_id': carrier_id,
        'cargo': get_cargo(soup),
        'types': get_vehicle_type(soup),
    }

In [6]:
# Try the scraper on a single carrier id; this performs a live HTTP request.
get_carrier_registration(21800)

{'carrier_id': 21800,
 'cargo': {'general_freight': True,
  'household_goods': False,
  'metal_sheets_coils_rolls': False,
  'motor_vehicles': False,
  'drive_away_towaway': False,
  'logs_poles_beams_lumber': False,
  'building_materials': False,
  'mobile_homes': False,
  'machinery_large_objects': False,
  'fresh_produce': False,
  'liquids_gases': False,
  'intermodal_containers': False,
  'passengers': False,
  'oil_field_equipment': False,
  'livestock': False,
  'grain_feed_hay': False,
  'coal_coke': False,
  'meat': False,
  'garbage_refuse_trash': False,
  'u_s_mail': False,
  'chemicals': False,
  'commodities_dry_bulk': False,
  'refrigerated_food': False,
  'beverages': False,
  'paper_products': False,
  'utility': False,
  'farm_supplies': False,
  'construction': False,
  'water_well': False,
  'other': False},
 'types': [{'vehicle_type': 'straight_trucks',
   'owned': 98785,
   'term_leased': 0,
   'trip_leased': 0},
  {'vehicle_type': 'truck_tractors',
   'owned': 271

In [7]:
# Check that the scraped result serializes to JSON.
# Note: this calls get_carrier_registration again, so the page is fetched a second time.
json.dumps(get_carrier_registration(21800))

'{"carrier_id": 21800, "cargo": {"general_freight": true, "household_goods": false, "metal_sheets_coils_rolls": false, "motor_vehicles": false, "drive_away_towaway": false, "logs_poles_beams_lumber": false, "building_materials": false, "mobile_homes": false, "machinery_large_objects": false, "fresh_produce": false, "liquids_gases": false, "intermodal_containers": false, "passengers": false, "oil_field_equipment": false, "livestock": false, "grain_feed_hay": false, "coal_coke": false, "meat": false, "garbage_refuse_trash": false, "u_s_mail": false, "chemicals": false, "commodities_dry_bulk": false, "refrigerated_food": false, "beverages": false, "paper_products": false, "utility": false, "farm_supplies": false, "construction": false, "water_well": false, "other": false}, "types": [{"vehicle_type": "straight_trucks", "owned": 98785, "term_leased": 0, "trip_leased": 0}, {"vehicle_type": "truck_tractors", "owned": 27164, "term_leased": 0, "trip_leased": 0}, {"vehicle_type": "trailers", "ow