In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm import tqdm
LIST_URL = "https://www.lroc.asu.edu/atlases/pits/list"
BASE_URL = "https://www.lroc.asu.edu/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

response = requests.get(LIST_URL, headers=HEADERS)

if response.status_code == 200:
    print("Successfully fetched the webpage!")
else:
    print(f"Failed to fetch webpage. Status code: {response.status_code}")
    exit()

Successfully fetched the webpage!


In [None]:
soup = BeautifulSoup(response.content, "html.parser")
table = soup.find("table", {"id": "pitsTable"})

if not table:
    raise ValueError("Table with ID 'pitsTable' not found in the HTML content.")

expected_headers = ['Host Feat.', 'Name', 'Lat.', 'Long.', 'Funnel Max Diam. (m)', 'Funnel Min Diam. (m)',
                    'Inner Max Diam. (m)', 'Inner Max Diam. Sorting', 'Inner Min Diam. (m)',
                    'Inner Min Diam. Sorting', 'Azimuth', 'Depth (m)', 'Depth Sorting']
headers = [header.text.strip() for header in table.find("thead").find_all("th")]
# Here we added link suffix to keep the information
MONGO_HEADERS = ['hosting_feature', 'name', 'latitude', 'longitude', 'funnel_max_diameter', 'funnel_min_diameter',
                 'inner_max_diameter', 'inner_max_diameter_sorting', 'inner_min_diameter', 'inner_min_diameter_sorting',
                 'azimuth', 'depth', 'depth_sorting', 'link_suffix']

if headers != expected_headers:
    raise ValueError("Table headers have changed. Please update the script.")


rows = table.find("tbody").find_all("tr")
data = []

for row in rows:
    cells = row.find_all("td")
    cell_data = []
    object_link = None

    for cell in cells:
        # Check if the cell contains a link, otherwise get the text
        link = cell.find("a")
        if link:
            # Store both the visible text and the link
            cell_data.append(cell.text.strip())
            object_link = link["href"]
        else:
            # Store only the text if no link is found
            cell_data.append(cell.text.strip())
    # Add the link at the end, since the header was also appended
    cell_data.append(object_link)
    data.append(cell_data)

# Convert to a pandas DataFrame
general_df = pd.DataFrame(data, columns=MONGO_HEADERS)
detail_data = []
image_data = []

for index, row in general_df.iterrows():
    detail_url = f'{BASE_URL}{row["link_suffix"]}'
    
    try:
        detail_response = requests.get(detail_url, headers=HEADERS)
    except ConnectionError as e:
        from time import sleep
        sleep(15)
        detail_response = requests.get(detail_url, headers=HEADERS)
    
    if detail_response.status_code != 200:
        print(f"Failed to fetch webpage for pit {row['name']}. Status code: {detail_response.status_code}")
        continue

    detail_soup = BeautifulSoup(detail_response.content, "html.parser")
    divs = detail_soup.find_all("div", {"class": "table-responsive"})

    # Parse details table and include possible origin
    detail_table = divs[0].find("table")
    detail_rows = detail_table.find_all("tr")
    parsed_details = {detail.find("th").text.strip(): detail.find("td").text.strip() for detail in detail_rows[1:]}
    parsed_details['origin'] = detail_rows[0].find("th").text.strip()
    # This is to link the object with its details
    parsed_details['name'] = row['name']

    images_tables = divs[1].find_all("table")
    for image_table in images_tables:
        image_detail = {}
        image_detail['title'] = image_table.find("th").text.strip()
        image_detail['object'] = row['name']
        for dato in image_table.find_all("tr"):
            if dato.find("th") and dato.find("td"):
                image_detail[dato.find("th").text.strip()] = dato.find("td").text.strip(dato.find("td").text.strip())


        image_data.append(image_detail)
    detail_data.append(parsed_details)

detail_df = pd.DataFrame(detail_data)
image_df = pd.DataFrame(image_data)

ConnectionError: HTTPSConnectionPool(host='www.lroc.asu.edu', port=443): Max retries exceeded with url: /atlases/pits/62 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x71bad7aa9ac0>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [89]:
images_tables[2].find("img")['src']

'/data/support/pits/images/61/Copernicus_17a_M188378781L.png'

In [78]:
l = detail_table.find_all("tr")

In [70]:
l[0].find("th").text.strip()

'Impact Melt pit: Adams B'

In [None]:
from pymongo import MongoClient
from bson import ObjectId

client = MongoClient("mongodb://localhost:27017/")
db = client["lunar_db"]
pits_collection = db["pits"]
detailed_pits_collection = db["pits_detailed"]
images_collection = db["pit_images"]



Unnamed: 0,hosting_feature,name,latitude,longitude,funnel_max_diameter,funnel_min_diameter,inner_max_diameter,inner_max_diameter_sorting,inner_min_diameter,inner_min_diameter_sorting,azimuth,depth,depth_sorting,link_suffix
0,Adams B,Adams B 1,-31.4121,65.6408,19,10,10,10,,,110,9.0,9.0,/atlases/pits/22
1,Adams B,Adams B 2,-31.5449,65.7223,21,17,14,14,,,85,4.0,4.0,/atlases/pits/23
2,Adams B,Adams B 3,-31.2886,65.7523,38,17,14,14,,,135,2.0,2.0,/atlases/pits/24
3,Aristarchus,Aristarchus 1,23.8889,312.3506,31,16,27,27,10,10,95,13.0,13.0,/atlases/pits/25
4,Aristarchus,Aristarchus 2a,23.873,312.3029,22,18,19,19,16,16,35,5.0,5.0,/atlases/pits/26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,Virtanen F,Virtanen F 1,15.7946,177.2069,49,42,20,20,14,14,55,17.0,17.0,/atlases/pits/275
274,Weiner F,Weiner F 1,41.0722,149.9284,82,82,63,63,62,62,120,22.0,22.0,/atlases/pits/276
275,Marius Hills,West Marius Hills Pit,13.5507,301.8267,95,70,,,47,47,55,16.0,16.0,/atlases/pits/2
276,Wood T,Wood T 1a,43.8876,235.754,46,33,40,40,25,25,40,11.0,11.0,/atlases/pits/277
