### Setup

# Facility Mapping Test 7/10 - Mapping out a sub-basin in the Eagle Ford

In [1]:
# Mount Google Drive so the notebook can read and write files under /content/gdrive.
from google.colab import drive
import os
drive.mount('/content/gdrive') # only needed when running on Google Colab

# `root` is the project folder; the functions below build their input/output paths from it.
root = '/content/gdrive/My Drive/your/path' # Path where this notebook is located
# Work from `root` so that relative paths (temporary map files, result JSON) land in the project folder.
os.chdir(root)

Mounted at /content/gdrive


In [2]:
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' # installs detecron2, be warned this does take a bit (~5-10 minutes)
!python -m pip install folium
!python -m pip install selenium

Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-7we6w6i6
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-7we6w6i6
  Resolved https://github.com/facebookresearch/detectron2.git to commit 18f69583391e5040043ca4f4bebd2c60f0ebfde0
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.8 (from detectron2==0.6)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting fvcore<0.1.6,>=0.1.5 (from detectron2==0.6)
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting iopath<0.1.10,>=0.1.7 (from detectron2==0.6)
  Downloading iopath-0.1.9-py3-none-any.whl.metadata (370 bytes)
Collecting hydra-core>=

In [21]:


from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode


#comment the above out if your not using detectron

from google.colab.patches import cv2_imshow

#Imports for model evaluation
import cv2

#imports for SIC

import numpy as np
import os
import tempfile
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import folium
import time
from PIL import Image

#imports for SIC test
import random #need a simulated environment to test coords w/o using AI all the time

#Imports for search area creation
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import math
from shapely.geometry import Polygon, box

#imports for file operations
from datetime import datetime

In [20]:

import json

In [None]:
# Model setup: build a detectron2 predictor from the Mask R-CNN R50-FPN config.
# NOTE: the saved run of this cell failed with "Checkpoint ./output/model_final.pth not found!",
# so the weights file must exist under cfg.OUTPUT_DIR before this cell is executed.
from detectron2.engine import DefaultPredictor
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model weights being used (this may change in the future)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 5 # number of classes the model predicts; foureightysix maps them to five labels (tank battery, well pad, artificial lift, separator, compressor)
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5   # detection score threshold: below this the predictor returns nothing for an object
# Detections that pass this threshold are what foureightysix saves to the review folder.
predictor = DefaultPredictor(cfg)

AssertionError: Checkpoint ./output/model_final.pth not found!

### SIC Functions

Below are the functions which scale SIC up to basin-scale processing

In [5]:
def kde_finding(coords, diagonals, top_n=5):
    '''
    Rank search boxes by how likely they are to contain another site.

    A 2-D Gaussian kernel density estimate (KDE) is fitted to the known site
    coordinates, evaluated on a 100 x 100 grid that extends 0.1 degrees past the
    data on every side, drawn as a heatmap, and then averaged inside each box.

    Parameters:
        coords: sequence of (latitude, longitude) pairs, same layout as the SIC
            coordinates. At least three points are required to fit the KDE.
        diagonals: sequence of ((lat, lon), (lat, lon)) pairs giving the
            upper-left and lower-right corner of each search box (the layout
            returned by little_boxes).
        top_n (int): number of highest-density boxes to return. Defaults to 5.

    Returns:
        list of (bounding_box, density) tuples sorted from highest to lowest
        density, where bounding_box is ((lat, lon), (lat, lon)) as supplied.

    Raises:
        ValueError: if fewer than three (lat, lon) points are supplied.

    Notes:
        - This is based on a Gaussian distribution. In practice sites may follow
          a variogram-derived distribution or depend on logistical features, so
          this function should eventually accommodate kriging or logistical
          elements.
        - Example: if a central processing facility is known to be somewhere,
          the chance of wells/gathering facilities nearby is high.
        - Compressor stations are only clustered on the sub-basin scale, so KDE
          will not be useful for them; the opposite is true (incorporate this in
          next steps).
        - Wells and well pads are clustered on the sub-basin scale, so KDE is
          useful for them.
        - Make sure the coordinates lie within the area covered by the boxes;
          points outside will not raise an error but will skew the ranking.
    '''
    coordinates = np.array(coords)
    if coordinates.ndim != 2 or coordinates.shape[0] < 3 or coordinates.shape[1] < 2:
        raise ValueError("kde_finding needs at least three (lat, lon) coordinate pairs")

    # Input pairs are (latitude, longitude), i.e. (y, x): column 1 is x, column 0 is y.
    x = coordinates[:, 1]
    y = coordinates[:, 0]

    # Fit the KDE on (x, y) samples.
    kde = gaussian_kde(np.vstack([x, y]))

    # Evaluation grid: 100 x 100 nodes, padded by 0.1 degrees around the data.
    xmin, xmax = x.min() - 0.1, x.max() + 0.1
    ymin, ymax = y.min() - 0.1, y.max() + 0.1
    xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    grid_coords = np.vstack([xx.ravel(), yy.ravel()])
    zz = kde(grid_coords).reshape(xx.shape)

    # Heatmap of the density with the input points on top.
    plt.figure(figsize=(8, 6))
    plt.imshow(np.rot90(zz), extent=[xmin, xmax, ymin, ymax], cmap='hot', aspect='auto')
    plt.scatter(x, y, c='blue', s=10, label='Well Pad Locations')
    plt.colorbar(label='Density Estimate')
    plt.title('KDE Heatmap of Site Locations')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # The search boxes already exist, so only the density inside each one is needed.
    hotspot_info = []
    for upper_left, lower_right in diagonals:
        y_start, x_start = upper_left[0], upper_left[1]
        y_end, x_end = lower_right[0], lower_right[1]

        # Grid nodes inside this box (the box runs from upper-left to lower-right,
        # so y decreases from y_start to y_end).
        mask = (
            (xx >= x_start) & (xx < x_end) &
            (yy <= y_start) & (yy > y_end)
        )
        if np.any(mask):
            cell_density = float(zz[mask].mean())
        else:
            # No grid node falls inside the box (box smaller than the grid
            # spacing, or outside the grid): evaluate the KDE at the box centre
            # instead of reporting a misleading density of 0.
            centre = np.array([[(x_start + x_end) / 2.0], [(y_start + y_end) / 2.0]])
            cell_density = float(kde(centre)[0])

        hotspot_info.append({
            'density': cell_density,
            'bounding_box': ((y_start, x_start), (y_end, x_end))
        })

    # Keep the top_n boxes with the highest density.
    top_cells = sorted(hotspot_info, key=lambda cell: cell['density'], reverse=True)[:top_n]
    diag_list = []

    print(f"\nTop {len(top_cells)} grid cells most likely to contain another site:")
    for rank, cell in enumerate(top_cells, 1):
        bbox = cell['bounding_box']
        diag_list.append((bbox, cell['density']))
        print(f"{rank:2d}. Density: {cell['density']:.4f}, Bounding Box: {bbox}")

    # Each entry is a (bounding_box, density) tuple; the boxes can be tagged by
    # the caller to avoid searching the same box twice.
    return diag_list


def little_boxes(poly_coords):
    '''
    Tile a polygon with 2-mile x 2-mile grid cells and return their diagonals.

    Parameters:
        poly_coords: list of (latitude, longitude) vertices of the search
            polygon. The latitude of the first vertex is used to convert miles
            to degrees of longitude, so the conversion is only approximate for
            polygons that span a large range of latitudes.

    Returns:
        list of ((lat_max, lon_min), (lat_min, lon_max)) tuples, i.e. the
        upper-left and lower-right corner of every cell that lies completely
        inside the polygon.

    Side effects:
        Prints the number of cells and plots the polygon with the grid.
    '''
    LAT_PER_MILE = 1 / 69  # degrees per mile for latitude
    LON_PER_MILE = 1 / (69 * math.cos(math.radians(poly_coords[0][0])))  # degrees per mile for longitude
    CELL_SIZE_MILES = 2
    DELTA_LAT = CELL_SIZE_MILES * LAT_PER_MILE
    DELTA_LON = CELL_SIZE_MILES * LON_PER_MILE

    # shapely treats the first value of each vertex as x and the second as y.
    # The vertices are (lat, lon), so in shapely terms x = latitude and
    # y = longitude, and bounds = (minx, miny, maxx, maxy) unpacks as below.
    polygon = Polygon(poly_coords)
    min_lat, min_lon, max_lat, max_lon = polygon.bounds

    # Sweep the bounding box. Latitude advances by DELTA_LAT and longitude by
    # DELTA_LON so every cell really is 2 miles on each side.
    grid_boxes = []
    lon = min_lon
    while lon < max_lon:
        lat = min_lat
        while lat < max_lat:
            cell = box(lat, lon, lat + DELTA_LAT, lon + DELTA_LON)
            # NOTE(review): `contains` keeps only cells that are fully inside
            # the polygon, although the printed message below says
            # "intersecting" - confirm which behaviour is wanted.
            if polygon.contains(cell):
                grid_boxes.append(cell)
            lat += DELTA_LAT
        lon += DELTA_LON

    print(f"Generated {len(grid_boxes)} 2-mile x 2-mile boxes intersecting the polygon.\n")

    # Diagonal of each cell, from upper left (max lat, min lon) to lower right
    # (min lat, max lon), in (lat, lon) order.
    diagonals = []
    for cell in grid_boxes:
        cell_min_lat, cell_min_lon, cell_max_lat, cell_max_lon = cell.bounds
        diagonals.append(((cell_max_lat, cell_min_lon), (cell_min_lat, cell_max_lon)))

    # Plot with longitude on the horizontal axis and latitude on the vertical axis.
    fig, ax = plt.subplots(figsize=(8, 6))
    lats, lons = polygon.exterior.xy
    ax.plot(lons, lats, color='blue', label='Polygon')

    for cell in grid_boxes:
        lats, lons = cell.exterior.xy
        ax.plot(lons, lats, color='red', linewidth=0.5)

    ax.set_title("2-Mile x 2-Mile Grid Cells in Polygon")
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    plt.show()

    return diagonals



### Below are the original SIC functions

In [56]:
def generate_coords(diag_pt1, diag_pt2, num_points):
    """
    Build an evenly spaced grid of points covering the rectangle spanned by
    two opposite corners.

    Parameters:
        diag_pt1 (tuple): one corner, as (first, second) components
        diag_pt2 (tuple): the opposite corner, in the same component order
        num_points (int): requested number of points; the grid has
            ceil(sqrt(num_points)) points per side, so the result may contain
            more points than requested

    Returns:
        list of (first, second) tuples for the whole grid, iterating the first
        component in the outer loop. The grid is never trimmed, so every grid
        point is returned.
    """

    # Order the bounds on each axis so the corners can be given either way round.
    first_bounds = sorted([diag_pt1[0], diag_pt2[0]])
    second_bounds = sorted([diag_pt1[1], diag_pt2[1]])

    # Points per side needed for a square grid with at least num_points entries.
    per_side = int(np.ceil(np.sqrt(num_points)))

    x_vals = np.linspace(first_bounds[0], first_bounds[1], per_side)
    y_vals = np.linspace(second_bounds[0], second_bounds[1], per_side)

    print(f"X vals {x_vals}")
    print(f"Y VALS {y_vals}")

    # Cartesian product of the two axes.
    coords = []
    for x in x_vals:
        for y in y_vals:
            coords.append((x, y))

    print(coords)

    # The full grid is returned on purpose: trimming would drop the last points.
    return coords


def launch_driver():
    '''
    Start a headless Chrome webdriver with an 800x800 window.

    Each call uses a freshly created temporary directory as the Chrome user
    data dir and the fixed remote debugging port 9222. Wrapping the launch in
    this function resolved a recurring driver issue (the cause is unknown), so
    call it once and reuse the returned driver.

    returns:
    driver - selenium Chrome webdriver object
    '''
    opts = Options()
    launch_flags = [
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=800,800",
        f"--user-data-dir={tempfile.mkdtemp()}",
        "--remote-debugging-port=9222",
    ]
    for flag in launch_flags:
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)


def master_coords(coordinates, directory):
    '''
    Save one satellite image per coordinate into `directory`.

    For every coordinate a folium map with Esri World Imagery tiles is written
    to a temporary HTML file, opened in a headless Chrome driver, captured as a
    screenshot and stored as a JPEG named satellite_<n>_<lat>_<lon>.jpg.

    Parameters:
        coordinates: list of (lat, lon) tuples in signed decimal degrees (not
            degrees/minutes/seconds).
        directory: folder (relative to `root`) that receives the images. It is
            created if it does not exist.

    Notes:
        - This is a major time bottleneck for SIC: the web driver loads the
          imagery page by page and waits 2 seconds per image for the tiles, so
          only small areas should be processed in one go.
        - Because a screenshot of a web page is taken, page artifacts
          (watermarks, zoom buttons) may appear on the images.
        - If an image fails (or the kernel is interrupted) the user is asked
          whether to continue. Typing END stops the acquisition loop.
        - The driver is always quit and temporary files are always removed,
          even when an error occurs.
    '''
    zoom = 18  # folium zoom level used for every image
    output_dir = os.path.join(root, directory)
    os.makedirs(output_dir, exist_ok=True)

    def generate_satellite_map(lat, lon, zoom, html_filename):
        # Write an HTML map centred on (lat, lon) that shows only Esri imagery.
        m = folium.Map(location=[lat, lon], zoom_start=zoom, tiles=None)

        folium.TileLayer(
            tiles="https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}",
            attr="Esri",
            name="Esri Satellite",
            overlay=False,
            control=False,
        ).add_to(m)

        m.save(html_filename)

    def html_to_jpeg(html_file, jpeg_output_file):
        # The model cannot read HTML, so render the page and convert the
        # screenshot to JPEG.
        driver.get("file://" + os.path.abspath(html_file))
        time.sleep(2)  # wait for map tiles to load

        # Replace only the extension; str.replace would also rewrite ".jpg"
        # occurring elsewhere in the path.
        temp_png = os.path.splitext(jpeg_output_file)[0] + "_temp.png"
        try:
            driver.save_screenshot(temp_png)
            with Image.open(temp_png) as im:
                im.convert("RGB").save(jpeg_output_file, "JPEG", quality=95)
        finally:
            if os.path.exists(temp_png):
                os.remove(temp_png)

    driver = launch_driver()  # Launch once and reuse for every coordinate
    try:
        for idx, (lat, lon) in enumerate(coordinates):
            html_file = f"temp_map_{idx}.html"
            jpeg_file = os.path.join(output_dir, f"satellite_{idx+1}_{lat}_{lon}.jpg")

            try:
                generate_satellite_map(lat, lon, zoom, html_file)
                html_to_jpeg(html_file, jpeg_file)
                print(f"Saved JPEG: {jpeg_file}")

            except (Exception, KeyboardInterrupt):
                # Kernel interrupts are caught on purpose so the user can decide
                # whether to skip this image or stop the run.
                break_line = input(f"Image Aquisition Error or kernal interrupted, to proceed type anything. To end type END, and then stop the kernal")
                if break_line == "END":
                    driver.quit()  # Quit the driver before the kernel is stopped
                    input("You may now end the program by stopping the kernal")
                    # The driver is gone, so no further image can be captured.
                    break

            finally:
                # Remove the temporary map page whether or not the capture worked.
                if os.path.exists(html_file):
                    os.remove(html_file)
    finally:
        driver.quit()

'''
This is the core of SIC
'''
# Arguments are described in the docstring of foureightysix.
def foureightysix(diagUL, diagLR, resolution , directory, threshhold, datamining, preset, tolerance, sampling, sample_size, model_name, looped):
  '''
  Image a search box, run the detection model on every image and return the
  sites that were found.

  Parameters:
      diagUL: upper-left (lat, lon) corner of the box. When `preset` is True
          this is instead the ready-made list of (lat, lon) coordinates.
      diagLR: lower-right (lat, lon) corner of the box (ignored when `preset`
          is True).
      resolution: number of grid points requested from generate_coords.
      directory: folder (relative to `root`) used for the images of the current
          batch. Detections are copied to "<directory> REVIEW FOLDER".
      threshhold: batch size, i.e. number of coordinates imaged and evaluated
          before the image folder is cleaned.
      datamining: if False, all images in `directory` are deleted after every
          batch; if True they are kept (needs extra drive space).
      preset: True to use the coordinates passed in `diagUL` instead of
          generating a grid.
      tolerance: confidence threshold from an earlier version; not used by the
          current code but kept so existing calls keep working.
      sampling: True to evaluate only a random sample of the coordinates.
      sample_size: size of that sample. An int is a number of coordinates, a
          float is a fraction of the coordinate list (0.5 = half of it).
      model_name: file name of the model weights inside cfg.OUTPUT_DIR.
      looped: True to run manual_review_486 on the review folder before
          returning, so that only reviewed positives are returned.

  Returns:
      dict mapping (lat, lon) to a site dictionary. A site dictionary maps each
      detected label to its list of scores and carries a 'status' entry
      ("Check" or 'good') plus 'Facility Type' for sites tagged by site_sort.
  '''

  def _coords_from_filename(file_name):
    # Image names look like "satellite_<n>_<lat>_<lon>.jpg": drop the extension
    # and read the last two fields.
    parts = os.path.splitext(file_name)[0].split("_")
    return (float(parts[2]), float(parts[3]))

  print(f"#########################STARTING NEW COORDINATE SET \n ##########################{diagUL}, {diagLR}######################################")

  if preset:
    coordinates = diagUL
  else:
    coordinates = generate_coords(diagUL, diagLR, resolution)
    coordinates = [(float(lat), float(lon)) for lat, lon in coordinates]

  print(f"All coordinates: {coordinates}")

  if sampling == True:
    # random.sample needs an integer count, so a fractional sample_size is
    # converted to a number of coordinates first.
    if isinstance(sample_size, float):
      sample_count = max(1, round(len(coordinates) * sample_size))
    else:
      sample_count = sample_size
    sample_count = min(sample_count, len(coordinates))
    coordinates = random.sample(coordinates, sample_count)

  print("Coordinates generated")

  # Split the coordinates into batches so the image folder can be cleaned
  # between batches.
  def chunks(lst, n):
    """Return a list of n-sized chunks from lst."""
    return [lst[i:i + n] for i in range(0, len(lst), n)]

  coordinates_chunked = chunks(coordinates, threshhold)

  # Display each batch and its coordinates
  for cord_batch, chunk in enumerate(coordinates_chunked, 1):
    print(f"Batch {cord_batch}: {chunk}")

  print("Chunks divided")

  # Build the predictor. The score threshold used here (0.5) is unrelated to
  # the `threshhold` parameter above.
  print("Importing model")

  from detectron2.engine import DefaultPredictor
  cfg = get_cfg()
  cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
  cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, model_name)  # path to the model being used (this may change in the future)
  cfg.MODEL.ROI_HEADS.NUM_CLASSES = 5
  cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5   # set a custom testing threshold
  predictor = DefaultPredictor(cfg)

  print("Model successfully called")

  # Class index -> label, in the order the model was trained with.
  class_names = ["tank battery", "well pad", "artificial lift", "separator", "compressor"]

  # master_dictionary collects one entry per site that should be returned.
  master_dictionary = {}

  print(coordinates_chunked)

  files_for_review = os.path.join(root, f"{directory} REVIEW FOLDER")
  os.makedirs(files_for_review, exist_ok=True)

  for batch in coordinates_chunked:

    print("##############   BEGINING NEW BATCH    #################")

    master_coords(batch, directory) # This generates the batch of images

    print("- - - IMAGES FOR THIS BATCH SUCCESSFULLY GENERATED")

    files_path_eval = os.path.join(root, directory) # the directory were using
    time.sleep(1) #Need some time for the folder to generate, otherwise errors
    files_eval = os.listdir(files_path_eval) # every file in the directory is evaluated

    print("- - Directory created/called")
    print("- - Looping through files")

    for file in files_eval:
      print(f"\n\n #######################Analyzing {file}################################")
      site_dict = {}
      path = os.path.join(files_path_eval, file)
      im = cv2.imread(path)
      if im is None:
        # cv2.imread returns None for anything it cannot decode; the predictor
        # would fail on it, so skip the file.
        print(f"{file}: could not be read as an image, skipping")
        continue

      outputs = predictor(im)  # format is documented at https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
      instances = outputs["instances"]
      save_path = os.path.join(files_for_review, file)

      # Only do analysis if there is at least one detection in the image
      if len(instances) > 0:
        pred_classes = instances.pred_classes.cpu().tolist()  # class indices
        scores = instances.scores.cpu().tolist()              # confidence scores

        cv2.imwrite(save_path, im) # File goes to the review folder

        # Visualize the detections
        v = Visualizer(im[:, :, ::-1],
                       scale=3.0,
                       instance_mode=ColorMode.IMAGE_BW
        )
        out = v.draw_instance_predictions(instances.to("cpu"))
        cv2_imshow(out.get_image()[:, :, ::-1])

        # Collect the scores of every detected label
        labels = [class_names[class_idx] for class_idx in pred_classes]
        for label, score in zip(labels, scores):
          print(f"Detected: {label} with confidence {score:.2f}")
          site_dict.setdefault(label, []).append(score)

        # The file name carries the coordinate; it becomes the dictionary key.
        site_coord = _coords_from_filename(file)

        # site_sort decides from the labels and scores whether the image is a
        # known facility type, a likely false positive ("BAD"), or undecided
        # (None), so that not every image needs manual review.
        sort_tag = site_sort(site_dict)

        if sort_tag is None:
          site_dict['status'] = "Check"
          print(f"No Facility could be determined\n - We record {site_dict}")
          master_dictionary[site_coord] = site_dict

        elif sort_tag == "BAD":
          print("Only Low Confidence Pad Dectected, likley not a site")
          site_dict['status'] = 'bad'
          print(f"\nFull Site Dict \n{site_dict}\n")
          os.remove(save_path) # Removes bad data from the review folder

        else:
          site_dict['status'] = 'good'
          site_dict['Facility Type'] = sort_tag
          print(f"{site_dict['Facility Type']} Detected")
          print(f"\nFull Site Dict \n{site_dict}\n")
          master_dictionary[site_coord] = site_dict

      else:
        # No detections: nothing is recorded and no analysis is run.
        print(f"{file}: No object detected, check coordinates on google earth if you think this is a mistake")

      print(f"File {file} analyzed \n\n\n")

    print(f"ALL IMAGES IN BATCH {batch} IMAGES SORTED SUCCESSFULLY, NOW CLEANING FOLDER")

    # Clean the image folder. With datamining enabled the images are kept on
    # the drive (needs an extra 2-3 gb of space).
    files_clean_path = os.path.join(root, directory)
    if datamining == False:
      for file in os.listdir(files_clean_path):
        os.remove(os.path.join(files_clean_path, file))

    print("FOLDER SUCCESSFULLY CLEANED")

  # After all batches: optionally let the user review the detections so that no
  # bad data is carried into the next iteration.
  if looped == True:
    reviewed = manual_review_486(f"{directory} REVIEW FOLDER", master_dictionary)

    # Keep only reviewed sites whose image is still present in the review folder.
    review_coords = set()
    for file in os.listdir(files_for_review):
      try:
        review_coords.add(_coords_from_filename(file))
      except (IndexError, ValueError):
        continue # not an image produced by master_coords

    # Replace the master dictionary with the reviewed dictionary
    master_dictionary = {coord: entry for coord, entry in reviewed.items() if coord in review_coords}

  else:
    print("Per-Batch Manual Review Disabled (Manual Review will be done after process)")

  print(f"\n\n\n\n #############Results \n {master_dictionary}")

  return master_dictionary

#### 486 Variants
We will need some variants of this code for detection purposes
1. A variant that analyzes the objects and scores to make a decision on the sites identity

### Coordmaster Test Runs
Puts 486, the KDE function, and manual review together to create a list of files with sites, as well as the dictionary with site features

In [51]:
def coord_master_test(poly_coords, looped, run_name, model_name):
    '''
    Search a polygon for facilities: grid it, run foureightysix on every grid
    cell, manually review the detections, then plot and save the results.

    Parameters:
        poly_coords: list of (lat, lon) polygon vertices, passed to little_boxes.
        looped: passed on to foureightysix; when True the function returns the
            KDE ranking of the grid cells instead of the raw results.
        run_name: name of the run. It is used as the image directory for
            foureightysix and as prefix of the save folder, the review folder
            and the JSON files.
        model_name: file name of the model weights, passed to foureightysix.

    Returns:
        kde_finding(coords_cleaned, diagonals) when `looped` is True, otherwise
        the tuple (coords_cleaned, diagonals), where coords_cleaned is the list
        of reviewed (lat, lon) sites and diagonals the list of searched cells.

    Side effects:
        Writes "<run_name>_save<n>.json" into "<run_name> Save File" after every
        cell (so partial results survive a crash) and a timestamped
        "<run_name>_<timestamp>.json" into the working directory at the end.
        The dictionaries are stored as their string representation because
        their keys are tuples.
    '''
    coords_all = {}
    diagonals = []

    # Run 486 for each box
    boxes = little_boxes(poly_coords)
    length = len(boxes)

    save_folder_path = os.path.join(root, f"{run_name} Save File")
    os.makedirs(save_folder_path, exist_ok=True)
    for counter, i in enumerate(boxes, 1):
        # Progress through the list of boxes
        print(f"##########################################################################################\n\n\n BOX {counter}, out of {length}\n\n\n")
        print(f"##########################################################################################\n\n")
        coords_all.update(foureightysix(i[0], i[1], 200, f"{run_name}", 200, False, False, 0.50, False, 0.5, model_name, looped)) #Be sure to alter these parameters here
        diagonals.append(i)

        # Checkpoint after each box so the results can be recovered if the
        # program fails later on.
        save_name = f"{run_name}_save{counter}.json"
        save_file_path = os.path.join(save_folder_path, save_name)

        with open(save_file_path, "w") as json_file:
            json.dump(str(coords_all), json_file, indent=4)

        print(f"Dictionary saved to {save_file_path}")

    coords_all_safe = coords_all.copy() #save a copy of coords just to be safe
    print(f"\n\n\n MASTER DICTIONARY {coords_all_safe}\n\n\n") #so can be copied in the event of error

    # Run manual review
    coords_all = manual_review_486(f"{run_name} REVIEW FOLDER", coords_all_safe)

    # Keep only the sites whose image is still present in the review folder.
    # The folder is listed once and its coordinates are looked up in a set.
    review_coords = set()
    for file in os.listdir(os.path.join(root, f"{run_name} REVIEW FOLDER")):
        # Image names look like "satellite_<n>_<lat>_<lon>.jpg"
        parts = os.path.splitext(file)[0].split("_")
        try:
            review_coords.add((float(parts[2]), float(parts[3])))
        except (IndexError, ValueError):
            continue  # not an image produced by master_coords

    coords_all = {coord: entry for coord, entry in coords_all.items() if coord in review_coords}

    # Cleaning out None keys
    coords_cleaned = [k for k in coords_all if k is not None]

    # Plotting
    xx = []
    yy = []

    print(f"\n\n\n\n COORDS CLEANED: {coords_cleaned}")

    for coord in coords_cleaned:
        print(coord)
        xx.append(coord[1])
        yy.append(coord[0])

    plt.figure(figsize=(6, 6))
    plt.scatter(xx, yy, c='red', marker='o')
    plt.title("Facility Coordinates")
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.grid(True)
    plt.axis('equal')  # Keeps aspect ratio consistent
    plt.show()

    # To feed these results back into coord master, the KDE ranking can be
    # returned for another round of searching (the next run can have sampling
    # turned off). Otherwise, especially if the results were not sampled, the
    # coordinates and diagonals are returned for future use.
    # Making this process fully automatic is not recommended, as the models are
    # not entirely accurate (only slightly more accurate than not).

    # Final save with the current date and time in the file name
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"{run_name}_{timestamp}.json"

    with open(filename, "w") as json_file:
        json.dump(str(coords_all), json_file, indent=4)

    print(f"Dictionary saved to {filename}")

    if looped == True:
        return kde_finding(coords_cleaned, diagonals)
    else:
        return (coords_cleaned, diagonals)

Manual Review Function - sorts mined data into positives and negatives

In [38]:
'''
This speeds things up: instead of cutting and pasting the image files by hand, they are sorted into folders here.

'''

def manual_review_486(run_name, master_dictionary):
  '''
  Sort the images of a review folder into positives and negatives.

  Images whose site entry has status 'good' are accepted automatically. For all
  other known sites the image is shown and the user decides. Images without an
  entry in `master_dictionary` are treated as negatives.

  Parameters:
      run_name: name of the folder (relative to `root`) holding the images to
          review. The results go to "<run_name> positives" and
          "<run_name> negatives".
      master_dictionary: dict mapping (lat, lon) to a site dictionary with a
          'status' entry, as returned by foureightysix.

  Returns:
      dict with the entries of `master_dictionary` that were marked positive.
      When the user names a facility, it is stored as 'Facility Type'.

  Side effects:
      Every readable image is also written to root/data/train, regardless of
      the decision. Images are copied, so the review folder keeps its files.
  '''
  from google.colab.patches import cv2_imshow #just so we dont have to run everything else
  import time
  import shutil

  reviewed_dict = {}

  files_check_path = os.path.join(root, f"{run_name}")
  files_check = os.listdir(files_check_path)

  print(f"Accessing files in {files_check_path}")
  print(f"Checking files: {files_check}")

  # Destination folders are the same for every image, so create them once.
  positive_dir = os.path.join(root, f"{run_name} positives")
  negative_dir = os.path.join(root, f"{run_name} negatives")
  train_path = os.path.join(root, "data", "train")
  os.makedirs(positive_dir, exist_ok=True)
  os.makedirs(negative_dir, exist_ok=True)
  os.makedirs(train_path, exist_ok=True)

  for file in files_check:
    path = os.path.join(files_check_path, file)
    print(f"\n - - Analyzing File at {path}")

    im = cv2.imread(path)
    if im is None:
      # cv2.imread returns None for files it cannot decode; they can neither be
      # shown nor written to the training folder.
      print(f"{file} could not be read as an image, skipping")
      continue

    # Image names look like "satellite_<n>_<lat>_<lon>.jpg"
    try:
      parts = os.path.splitext(file)[0].split("_")
      site_key = (float(parts[2]), float(parts[3]))
    except (IndexError, ValueError):
      print(f"{file} does not carry coordinates in its name, skipping")
      continue

    user_facility = None
    print(f"\n\n ###################################New IMAGE##############################################\n ########{file}########")

    # A little automation for the sites that are definitely confirmed
    response = 'no'

    try:
      status = master_dictionary[site_key]['status']
      known_site = True
    except KeyError:
      known_site = False
      print("Image index error, sending to Negatives")

    if known_site:
      if status == 'good':
        print("High facility confidence, moving to positives folder ")
        response = 'yes'
      else:
        # The prompts sit outside the try block so that interrupting the kernel
        # stops the review instead of being treated as a negative.
        print("\n\n REVIEW REQUIRED")
        cv2_imshow(im)
        time.sleep(2)
        response = input("Type 'yes' to mark as positive, or press Enter to mark as negative: ").strip().lower()
        if response == 'yes': # pre-labelled facilities do not need user input
          user_facility = input("Enter the facility name: ")

    # Send every image, regardless of score, to the training folder
    cv2.imwrite(os.path.join(train_path, file), im)

    # Pick the destination folder and record positives
    filename = os.path.basename(path)
    if response == "yes":
      dest_path = os.path.join(positive_dir, filename)
      reviewed_dict[site_key] = master_dictionary[site_key]
      if user_facility is not None:
        reviewed_dict[site_key]['Facility Type'] = user_facility
    else:
      dest_path = os.path.join(negative_dir, filename)

    # NOTE(review): the file is copied although the message says "moved";
    # confirm whether the original should be removed from the review folder.
    shutil.copy(path, dest_path)
    print(f"Image moved to: {dest_path}")
    # Next step: download each folder from Google Drive and use the upload tool
    # to send the images to training.

  print(f"Returning dictionary: {reviewed_dict}")

  return reviewed_dict





## Basic Deployment
Let's see how this model works in a small test deployment

In [8]:
# Let's start a polygon database.
# Each polygon is a list of vertices; little_boxes reads the first value of the
# first vertex as a latitude, so the pairs are (latitude, longitude) in decimal degrees.
poly_1 = [(28.681, -98.226), (28.675, -98.178), (28.584, -98.182), (28.589, -98.204)]

poly_2 = [(28.800, -98.098), (28.800, -98.000), (28.771, -98.000), (28.771, -98.098)]

poly_3 = [(28.800, -98.068), (28.800, -98.030), (28.400, -98.000), (28.400, -98.098)]

'''
poly_4 =
poly_5 =
'''

'\npoly_4 =\npoly_5 =\n'

### Sorting algorithm
Currently, I'm seeing some patterns in the model results. It looks like definite sites have a list of features and scores which can be interpreted.

A good example is well pads. They have a pad score of 90 or more. Combine that with any tank battery scoring above 50 and the site will likely be a well pad, barring some cases.

Compressors will have a pad with 2+ compressors

Guarantee a well pad with >75 pad with any artificial lift

Processing center if theres 2+ compressors and 1+ separator

Upstream facility (well, gathering) if there is a tank battery OR a separator, plus a pad. (We won't be able to determine the specific type until we get well recognition.)

In [57]:
#class_names = ["tank battery", "well pad", "artificial lift", "separator", "compressor"]

def site_sort(site_dict):
    """Assign a coarse facility label to one site from its per-class scores.

    Parameters
    ----------
    site_dict : dict
        Maps a class name ('tank battery', 'well pad', 'artificial lift',
        'separator', 'compressor') to a sequence of numeric scores for that
        class. The low-confidence rule at the end only treats a class as
        "not detected" when its key is missing (or mapped to 0).
        NOTE(review): thresholds below are fractions (0.5, 0.85, ...), so
        scores are presumably confidences in [0, 1] — confirm with the caller.

    Returns
    -------
    list of str, str, or None
        * a list of labels when at least one rule fires,
        * the string "BAD" for a single low-score pad with nothing else,
        * None when the site cannot be categorized.
    """
    tag = []
    tank_val = site_dict.get('tank battery', 0)
    sep_val = site_dict.get('separator', 0)
    lift_val = site_dict.get('artificial lift', 0)
    comp_val = site_dict.get('compressor', 0)
    pad_val = site_dict.get('well pad', 0)

    # Cases 0-2 form ONE if/elif chain selected by which classes are present,
    # not by whether the thresholds pass: e.g. a site with both 'separator' and
    # 'compressor' keys never reaches Case 1 or Case 2 even if Case 0 fails.

    # Case 0: Processing Facility
    # NOTE(review): the notebook prose says "2+ compressors", but `> 2`
    # requires three or more — confirm which is intended.
    if 'separator' in site_dict and 'compressor' in site_dict:
        if any(x > 0.5 for x in site_dict['separator']) and len(site_dict['compressor']) > 2:
            tag.append("Processing facility ")

    # Case 1: Compressor Station
    elif 'well pad' in site_dict and 'compressor' in site_dict:
        if any(x > 0.5 for x in site_dict['well pad']) and len(site_dict['compressor']) > 1:
            tag.append("Compressor Station")

    # Case 2: Upstream Facility
    elif 'tank battery' in site_dict or 'separator' in site_dict:
        if 'well pad' in site_dict:
            # Tank batteries and separators produce many false positives, so a
            # high-scoring pad is required in both variants of the rule.
            strong_pad = any(x > 0.8 for x in site_dict['well pad'])
            # The key is 'tank battery' (with a space), matching every other
            # lookup; the previous 'tank_battery' check could never succeed.
            tank_hit = 'tank battery' in site_dict and any(
                x > 0.6 for x in site_dict['tank battery']
            )
            sep_hit = 'separator' in site_dict and any(
                x > 0.5 for x in site_dict['separator']
            )
            # Single append so a site with both a tank battery and a separator
            # is not labelled "Upstream Facility" twice.
            if strong_pad and (tank_hit or sep_hit):
                tag.append("Upstream Facility")

    # Case 3: Well Pad (a pump jack on something pad-like is a well pad)
    if 'artificial lift' in site_dict and 'well pad' in site_dict:
        if any(x > 0.5 for x in site_dict['artificial lift']) and any(x > 0.6 for x in site_dict['well pad']):
            tag.append("Well Pad with Artificial Lift")

    # Case 4: Fallback Well Pad or None. Only considered when the site does
    # not have both 'artificial lift' and 'well pad' keys (it is the elif of
    # Case 3) and nothing has been tagged so far.
    elif len(tag) == 0:
        if 'well pad' in site_dict:
            if any(x > 0.85 for x in site_dict['well pad']):
                tag.append("General Pad (exact identity unknown)")
            else:
                tag = None

    # A lone, low-score pad is almost never a valuable site: mark it "BAD" so
    # the caller can throw it out. Every other class must be absent to be sure.
    if tank_val == 0 and sep_val == 0 and comp_val == 0 and lift_val == 0:
        if 'well pad' in site_dict and len(pad_val) == 1:
            if all(x < 0.7 for x in site_dict['well pad']):
                print("Low Confidence Pad Dectected, likley not a site")
                tag = "BAD"

    if tag == []:
        print("Site cannot be catagorized")
        tag = None

    return tag





## Exporting
Below is a list of scripts that take our JSON file and convert it into formats usable by viewers.

In [44]:
import csv
import re
import ast

In [53]:
# Results file to export, resolved relative to the notebook's root folder.
# Paste the name of the saved file here. Note: despite the .json extension,
# load_custom_dict() parses it as a Python literal, not with a JSON parser.
file_path = os.path.join(root, "486 Eagleford 714 Test_2025-07-14_16-05-54.json") #paste name of JSON file

def strip_np_float64(text):
    """Unwrap every ``np.float64(<value>)`` in ``text`` to just ``<value>``.

    Only wrappers whose content has no parentheses are matched, so for nested
    wrappers a single call unwraps the innermost one only.
    Example: ``"np.float64(123.456)"`` becomes ``"123.456"``.
    """
    wrapper_pattern = r'np\.float64\(([^()]+)\)'

    def _inner_value(match):
        # Group 1 is the text between the parentheses.
        return match.group(1)

    return re.sub(wrapper_pattern, _inner_value, text)

def load_custom_dict(filepath):
    """Read a saved results file and parse it into a Python object.

    The file content is expected to be the text form of a Python literal,
    optionally wrapped in one pair of double quotes and possibly containing
    ``np.float64(...)`` wrappers.

    Parameters
    ----------
    filepath : str
        Path of the text file to read.

    Returns
    -------
    object
        Whatever ``ast.literal_eval`` produces for the cleaned text.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the cleaned text is not a
        valid Python literal.
    """
    with open(filepath, 'r') as handle:
        text = handle.read().strip()

    # Drop one pair of enclosing double quotes, if the whole payload is quoted.
    is_quoted = text[:1] == '"' and text[-1:] == '"'
    if is_quoted:
        text = text[1:-1]

    # np.float64(...) is not a literal, so unwrap it before the safe parse.
    return ast.literal_eval(strip_np_float64(text))

# Parse the saved results file once; the export cell below reads data_dict.
data_dict = load_custom_dict(file_path)

In [54]:
# Flatten the parsed site dictionary into a CSV: one row per (lat, lon) key.
data = data_dict

# Output file name
filename = "EF 200 Test.csv"

# Header row: the two coordinate columns first, then every attribute key found
# in the inner dictionaries, in first-seen order without duplicates.
fieldnames = list(dict.fromkeys(
    ['latitude', 'longitude']
    + [key for attributes in data.values() for key in attributes]
))

# Write CSV
with open(filename, mode='w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    # Attribute values are merged after the coordinates, so an attribute named
    # 'latitude' or 'longitude' would override the coordinate taken from the key.
    writer.writerows(
        {'latitude': lat, 'longitude': lon, **attributes}
        for (lat, lon), attributes in data.items()
    )

print(f"CSV written to {filename}")

CSV written to EF 200 Test.csv


KML File importing

In [None]:
import xml.etree.ElementTree as ET

def extract_kml_polygon_coords(filepath):
    """Collect the vertices of every polygon in a KML file.

    Parameters
    ----------
    filepath : str
        Path of the KML file to parse.

    Returns
    -------
    list of tuple(float, float)
        ``(lon, lat)`` pairs in document order. Vertices of all polygons are
        appended to the same flat list, so polygon boundaries are not kept.
        Note the order is longitude first, as stored in KML.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    ValueError
        If a coordinate token does not contain two numeric components.
    """
    tree = ET.parse(filepath)
    kml_root = tree.getroot()

    # KML uses namespaces; we must define them
    ns = {'kml': 'http://www.opengis.net/kml/2.2'}

    # Find all <coordinates> elements inside polygons
    coords = []
    for coord_tag in kml_root.findall('.//kml:Polygon//kml:coordinates', ns):
        # An empty <coordinates/> element has text None; treat it as having
        # no points instead of failing on None.strip().
        raw = coord_tag.text or ""
        # Each point is "lon,lat[,alt]" and points are separated by whitespace
        for token in raw.split():
            lon, lat = map(float, token.split(',')[:2])  # Ignore altitude if present
            coords.append((lon, lat))

    return coords

[(-97.73178684214577, 28.87748549626617), (-97.69376103536057, 28.90759227464033), (-97.62914833723072, 28.92292711105228), (-97.59450960413132, 28.9542944115274), (-97.57600515435394, 28.98947835115226), (-97.60520023844285, 29.02693342291385), (-97.61325172752822, 29.04305344782935), (-97.67062634340834, 29.07900027031357), (-97.71550413691881, 29.09239011428892), (-97.75390291902202, 29.10588023767992), (-97.76703427084135, 29.11109496668885), (-97.79330279096142, 29.05081135280534), (-97.81359428342346, 29.00120846817292), (-97.80862861530086, 28.95770880940933), (-97.81091538258264, 28.9343922586583), (-97.85455264326222, 28.90495864645925), (-97.87672726199224, 28.88480386867914), (-97.86157514672466, 28.8419900070367), (-97.81144822639831, 28.82714216766382), (-97.7561844924435, 28.84621963108358), (-97.73062716801826, 28.86613273808085), (-97.73178684214577, 28.87748549626617)]
[(-97.81802361108114, 28.95122558076191), (-97.86448084880845, 28.87971119491164), (-97.80308959867, 

## Next stage of deployment

In [None]:
# Next deployment area: a four-vertex polygon handed to the mapping run.
# NOTE(review): the meaning of the positional arguments (the False flag, the
# run name and the model file name) is not visible in this section — confirm
# against the definition of coord_master_test.
poly_T4 = [
    (29.175, -97.600),
    (29.175, -97.475),
    (29.051, -97.475),
    (29.051, -97.600),
]
coord_master_test(
    poly_T4,
    False,
    "486 Eagleford 714 Test 2",
    "tb_model_v3.pth",
)