<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Importing-the-Data" data-toc-modified-id="Importing-the-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Importing the Data</a></span><ul class="toc-item"><li><span><a href="#Metadata-File" data-toc-modified-id="Metadata-File-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span><code>Metadata</code> File</a></span></li><li><span><a href="#train_labels-File" data-toc-modified-id="train_labels-File-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span><code>train_labels</code> File</a></span></li><li><span><a href="#Prepping-the-data-for-the-Satellite-imagery-analysis." data-toc-modified-id="Prepping-the-data-for-the-Satellite-imagery-analysis.-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Prepping the data for the Satellite imagery analysis.</a></span></li><li><span><a href="#Setting-up-the-DataFrame" data-toc-modified-id="Setting-up-the-DataFrame-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Setting up the DataFrame</a></span></li></ul></li><li><span><a href="#Pulling-in-All-of-the-Data" data-toc-modified-id="Pulling-in-All-of-the-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pulling in All of the Data</a></span><ul class="toc-item"><li><span><a href="#Pulling-in-the-first-half-of-the-data." 
data-toc-modified-id="Pulling-in-the-first-half-of-the-data.-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Pulling in the first half of the data.</a></span></li><li><span><a href="#Pulling-in-the-second-half-of-the-data" data-toc-modified-id="Pulling-in-the-second-half-of-the-data-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Pulling in the second half of the data</a></span></li><li><span><a href="#Pulling-in-the-third-set-of-data" data-toc-modified-id="Pulling-in-the-third-set-of-data-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Pulling in the third set of data</a></span></li><li><span><a href="#Creating-a-Full-DataFrame" data-toc-modified-id="Creating-a-Full-DataFrame-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Creating a Full DataFrame</a></span></li></ul></li></ul></div>

Running main notebook

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import geopandas as gpd
from shapely.geometry import Point
import geopy.distance as distance

import planetary_computer as pc
from pystac_client import Client

from datetime import datetime
from datetime import timedelta

# from keras.utils import load_img, img_to_array
import requests
from PIL import Image
from io import BytesIO

from tqdm import tqdm
tqdm.pandas()

import rioxarray
import cv2
import odc.stac
import tempfile
import rasterio
import os

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import functions

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Can use this if I decide to use multiple satellite images
def get_sat_info(df, catalog=None):

    '''
    Search the STAC catalog for satellite images covering each sample in a dataframe.

    Parameters
    ----------
    df : DataFrame with 'uid', 'bbox', 'date_range', 'latitude' and 'longitude'
        columns. 'bbox' and 'date_range' are passed straight to the STAC search.
    catalog : optional STAC client exposing ``search(...)``. When omitted, a
        notebook-level ``catalog`` variable is used if one exists; otherwise a
        new Planetary Computer client is opened.

    Returns
    -------
    dict mapping each row's uid to a DataFrame with one row per image whose
    bounding box contains the sample point (an empty frame when nothing matched).
    '''

    # Previously `catalog` was only ever read as a global, which fails on a fresh
    # kernel. Keep honouring a notebook-level client, but no longer require one.
    if catalog is None:
        catalog = globals().get('catalog')
    if catalog is None:
        catalog = Client.open(
            "https://planetarycomputer.microsoft.com/api/stac/v1", modifier=pc.sign_inplace
        )

    # Fixed schema so that a search with zero hits still yields a frame that has
    # the columns used by the containment check below.
    image_columns = ['item', 'satelite_name', 'img_date', 'cloud_cover(%)', 'img_bbox',
                     'min_long', 'max_long', 'min_lat', 'max_lat']

    sat_dict = {}
    for index in range(len(df)):
        row = df.iloc[index]

        # Get all satellite images
        search = catalog.search(collections=["sentinel-2-l2a", "landsat-c2-l2"],
                                bbox=row['bbox'],
                                datetime=row['date_range'],
                                query={'eo:cloud_cover': {'lt':100}}
        )

        # One record per image returned for this sample
        pic_details = [
            {
                'item': pic,
                'satelite_name': pic.collection_id,
                'img_date': pic.datetime.date(),
                'cloud_cover(%)': pic.properties['eo:cloud_cover'],
                'img_bbox': pic.bbox,
                # STAC bbox order is [min_long, min_lat, max_long, max_lat]
                'min_long': pic.bbox[0],
                'max_long': pic.bbox[2],
                'min_lat': pic.bbox[1],
                'max_lat': pic.bbox[3],
            }
            for pic in search.item_collection()
        ]

        temp_df = pd.DataFrame(pic_details, columns=image_columns)

        # Check to make sure sample location is actually within sat image
        contains_point = (
            (temp_df.min_lat < row.latitude)
            & (temp_df.max_lat > row.latitude)
            & (temp_df.min_long < row.longitude)
            & (temp_df.max_long > row.longitude)
        ).astype(bool)
        temp_df['has_sample_point'] = contains_point

        sat_dict[row['uid']] = temp_df[contains_point]

    return sat_dict

In [3]:
def _best_image(candidates):
    '''
    Reduce a frame of candidate satellite images (at least one row) to a one-row frame.

    Selection rules:
      * a single candidate is used as-is;
      * if any Sentinel images exist, only those are considered;
      * among images with cloud cover <= 30%, the most recent image date wins;
      * if every image is cloudier than 30%, the least cloudy one wins.

    The helper 'index' and bounding-box columns are dropped from the result.
    '''
    helper_cols = ['index', 'min_long', 'max_long', 'min_lat', 'max_lat']

    if len(candidates) > 1:
        # first checking for any sentinel satellites
        sentinel = candidates[candidates['satelite_name'].str.contains('entinel')]
        if len(sentinel) > 0:
            candidates = sentinel

    # only one image (or only one sentinel image): nothing left to rank
    if len(candidates) == 1:
        return candidates.reset_index().drop(helper_cols, axis=1)

    low_cloud = candidates[candidates['cloud_cover(%)'] <= 30]
    if len(low_cloud) > 0:
        # take the image with the closest (latest) date
        ranked = low_cloud.sort_values('img_date', ascending=False)
    else:
        # only images with clouds over 30%: pick the one with the least clouds
        ranked = candidates.sort_values('cloud_cover(%)', ascending=True)

    ranked = ranked.reset_index().drop(helper_cols, axis=1)
    return pd.DataFrame(ranked.loc[0]).T


def _sample_with_image(sample, image, position):
    '''
    Turn one sample row (a Series) into a one-row frame indexed by `position`,
    joined with its chosen image frame when `image` is not None.
    '''
    frame = pd.DataFrame(sample).T.reset_index()
    leftovers = ['index']
    if image is not None:
        frame = frame.join(image, how='outer')
        # 'level_0' comes from the second reset_index applied to the image frame
        leftovers = ['level_0', 'index']
    return frame.set_index(pd.Series(position)).drop(leftovers, axis=1)


def pick_best_sat(df, sat_dict):

    '''
    Pick the best satellite image for every sample.

    Parameters
    ----------
    df : DataFrame of samples with a 'uid' column.
    sat_dict : dict mapping uid -> DataFrame of candidate images, as produced by
        get_sat_info.

    Returns
    -------
    DataFrame with one row per sample, indexed 0..len(df)-1. Samples with a usable
    image carry that image's columns; samples without any image keep only their
    own columns (the image columns are NaN after concatenation).

    Prints how many samples were processed and how many had no image.
    '''

    frames = []
    invalid_sats = 0
    for position in range(len(df)):
        row = df.iloc[position]
        candidates = sat_dict[row['uid']].reset_index()

        if len(candidates) == 0:
            # no satellite images for this sample
            invalid_sats += 1
            chosen = None
        else:
            chosen = _best_image(candidates)

        frames.append(_sample_with_image(row, chosen, position))

    # Concatenate once instead of growing the frame inside the loop.
    best_sat_df = pd.concat(frames) if frames else pd.DataFrame()

    print(f'{len(df)} attempts. {invalid_sats} failures.')
    return best_sat_df

In [4]:
def get_arrays_from_sats(df):
    
    
    '''
    Download a small crop around each sample and return the pixel arrays.

    Parameters
    ----------
    df : DataFrame with 'uid', 'satelite_name', 'item' and 'tiny_crop_bbox'
        columns (one chosen satellite image per row).

    Returns
    -------
    dict mapping uid -> array transposed so that the colour band is the last
    axis. Rows whose image could not be loaded are left out of the dict.

    Prints the number of rows attempted and the number that failed.
    '''

    # Now to get images from the satellites
    array_dict = {}
    # NOTE(review): MinMaxScaler3D lives in functions.py; presumably rescales to 0-255 — confirm there.
    scaler = functions.MinMaxScaler3D(feature_range=(0,255))
    error_count = 0
    attempt_count = 0
    for index in range(len(df)):
        row = df.iloc[index]
        attempt_count += 1

        try:
            # Setting tiny crop box for image
            minx, miny, maxx, maxy = row['tiny_crop_bbox']

            # checking to see which satellite it came from
            if 'sentinel' in row['satelite_name']:
                # getting the image
                image = rioxarray.open_rasterio(pc.sign(row['item'].assets["visual"].href)).rio.clip_box(
                        minx=minx,
                        miny=miny,
                        maxx=maxx,
                        maxy=maxy,
                        crs="EPSG:4326",
                    )

                image_array = image.to_numpy()
                img_array_trans = np.transpose(image_array, axes=[1, 2, 0])
                # storing array of image in dictionary
                array_dict[row['uid']] = img_array_trans

            else:
                # getting the image from the LandSat satellite
                image = odc.stac.stac_load(
                        [pc.sign(row['item'])], bands=["red", "green", "blue"], bbox=[minx, miny, maxx, maxy]
                    ).isel(time=0)

                image_array = image[["red", "green", "blue"]].to_array()
                img_array_trans = np.transpose(image_array.to_numpy(), axes=[1, 2, 0])
                # scaling the image so its the same scale as the sentinel ones
                scaled_img = scaler.fit_transform(img_array_trans)
                # storing array of image in dictionary
                array_dict[row['uid']] = scaled_img

        # Best-effort per row: any load/crop error just counts as a failure.
        # `Exception` (not a bare except) so that KeyboardInterrupt still stops
        # the loop instead of being counted as one more failed row.
        except Exception:
            error_count += 1

    print(f'{attempt_count} attempted. {error_count} failures.')
    return array_dict

In [5]:
def get_features(df, img_arrays):
    '''
    Build per-colour summary features from image arrays and merge them onto df.

    Parameters
    ----------
    df : DataFrame with a 'uid' column.
    img_arrays : dict mapping uid -> array indexed as [row, col, band], where
        bands 0, 1, 2 are treated as red, green, blue.

    Returns
    -------
    df outer-merged on 'uid' with the columns '<color>_mean', '_median', '_max',
    '_min', '_sum' and '_product'. Samples without an array get NaN features;
    an empty img_arrays yields all-NaN feature columns.
    '''
    colors = ['red', 'green', 'blue']
    stats = ['mean', 'median', 'max', 'min', 'sum', 'product']
    feature_columns = ['uid'] + [f'{color}_{stat}' for color in colors for stat in stats]

    records = []
    for key, temp_array in img_arrays.items():
        feature_dict = {'uid': key}
        for n, color in enumerate(colors):
            band = temp_array[:, :, n]
            feature_dict[f'{color}_mean'] = np.mean(band)
            feature_dict[f'{color}_median'] = np.median(band)
            feature_dict[f'{color}_max'] = np.max(band)
            feature_dict[f'{color}_min'] = np.min(band)
            feature_dict[f'{color}_sum'] = np.sum(band)
            # NOTE(review): on integer pixel arrays this product can overflow and
            # wrap silently; left unchanged so values match previously saved runs.
            feature_dict[f'{color}_product'] = np.prod(band)
        records.append(feature_dict)

    # Explicit columns keep 'uid' present even when no image array was loaded,
    # so the merge below cannot fail on a missing key.
    feature_df = pd.DataFrame(records, columns=feature_columns)

    feature_df = df.merge(feature_df, how='outer', on='uid')
    return feature_df

In [6]:
# A function to get it all in one
def get_sat_to_features(df):
    
    '''
    Run the whole pipeline on a dataframe of raw samples: search for satellite
    images, pick the best image per sample, download the cropped pixel arrays
    and turn them into summary features.

    Returns the feature dataframe produced by get_features.
    '''
    # get_sat_info looks `catalog` up as a module-level name, so the client has
    # to be bound globally here; as a plain local it would be invisible to that
    # function and the search would fail with a NameError on a fresh kernel.
    global catalog
    catalog = Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1", modifier=pc.sign_inplace
    )
    
    # get sat info
    satelite_dict = get_sat_info(df)
    
    # pick best sat
    single_df = pick_best_sat(df, satelite_dict)
    
    # get image arrays from best sats
    img_arrays = get_arrays_from_sats(single_df)
    
    # get a dataframe with relevant features
    feature_df = get_features(single_df, img_arrays)
    
    return feature_df

In [7]:
def clean_data(df):
    '''
    Select the modelling columns, drop incomplete rows and convert everything
    to numeric form.

    Parameters
    ----------
    df : DataFrame containing 'date', 'img_date', 'latitude', 'longitude',
        'season', 'severity' and the eighteen '<color>_<stat>' feature columns.

    Returns
    -------
    DataFrame where 'date' and 'img_date' are proleptic-Gregorian ordinals,
    'days_from_sat_to_sample' is the whole-day gap between them, latitude and
    longitude are floats, and 'season' is replaced by one 0.0/1.0 column per
    season named 'season_<value>'.
    '''
    # only keeping cols that I need
    keep_cols = ['date', 'latitude', 'longitude', 'season', 'img_date',
                 'red_mean', 'red_median', 'red_max', 'red_min', 'red_sum',
                 'red_product', 'green_mean', 'green_median', 'green_max',
                 'green_min', 'green_sum', 'green_product', 'blue_mean',
                 'blue_median', 'blue_max', 'blue_min', 'blue_sum', 'blue_product', 'severity']
    # dropping nulls
    model_df = df[keep_cols].dropna().copy()

    # Work in datetime64 so the `.dt` accessor is always available, even when the
    # columns arrive as object dtype holding date / datetime / Timestamp values.
    # normalize() drops any time-of-day, matching the previous date-only logic.
    sample_dates = pd.to_datetime(model_df['date']).dt.normalize()
    image_dates = pd.to_datetime(model_df['img_date']).dt.normalize()

    # getting difference from image date to sample date (whole days) as a feature
    model_df['days_from_sat_to_sample'] = (sample_dates - image_dates).dt.days
    # converting from datetime to an int
    model_df['date'] = sample_dates.map(lambda x: x.toordinal())
    model_df['img_date'] = image_dates.map(lambda x: x.toordinal())

    # Column-level cast: handles strings, Python floats and numpy floats alike
    # (the previous element-wise `x.astype` only worked for numpy scalars).
    model_df['latitude'] = model_df['latitude'].astype(float)
    model_df['longitude'] = model_df['longitude'].astype(float)

    # One hot encoding seasons. pandas is used instead of
    # OneHotEncoder(sparse=False), whose `sparse` argument no longer exists in
    # recent scikit-learn releases; column names and float dtype are kept.
    seasons = pd.get_dummies(model_df['season'], prefix='season', dtype=float)
    model_ohe = pd.concat([model_df.drop('season', axis=1), seasons], axis=1)
    return model_ohe

# Importing the Data

## `Metadata` File

In [8]:
# Reading in the data and bringing in date as datetime dtype
metadata = pd.read_csv('Data/metadata.csv', parse_dates=['date'])

## `train_labels` File

In [13]:
train_labels = pd.read_csv('Data/train_labels.csv')

## Prepping the data for the Satellite imagery analysis.

In [55]:
sat_df = metadata.reset_index()

In [56]:
sat_df['split'].value_counts()

train    17060
test      6510
Name: split, dtype: int64

In [57]:
sat_train = sat_df[sat_df['split'] == 'train'].copy()
sat_test = sat_df[sat_df['split'] == 'test'].copy()

Bringing back in the labels for sat_train.

In [None]:
# pandas merge defaults to an inner join, so only uids present in both frames are kept.
# NOTE(review): this overwrites sat_train, so running the cell twice merges the labels twice.
sat_train = sat_train.merge(train_labels, on='uid')

## Setting up the DataFrame

Here I use a custom function to add a date range that the satellite image search can interpret, and to add bounding boxes that are used later to crop the images.

In [62]:
# The return value is discarded (trailing `;`), so the new columns are expected to be
# written onto sat_train itself; later outputs show date_range and the *_bbox columns.
# NOTE(review): the units of dist / *_crop_dist are defined in functions.py; confirm there.
functions.get_important_info(sat_train, dist=31, big_crop_dist=3000, small_crop_dist=500, tiny_crop_dist=100);

# Pulling in All of the Data

Because the API is sometimes unstable, I will pull the data in two large batches, each made up of several smaller batches. The cells below split the data into those batches.

In [122]:
# Start offsets of each 853-row batch (0, 853, 1706, ...). With the 17060 training rows
# counted earlier this gives 20 equal batches, assuming the label merge kept every row.
all_train = list(np.arange(0, len(sat_train), 853))

In [123]:
# Batch boundaries for the first half: the first half of the start offsets.
first_half = all_train[:len(all_train) // 2]

# Pair each boundary with the next one; batch names are numbered from 1.
first_half_dict = {}
for batch, (start, stop) in enumerate(zip(first_half, first_half[1:]), start=1):
    first_half_dict[f'sat_train_{batch}'] = sat_train[start:stop]

first_half_dict.keys()

dict_keys(['sat_train_1', 'sat_train_2', 'sat_train_3', 'sat_train_4', 'sat_train_5', 'sat_train_6', 'sat_train_7', 'sat_train_8', 'sat_train_9'])

In [124]:
first_half_dict['sat_train_9'].head()

Unnamed: 0,date,uid,latitude,longitude,split,season,geometry,region,severity,density,date_range,bbox,big_crop_bbox,small_crop_bbox,tiny_crop_bbox
6824,2016-08-15,alqs,39.21067,-96.97298,train,summer,POINT (-96.97298 39.21067),midwest,3,722295.0,2016-07-31/2016-08-15,"[-97.55060654290345, 38.761275165744365, -96.3...","[-97.00771470990246, 39.1836476583244, -96.938...","[-96.97876911859264, 39.206166285116375, -96.9...","[-96.97413782372004, 39.20976925730261, -96.97..."
6825,2016-08-15,ranz,41.550878,-86.361626,train,summer,POINT (-86.36163 41.55088),midwest,3,907606.0,2016-07-31/2016-08-15,"[-86.95959314901351, 41.101665147612685, -85.7...","[-86.39758397335977, 41.523866492260474, -86.3...","[-86.36761901742425, 41.54637598255136, -86.35...","[-86.36282462415333, 41.549977492793204, -86.3..."
6826,2016-08-15,danb,40.702492,-85.935575,train,summer,POINT (-85.93558 40.70249),midwest,3,435819.0,2016-07-31/2016-08-15,"[-86.52589000317576, 40.25321251336449, -85.34...","[-85.97107276486848, 40.675476171499525, -85.9...","[-85.94149129445773, 40.697988987622125, -85.9...","[-85.93675825889326, 40.70159103002851, -85.93..."
6827,2016-08-15,zegx,37.610161,-97.624344,train,summer,POINT (-97.62434 37.61016),midwest,3,327600.0,2016-07-31/2016-08-15,"[-98.18936911934287, 37.160643680069484, -97.0...","[-97.65832135301534, 37.5831315997638, -97.590...","[-97.63000730074292, 37.60565639191241, -97.61...","[-97.6254770521499, 37.609260350658296, -97.62..."
6828,2016-08-15,ffia,39.09447,-96.90462,train,summer,POINT (-96.90462 39.09447),midwest,2,56700.0,2016-07-31/2016-08-15,"[-97.48129742466935, 38.64506619456678, -96.32...","[-96.93929763224098, 39.067447118042836, -96.8...","[-96.91039960564633, 39.08996619506234, -96.89...","[-96.90577592113075, 39.09356923929158, -96.90..."


In [125]:
# Close the last batch at the end of the data. Guarded so that re-running this
# cell does not append a duplicate boundary, which would add an empty extra batch.
if not all_train or all_train[-1] != len(sat_train):
    all_train.append(len(sat_train))
# Boundaries 0-9 delimit the first-half batches; the second half starts at boundary 9.
second_half = all_train[9:]

# Batch numbering continues from the first half, hence the offsets of 9 and 10.
second_half_dict = {}
for batch in range(10, len(second_half)+9):
    second_half_dict[f'sat_train_{batch}'] = sat_train[second_half[batch-10]:second_half[batch-9]]
    
second_half_dict.keys()

dict_keys(['sat_train_10', 'sat_train_11', 'sat_train_12', 'sat_train_13', 'sat_train_14', 'sat_train_15', 'sat_train_16', 'sat_train_17', 'sat_train_18', 'sat_train_19', 'sat_train_20'])

In [126]:
second_half_dict['sat_train_20'].tail()

Unnamed: 0,date,uid,latitude,longitude,split,season,geometry,region,severity,density,date_range,bbox,big_crop_bbox,small_crop_bbox,tiny_crop_bbox
17055,2021-12-14,mdcu,35.98,-78.843884,train,winter,POINT (-78.84388 35.98000),south,2,34492.0,2021-11-29/2021-12-14,"[-79.3970618475286, 35.530359136482794, -78.29...","[-78.87714860225577, 35.95296283511968, -78.81...","[-78.84942825259974, 35.97549381433137, -78.83...","[-78.84499299645525, 35.97909876313759, -78.84..."
17056,2021-12-14,bnne,36.03,-78.706678,train,winter,POINT (-78.70668 36.03000),south,2,33312.0,2021-11-29/2021-12-14,"[-79.2602050430223, 35.58036288301211, -78.153...","[-78.73996346358412, 36.00296306094821, -78.67...","[-78.71222560689303, 36.025493851974176, -78.7...","[-78.70778754962208, 36.02909877066631, -78.70..."
17057,2021-12-14,zoaj,36.06,-78.76,train,winter,POINT (-78.76000 36.06000),south,2,48233.0,2021-11-29/2021-12-14,"[-79.31373702815989, 35.610365131969274, -78.2...","[-78.79329805829768, 36.03296319650643, -78.72...","[-78.76554967659342, 36.05549387457003, -78.75...","[-78.76110993531984, 36.059098775185575, -78.7..."
17058,2021-12-14,ngih,36.03,-78.705684,train,winter,POINT (-78.70568 36.03000),south,1,1452.0,2021-11-29/2021-12-14,"[-79.25921079557939, 35.58036288301211, -78.15...","[-78.73896921614121, 36.00296306094821, -78.67...","[-78.71123135945012, 36.025493851974176, -78.7...","[-78.70679330217918, 36.02909877066631, -78.70..."
17059,2021-12-14,ngxt,35.98,-78.844133,train,winter,POINT (-78.84413 35.98000),south,1,5187.0,2021-11-29/2021-12-14,"[-79.3973104093893, 35.530359136482794, -78.29...","[-78.87739716411647, 35.95296283511968, -78.81...","[-78.84967681446044, 35.97549381433137, -78.83...","[-78.84524155831595, 35.97909876313759, -78.84..."


## Pulling in the first half of the data.

In [128]:
# commented out due to having completed and pickled the results
# first_half_key_list = list(first_half_dict.keys())

In [129]:
# commented out due to having completed and pickled the results


# first_half_results_dict = {}
# for n, key in enumerate(first_half_key_list):
#     first_half_results_dict[key] = get_sat_to_features(first_half_dict[key])
#     print(f"{key} has finished loading.")

## Pulling in the second half of the data

In [130]:
second_half_key_list = list(second_half_dict.keys())

In [131]:
# second_half_results_dict = {}
# for key in second_half_key_list:
#     second_half_results_dict[key] = get_sat_to_features(second_half_dict[key])
#     print(f"{key} has finished loading.")

In [132]:
# second_half_results_dict.keys()

## Pulling in the third set of data

The API is incredibly finicky. Only sat_train_10 through sat_train_13 were pulled successfully, so I am running another pull for the remaining batches (sat_train_14 onward).

In [134]:
# third_pull_key_list = second_half_key_list[4:]

In [135]:
# third_pull_results_dict = {}
# for key in third_pull_key_list:
#     third_pull_results_dict[key] = get_sat_to_features(second_half_dict[key])
#     print(f"{key} has finished loading.")

853 attempts. 0 failures.
853 attempted. 23 failures.
sat_train_14 has finished loading.
853 attempts. 0 failures.
853 attempted. 26 failures.
sat_train_15 has finished loading.
853 attempts. 0 failures.
853 attempted. 2 failures.
sat_train_16 has finished loading.
853 attempts. 0 failures.
853 attempted. 10 failures.
sat_train_17 has finished loading.
853 attempts. 0 failures.
853 attempted. 4 failures.
sat_train_18 has finished loading.
853 attempts. 0 failures.
853 attempted. 6 failures.
sat_train_19 has finished loading.
853 attempts. 0 failures.


Aborting load due to failure while reading: https://landsateuwest.blob.core.windows.net/landsat-c2/level-2/standard/etm/2021/015/032/LE07_L2SP_015032_20211028_20211123_02_T1/LE07_L2SP_015032_20211028_20211123_02_T1_SR_B3.TIF?st=2023-02-02T21%3A23%3A32Z&se=2023-02-03T22%3A08%3A32Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-02-03T20%3A37%3A34Z&ske=2023-02-10T20%3A37%3A34Z&sks=b&skv=2021-06-08&sig=xXuy5bwFMA17LpgI6bZP2eEJ%2BpGSeIiPsyQ3%2BknRn2w%3D:1
Aborting load due to failure while reading: https://landsateuwest.blob.core.windows.net/landsat-c2/level-2/standard/etm/2021/015/032/LE07_L2SP_015032_20211028_20211123_02_T1/LE07_L2SP_015032_20211028_20211123_02_T1_SR_B3.TIF?st=2023-02-02T21%3A23%3A32Z&se=2023-02-03T22%3A08%3A32Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-02-03T20%3A37%3A34Z&ske=2023-02-10T20%3A37%3A34Z&sks=b&skv=2021-06-08&s

Aborting load due to failure while reading: https://landsateuwest.blob.core.windows.net/landsat-c2/level-2/standard/oli-tirs/2021/016/035/LC08_L2SP_016035_20211027_20211104_02_T1/LC08_L2SP_016035_20211027_20211104_02_T1_SR_B4.TIF?st=2023-02-02T21%3A23%3A32Z&se=2023-02-03T22%3A08%3A32Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-02-03T20%3A37%3A34Z&ske=2023-02-10T20%3A37%3A34Z&sks=b&skv=2021-06-08&sig=xXuy5bwFMA17LpgI6bZP2eEJ%2BpGSeIiPsyQ3%2BknRn2w%3D:1
Aborting load due to failure while reading: https://landsateuwest.blob.core.windows.net/landsat-c2/level-2/standard/oli-tirs/2021/016/035/LC08_L2SP_016035_20211027_20211104_02_T1/LC08_L2SP_016035_20211027_20211104_02_T1_SR_B4.TIF?st=2023-02-02T21%3A23%3A32Z&se=2023-02-03T22%3A08%3A32Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-02-03T20%3A37%3A34Z&ske=2023-02-10T20%3A37%3A34Z&sks=b&skv=20

Aborting load due to failure while reading: https://landsateuwest.blob.core.windows.net/landsat-c2/level-2/standard/oli-tirs/2021/016/035/LC08_L2SP_016035_20211027_20211104_02_T1/LC08_L2SP_016035_20211027_20211104_02_T1_SR_B4.TIF?st=2023-02-02T21%3A23%3A32Z&se=2023-02-03T22%3A08%3A32Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-02-03T20%3A37%3A34Z&ske=2023-02-10T20%3A37%3A34Z&sks=b&skv=2021-06-08&sig=xXuy5bwFMA17LpgI6bZP2eEJ%2BpGSeIiPsyQ3%2BknRn2w%3D:1
Aborting load due to failure while reading: https://landsateuwest.blob.core.windows.net/landsat-c2/level-2/standard/oli-tirs/2021/016/035/LC08_L2SP_016035_20211027_20211104_02_T1/LC08_L2SP_016035_20211027_20211104_02_T1_SR_B4.TIF?st=2023-02-02T21%3A23%3A32Z&se=2023-02-03T22%3A08%3A32Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-02-03T20%3A37%3A34Z&ske=2023-02-10T20%3A37%3A34Z&sks=b&skv=20

KeyboardInterrupt: 

Exception ignored in: 'rasterio._env.log_error'
Traceback (most recent call last):
  File "C:\Users\nacnu\anaconda3\envs\capstone\lib\logging\__init__.py", line 1467, in info
    def info(self, msg, *args, **kwargs):
KeyboardInterrupt: 
Aborting load due to failure while reading: https://landsateuwest.blob.core.windows.net/landsat-c2/level-2/standard/oli-tirs/2021/014/032/LC09_L2SP_014032_20211109_20220119_02_T1/LC09_L2SP_014032_20211109_20220119_02_T1_SR_B4.TIF?st=2023-02-02T21%3A23%3A32Z&se=2023-02-03T22%3A08%3A32Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-02-03T20%3A37%3A34Z&ske=2023-02-10T20%3A37%3A34Z&sks=b&skv=2021-06-08&sig=xXuy5bwFMA17LpgI6bZP2eEJ%2BpGSeIiPsyQ3%2BknRn2w%3D:1


853 attempted. 317 failures.
sat_train_20 has finished loading.


## Creating a Full DataFrame

Now that I have the data in dictionaries, I will concatenate it all into one complete dataframe.

In [136]:
# commented out due to having completed and pickled the results
# First pull to DataFrame
# first_pull_df = pd.concat(first_half_results_dict.values())

In [137]:
# Second pull to DataFrame
# second_pull_df = pd.concat(second_half_results_dict.values())

In [143]:
# Third pull to DataFrame
# third_pull_df = pd.concat(third_pull_results_dict.values())

I also need to store the pulled data as a .pkl file.

In [138]:
# first_pull_df.to_pickle('./first_7677_rows.pkl')

In [148]:
# first_pull_df = pd.read_pickle('./first_7677_rows.pkl')

In [140]:
# second_pull_df.to_pickle('./second_set_rows.pkl')

In [146]:
# second_pull_df = pd.read_pickle('./second_set_rows.pkl')

In [145]:
# third_pull_df.to_pickle('./third_set_rows.pkl')

In [None]:
# third_pull_df = pd.read_pickle('./third_set_rows.pkl')

In [149]:
# full_df = pd.concat([first_pull_df, second_pull_df, third_pull_df])

In [151]:
# full_df.to_pickle('./full_df.pkl')

In [114]:
# NOTE(review): the (commented-out) save above writes './full_df.pkl' but this reads
# '../full_df.pkl'; confirm which location holds the current file.
# Only load pickles you created yourself: unpickling an untrusted file can execute code.
new_full = pd.read_pickle('../full_df.pkl')