## Dependencies

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import os, zipfile
import shutil
import glob

from pymongo import MongoClient
import time

## Extract data

In [2]:
# Running this cell will unzip the data files in the Resources folder.
extension = ".zip"
extracted_dir_name = "."

# Get the current working directory.
# Need to be in the root directory of this project for this to work.
cwd_dir_name = os.getcwd()
print(f"The current working directory is {cwd_dir_name}.")

os.chdir("Resources") # change directory from working dir to dir with the zip file(s) .
# This should be the "Resources" folder.
dir_name = os.getcwd()
print(f"You are now in the following directory: {dir_name}.")

for item in os.listdir(dir_name): # loop through the items in the directory.
    if item.endswith(extension): # check for ".zip" extension"
        try:
            file_name = os.path.abspath(item) # get full path of files
            zip_ref = zipfile.ZipFile(file_name) # create zipfile object
            unzipped_directory = os.path.join(extracted_dir_name) # reference to the directory where the zip files will be extracted.
            zip_ref.extractall(unzipped_directory) # extract file to dir
            zip_ref.close() # close file
            print(f"Successfully unzipped {item} into the following folder:{dir_name}.")
        except:
            print(f"Error trying to unzip data file(s).")
            print(f"Make sure that the files are closed and you have the correct file/folder permissions.")
            
# Go up one directory into the project root directory.
os.chdir(os.path.normpath(os.getcwd() + os.sep + os.pardir))
print(os.path.normpath(os.getcwd() + os.sep + os.pardir))

The current working directory is C:\Users\phili\Desktop\flask-heroku-deploy\pet_pals.
You are now in the following directory: C:\Users\phili\Desktop\flask-heroku-deploy\pet_pals\Resources.
Successfully unzipped fires-from-space-australia-and-new-zeland.zip into the following folder:C:\Users\phili\Desktop\flask-heroku-deploy\pet_pals\Resources.
C:\Users\phili\Desktop\flask-heroku-deploy


## Import csv files and read into pandas dataframes

In [3]:
# Path to the csv files.
path_to_csvs = os.path.join(".", "Resources")
all_files = glob.glob(os.path.join(path_to_csvs, "*.csv"))

df_from_each_file = []

for f in all_files:
    filename = os.path.basename(f)
    df = pd.read_csv(f, encoding ="ISO-8859-1")
    df_from_each_file.append(df)

# Concantenated dataframe
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df

Unnamed: 0,acq_date,acq_time,bright_t31,bright_ti4,bright_ti5,brightness,confidence,daynight,frp,instrument,latitude,longitude,satellite,scan,track,type,version
0,2019-08-01,56,297.3,,,313.0,48,D,6.6,MODIS,-11.80700,142.05830,Terra,1.00,1.00,0.0,6.3
1,2019-08-01,56,297.3,,,319.3,71,D,11.3,MODIS,-11.79240,142.08500,Terra,1.00,1.00,0.0,6.3
2,2019-08-01,57,298.7,,,311.6,42,D,23.1,MODIS,-12.83980,132.87440,Terra,3.10,1.70,0.0,6.3
3,2019-08-01,57,296.1,,,310.1,33,D,6.5,MODIS,-14.43060,143.30350,Terra,1.10,1.10,0.0,6.3
4,2019-08-01,57,298.8,,,310.3,36,D,27.6,MODIS,-12.49530,131.48970,Terra,4.00,1.90,0.0,6.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1360634,2020-01-11,1630,,306.9,288.6,,n,N,0.8,VIIRS,-32.66628,122.15253,N,0.39,0.44,,1.0NRT
1360635,2020-01-11,1630,,299.0,287.4,,n,N,0.7,VIIRS,-32.58616,123.39582,N,0.51,0.41,,1.0NRT
1360636,2020-01-11,1630,,309.7,291.7,,n,N,1.0,VIIRS,-33.37853,115.94735,N,0.40,0.60,,1.0NRT
1360637,2020-01-11,1806,,300.2,290.2,,n,N,1.1,VIIRS,-33.21827,115.75078,N,0.49,0.65,,1.0NRT


In [4]:
# Remove unneccessary columns
del concatenated_df["bright_t31"]
del concatenated_df["bright_ti4"]
del concatenated_df["bright_ti5"]
del concatenated_df["confidence"]
del concatenated_df["scan"]
del concatenated_df["track"]
del concatenated_df["version"]
del concatenated_df["type"]

In [5]:
concatenated_df

Unnamed: 0,acq_date,acq_time,brightness,daynight,frp,instrument,latitude,longitude,satellite
0,2019-08-01,56,313.0,D,6.6,MODIS,-11.80700,142.05830,Terra
1,2019-08-01,56,319.3,D,11.3,MODIS,-11.79240,142.08500,Terra
2,2019-08-01,57,311.6,D,23.1,MODIS,-12.83980,132.87440,Terra
3,2019-08-01,57,310.1,D,6.5,MODIS,-14.43060,143.30350,Terra
4,2019-08-01,57,310.3,D,27.6,MODIS,-12.49530,131.48970,Terra
...,...,...,...,...,...,...,...,...,...
1360634,2020-01-11,1630,,N,0.8,VIIRS,-32.66628,122.15253,N
1360635,2020-01-11,1630,,N,0.7,VIIRS,-32.58616,123.39582,N
1360636,2020-01-11,1630,,N,1.0,VIIRS,-33.37853,115.94735,N
1360637,2020-01-11,1806,,N,1.1,VIIRS,-33.21827,115.75078,N


In [6]:
# Verify counts
concatenated_df.count()

acq_date      1360639
acq_time      1360639
brightness     219604
daynight      1175861
frp           1360639
instrument    1360639
latitude      1360639
longitude     1360639
satellite     1360639
dtype: int64

In [7]:
# Drop nas/null values
concatenated_df = concatenated_df.dropna(how="any")

concatenated_df.count()

acq_date      219604
acq_time      219604
brightness    219604
daynight      219604
frp           219604
instrument    219604
latitude      219604
longitude     219604
satellite     219604
dtype: int64

In [8]:
# Check data types
concatenated_df.dtypes

acq_date       object
acq_time        int64
brightness    float64
daynight       object
frp           float64
instrument     object
latitude      float64
longitude     float64
satellite      object
dtype: object

In [9]:
# Convert dataframe to dictionary.
fires_from_space = concatenated_df.to_dict('range')

fires_from_space

[{'acq_date': '2019-08-01',
  'acq_time': 56,
  'brightness': 313.0,
  'daynight': 'D',
  'frp': 6.6,
  'instrument': 'MODIS',
  'latitude': -11.807,
  'longitude': 142.0583,
  'satellite': 'Terra'},
 {'acq_date': '2019-08-01',
  'acq_time': 56,
  'brightness': 319.3,
  'daynight': 'D',
  'frp': 11.3,
  'instrument': 'MODIS',
  'latitude': -11.7924,
  'longitude': 142.085,
  'satellite': 'Terra'},
 {'acq_date': '2019-08-01',
  'acq_time': 57,
  'brightness': 311.6,
  'daynight': 'D',
  'frp': 23.1,
  'instrument': 'MODIS',
  'latitude': -12.8398,
  'longitude': 132.8744,
  'satellite': 'Terra'},
 {'acq_date': '2019-08-01',
  'acq_time': 57,
  'brightness': 310.1,
  'daynight': 'D',
  'frp': 6.5,
  'instrument': 'MODIS',
  'latitude': -14.4306,
  'longitude': 143.3035,
  'satellite': 'Terra'},
 {'acq_date': '2019-08-01',
  'acq_time': 57,
  'brightness': 310.3,
  'daynight': 'D',
  'frp': 27.6,
  'instrument': 'MODIS',
  'latitude': -12.4953,
  'longitude': 131.4897,
  'satellite': 'Ter

## Load into database

In [None]:
client = MongoClient('mongodb://localhost:27017/')

In [None]:
db = client['test']

In [None]:
collection = db.fires_from_space

In [None]:
collection.insert_many(fires_from_space)

# Loop through the list of fires and insert into database in chunks.
# print("Beginning load into database.")
# print("--------------------------------------------")
# count = 0
# set = 1
# for idx, fire in enumerate(fires_from_space):
#     count = count + 1
#     # To avoid inserting so many at once and slowing down the database, insert fire data 50 at a time.
#     # then sleep for 5 seconds, and then continue with next 50 fires and so on...
#     if count == 51:
#         count = 1
#         set = set + 1
#         time.sleep(5)
#     print(f"Processing Record {count} of Set {set}")
#     try:
#         collection.insert_one(fire)
#     except:
#         print("Unable to insert fire data into database. Skipping...")
        
# print("------------------------------------------------")
# print("Loading data into database is complete")
# print("------------------------------------------------")