## Database and Jupyter Notebook Set Up

In [1]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint
import json
import os

In [2]:
# Connect to MongoDB
client = MongoClient(port=27017)
db = client['national_parks_db']

In [3]:
# Drop any previously existing 'national_parks_db' database 
client.drop_database('national_parks_db') 

print("Database 'national_parks_db' has been deleted.")

Database 'national_parks_db' has been deleted.


In [4]:
# Connect to MongoDB and create database
client = MongoClient(port=27017)
db = client['national_parks_db']

In [5]:
# Function to load JSON data into MongoDB from the transformed file
with open('NPS_Cleaned_Data.json', 'r') as file:
    data = json.load(file)

In [6]:
# Create a collection named 'national_parks_collection' and insert the data
collection = db["national_parks_collection"]
collection.insert_many(data)

print("JSON data imported into the national_parks_db database successfully.")

JSON data imported into the national_parks_db database successfully.


In [7]:
# List all collections in the database
collections = db.list_collection_names()
print("Collections in the database:")
for collection in collections:
    print(collection)


Collections in the database:
national_parks_collection


In [8]:
# review a document in the national parks collection
db.national_parks_collection.find_one()

{'_id': ObjectId('669c9b16be1e7c26f8d9eccf'),
 'Park Designation': 'National Park',
 'Park Code': 'badl',
 'Park Name': 'Badlands National Park',
 'Present in States': 'SD',
 'Free Entrance': False,
 'Entrance Fee': 355.0,
 'Park Website': 'http://www.nps.gov/badl/',
 'Activity Option': 'Astronomy',
 'Available Amenity': 'ATM/Cash Machine',
 'Park Latitude': 43.68584846,
 'Park Longitude': -102.482942,
 'Entrance Fees': '$355.00'}

In [10]:
# Find all documents that don't have the designation of "National Park" in the national parks collection
non_national_park_docs = db['national_parks_collection'].find({'park_designation': {'$ne': 'National Park'}})

# Store the IDs of the deleted documents from the national parks collection
non_national_park_ids = [doc['_id'] for doc in non_national_park_docs]

# Delete the documents from the national parks collection
results = db['national parks collection'].delete_many({'park_designation': {'$ne': 'National Park'}})
print(f"Deleted {results.deleted_count} documents from the national_parks_collection.")

Deleted 0 documents from the national_parks_collection.


In [11]:
# Count the documents with the designation of "National Park" present in all three collections: amenities, amenities_places, and parks_data
count_nat_parks = db['national parks collection'].count_documents({})
count_parks_data = db['national parks collection'].count_documents({'park_designation': 'National Park'})

print(f"Number of documents in the national parks collection: {count_nat_parks}")
print(f"Number of documents with the designation 'National Park' in the national parks collection: {count_parks_data}")


Number of documents in the national parks collection: 0
Number of documents with the designation 'National Park' in the national parks collection: 0


### Time to add some data analysis. Tutor suggested using Seaborn library (since we need one we haven't used before in class) in place of MatPlotLib. said they work nearly identically. Planning to use this to generate descriptive stats visualizations (pie charts, bar graphs, etc). Thoughts?