# Final Project Milestone 1
# Part 3: MongoDB
## Daisy Pinaroc

In [31]:
import os

# MongoDB connection settings.
# Each value is read from an environment variable so real credentials never have
# to be typed into (and saved with) the notebook; the previous literal is kept as
# the fallback so the cell behaves exactly as before when the variable is unset.
USER = os.environ.get("MONGO_USER", "XXXX")    # database user name
PSWRD = os.environ.get("MONGO_PSWRD", "XXXX")  # database user password
HOST = os.environ.get("MONGO_HOST", "XXXX")    # cluster host placed after "@" in the URI
DB = os.environ.get("MONGO_DB", "XXXX")        # database name selected by connect()

In [32]:
from functools import lru_cache

from pymongo import MongoClient


@lru_cache(maxsize=None)
def _get_client(url):
    """Return the MongoClient for `url`, creating it only on first use."""
    return MongoClient(url)


def connect():
    """Return a handle to the `DB` database on the configured cluster.

    The URI is built from the module-level USER, PSWRD and HOST settings.
    Every cell in this notebook calls connect(), so the underlying client is
    cached per URI and reused instead of opening a new, never-closed
    MongoClient on each call. Because the cache is keyed on the URI, changing
    the settings and calling connect() again still yields a fresh client.

    Returns:
        The object obtained by indexing the client with DB (connection[DB]).
    """
    url = "mongodb+srv://{}:{}@{}?retryWrites=true&w=majority".format(USER, PSWRD, HOST)
    connection = _get_client(url)
    print(connection)
    return connection[DB]

### 1. Test Connection

In [None]:
# connect() returns connection[DB], so despite its name this variable holds the
# database handle rather than the client itself.
connection = connect()

# Show the handle's repr as a quick check that the settings were accepted
print(connection)

In [None]:
# Test connection, show collections in database
connection = connect()

try:
    # Listing collection names requires a round trip to the server, so this
    # call is what actually verifies that the connection works.
    collections = connection.list_collection_names()

    # Print collections
    print("Collections in the database:")
    for n in collections:
        print(n)
# `Exception` (not the undefined name `Exceptions`): with the misspelling, any
# error raised above surfaced as a NameError instead of the message below.
except Exception as e:
    print("Error printing collections:", e)

### 2. Write records into collection

In [4]:
import pandas as pd
from faker import Faker

**Importing data for referential integrity purposes**

In [28]:
# To ensure referential integrity, we get the shopper data generated in final-project-mysql.ipynb
# This data was hard-coded in final-project-mysql.ipynb
from gcsfs import GCSFileSystem

# Location of the shopper CSV object in Google Cloud Storage
gcs_path = 'gs://XXXX'

# Open the object through gcsfs and parse it straight into a DataFrame
fs = GCSFileSystem(project='XXXX')
with fs.open(gcs_path, 'rb') as file:
    shopper_data = pd.read_csv(file, low_memory=False)

# Copy the rows into the "shopper" collection, one document per CSV row
try:
    connection = connect()
    collection = connection["shopper"]
    shopper_records = shopper_data.to_dict(orient='records')
    collection.insert_many(shopper_records)
    print("Data synchronized/transferred successfully to MongoDB")
except Exception as e:
    print("Error occurred while synchronizing/transferring data:", e)

MongoClient(host=['ac-xocz8ol-shard-00-00.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-02.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-01.osacpym.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-drupfk-shard-0', tls=True)
Data synchronized/transferred successfully to MongoDB


In [29]:
# Report how many documents the shopper collection currently holds
try:
    connection = connect()
    collection = connection["shopper"]

    # An empty filter matches every document in the collection
    doc_count = collection.count_documents({})
    message = f"Number of documents in {collection.name}: {doc_count}"
    print(message)
except Exception as e:
    print("Error occurred while counting documents:", e)

MongoClient(host=['ac-xocz8ol-shard-00-00.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-02.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-01.osacpym.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-drupfk-shard-0', tls=True)
Number of documents in shopper: 50


**Populate the ticketing collection with 100 unique records**

In [67]:
from pymongo import MongoClient
from faker import Faker
from faker_airtravel import AirTravelProvider
from datetime import datetime, timedelta, time
from random import randrange

# Faker instance extended with the air-travel provider, which supplies the
# fake.flight() generator used when building tickets below
fake = Faker()
fake.add_provider(AirTravelProvider)
# Handle to the shopper collection; get_random_shopper() reads it as a global
connection = connect()
shopper_collection = connection['shopper']

def get_random_shopper():
    """Return one randomly sampled document from the shopper collection.

    Uses a `$sample` aggregation of size 1 on the module-level
    `shopper_collection`. When the aggregation yields nothing, a message is
    printed and None is returned.
    """
    sampled = list(shopper_collection.aggregate([{'$sample': {'size': 1}}]))
    if not sampled:
        print('Unable to retrieve document: No documents found in shopper collection')
        return None
    return sampled[0]

# 100 unique records
tickets = []
for _ in range(100):
    # Using fake.flight() from faker_airtravel
    flight_info = fake.flight()

    # Get a random shopper for its cust_id
    random_shopper = get_random_shopper()
    if random_shopper is None:
        # get_random_shopper() returns None when the shopper collection is empty;
        # stop with a clear message instead of failing on random_shopper['cust_id'].
        raise RuntimeError("Cannot generate tickets: the shopper collection is empty")

    # Departure: a random day within +/-30 days of today and a random time of day.
    # strptime with only a time format yields a datetime on the placeholder date
    # 1900-01-01, which is how the time-of-day fields are stored.
    dep_date = fake.date_between(start_date='-30d', end_date='+30d')
    dep_time = datetime.strptime(fake.time(), "%H:%M:%S")

    # Arrival is derived from departure + flight duration so that arr_date and
    # arr_time always describe the same instant (previously arr_date was drawn
    # independently, up to 30 days after departure, for a 1-10 hour flight).
    duration = timedelta(hours=randrange(1, 11))
    departure_at = datetime.combine(dep_date, dep_time.time())
    arrival_at = departure_at + duration
    arr_date = arrival_at.date()
    # Keep arr_time on the same placeholder date as dep_time; a flight that
    # crosses midnight is reflected in arr_date instead.
    arr_time = datetime.combine(dep_time.date(), arrival_at.time())

    ticket = {
        "tck_id": fake.uuid4(),
        "cust_id": random_shopper['cust_id'],     # make sure mongodb.ticketing.cust_id exists in mysql.shopper.cust_id
        "airline": flight_info['airline'],
        "flight_nm": fake.random_int(min=1000, max=9999),
        "dep_airport": flight_info['origin']['airport'],
        "arr_airport": flight_info['destination']['airport'],
        # Dates are stored as datetimes at midnight of the given day
        "dep_date": datetime.combine(dep_date, datetime.min.time()),
        "dep_time": dep_time,
        "arr_date": datetime.combine(arr_date, datetime.min.time()),
        "arr_time": arr_time,
        "stops": flight_info['stops'],
        "tik_amt": flight_info['price'],
        "curr_code": fake.currency_code(),
    }
    tickets.append(ticket)
    
# Bulk-insert the generated tickets into the "ticketing" collection
try:
    connection = connect()
    ticketing_collection = connection["ticketing"]
    ticketing_collection.insert_many(tickets)
    summary = f"{len(tickets)} documents written into {ticketing_collection.name} collection"
    print(summary)
except Exception as e:
    print("Error writing documents into collection:", e)

MongoClient(host=['ac-xocz8ol-shard-00-00.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-02.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-01.osacpym.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-drupfk-shard-0', tls=True)
MongoClient(host=['ac-xocz8ol-shard-00-00.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-02.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-01.osacpym.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-drupfk-shard-0', tls=True)
100 documents written into ticketing collection


### Hard-coding "ticketing" data for referential integrity

In [71]:
# Write the ticketing collection to a local CSV file
import pandas as pd
from pymongo import MongoClient

try:
    connection = connect()
    tickets_collection = connection["ticketing"]

    # Pull every document of the ticketing collection and tabulate them
    data = list(tickets_collection.find())
    df = pd.DataFrame(data)

    # Save the table to disk, leaving out the DataFrame index column
    csv_path = "/home/jupyter/final_proj/ticketing_data_mongodb.csv"
    df.to_csv(csv_path, index=False)

    print("Done")
except Exception as e:
    print("Error:", e)


MongoClient(host=['ac-xocz8ol-shard-00-00.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-02.osacpym.mongodb.net:27017', 'ac-xocz8ol-shard-00-01.osacpym.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-drupfk-shard-0', tls=True)
Done


In [72]:
# Upload CSV data to a Google Cloud Storage bucket
from google.cloud import storage

try:
    client = storage.Client()
    bucket = client.get_bucket("XXXX")
    blob = bucket.blob("ticketing_data_mongodb.csv")

    # Upload CSV file to Google Cloud Storage
    blob.upload_from_filename("/home/jupyter/final_proj/ticketing_data_mongodb.csv", content_type='text/csv')

    # Interpolate the names, not the objects: formatting the Bucket/Blob objects
    # themselves printed their reprs ("<Bucket: ...>/<Blob: ...>") instead of a
    # usable gs:// location.
    print(f'Data exported to gs://{bucket.name}/{blob.name}')
except Exception as e:
    print("Error occurred while uploading to Google Cloud Storage:", e)

Data exported to gs://<Bucket: cs327e-final-project>/<Blob: cs327e-final-project, ticketing_data_mongodb.csv, 1701489782358612>
