In [6]:
import os
import requests
import pandas as pd
from pymongo import MongoClient, ASCENDING
from dotenv import load_dotenv

# Load environment variables from the .env file into the process environment.
# get_mongo_client() below reads MONGO_URI via os.getenv, so this must run first.
load_dotenv()

# MongoDB Atlas connection setup
def get_mongo_client():
    """Create a MongoClient from the MONGO_URI environment variable.

    Returns:
        A MongoClient constructed with the configured connection string.

    Raises:
        ValueError: if MONGO_URI is missing or empty.
    """
    uri = os.environ.get("MONGO_URI")
    if uri:
        return MongoClient(uri)
    raise ValueError("MONGO_URI is not set in the .env file")

# Download CSV if not exists
def download_csv(
    file_path="./FinalPermutations.csv",
    s3_url="https://coolmeal.s3.amazonaws.com/FinalPermutations.csv",
    timeout=60,
):
    """Download the permutations CSV unless it is already on disk.

    Args:
        file_path: Local destination; the download is skipped when it exists.
        s3_url: HTTP(S) URL to fetch the CSV from.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the notebook forever.

    Returns:
        None.

    Raises:
        FileNotFoundError: if the server answers with a status other than 200.
        requests.RequestException: on network errors or timeouts.
    """
    if os.path.exists(file_path):
        return

    print(f"File not found at {file_path}. Downloading from S3...")

    # Write to a sibling temp file and rename at the end, so an interrupted
    # download never leaves a truncated file that the existence check above
    # would later mistake for a complete one.
    partial_path = f"{file_path}.part"

    # stream=True avoids holding the whole CSV in memory at once.
    with requests.get(s3_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise FileNotFoundError(
                f"Failed to download file from {s3_url}. HTTP status code: {response.status_code}"
            )
        try:
            with open(partial_path, "wb") as f:
                for block in response.iter_content(chunk_size=1024 * 1024):
                    f.write(block)
        except BaseException:
            # Also covers KeyboardInterrupt (kernel interrupt): drop the partial file.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    os.replace(partial_path, file_path)
    print(
        f"File downloaded successfully from S3 and saved to {file_path}."
    )

# Add indexing and load data from CSV to MongoDB in chunks
def load_data_to_mongo_in_chunks(chunk_size=1000):
    """Stream ./FinalPermutations.csv into the "Meal_plans" collection.

    Each row receives an 'index' field equal to its 0-based row position in
    the CSV, and a unique index is created on that field. Because 'index' is
    unique, blindly re-inserting from row 0 after an interrupted run would
    fail with duplicate keys; instead the function resumes after the highest
    'index' already stored.

    NOTE(review): resuming assumes the documents already in the collection
    were written by this function from the same CSV — confirm before reusing
    the collection for other data.

    Args:
        chunk_size: Number of CSV rows read and inserted per batch.

    Returns:
        None. Progress is reported via print.
    """
    # Download the CSV file if not present
    download_csv()

    total_inserted = 0  # Records inserted during this run

    print(f"Reading data from ./FinalPermutations.csv in chunks of {chunk_size}...")

    # Set up the MongoDB connection
    client = get_mongo_client()
    try:
        collection = client["cool-server"]["Meal_plans"]

        # Unique index on 'index': speeds up lookups and guards against duplicates
        collection.create_index([('index', ASCENDING)], unique=True)
        print("Index created on the 'index' field.")

        # Find where a previous run stopped (sort direction -1 == pymongo.DESCENDING)
        newest = collection.find_one(sort=[('index', -1)], projection={'index': 1})
        last_index = newest.get('index') if newest else None
        next_index = 0 if last_index is None else last_index + 1
        if next_index:
            print(f"Found existing records up to index {last_index}; resuming from {next_index}.")

        rows_seen = 0  # CSV rows read so far, i.e. the 'index' of the next row
        for chunk_number, chunk in enumerate(
            pd.read_csv("./FinalPermutations.csv", chunksize=chunk_size)
        ):
            print(f"Processing chunk {chunk_number + 1}...")

            # Number the rows by their absolute position in the file
            chunk['index'] = range(rows_seen, rows_seen + len(chunk))
            rows_seen += len(chunk)

            # Drop rows that are already stored from an earlier run
            pending = chunk[chunk['index'] >= next_index]
            if pending.empty:
                print("Chunk already present in MongoDB; skipped.")
                continue

            records_to_insert = pending.to_dict(orient='records')
            collection.insert_many(records_to_insert)
            total_inserted += len(records_to_insert)

            print(f"Inserted {len(records_to_insert)} records (Total inserted: {total_inserted})")

        print(f"All data inserted successfully. Total records inserted: {total_inserted}")
    finally:
        # Release the connection pool even when reading or inserting fails
        client.close()

# Run the function to load data in chunks.
# Side effects: downloads ./FinalPermutations.csv if it is missing and inserts
# its rows into the "Meal_plans" collection of the "cool-server" database.
load_data_to_mongo_in_chunks()


Reading data from ./FinalPermutations.csv in chunks of 1000...
Index created on the 'index' field.
Processing chunk 1...
Inserted 1000 records (Total inserted: 1000)
Processing chunk 2...
Inserted 1000 records (Total inserted: 2000)
Processing chunk 3...
Inserted 1000 records (Total inserted: 3000)
Processing chunk 4...
Inserted 1000 records (Total inserted: 4000)
Processing chunk 5...
Inserted 1000 records (Total inserted: 5000)
Processing chunk 6...
Inserted 1000 records (Total inserted: 6000)
Processing chunk 7...
Inserted 1000 records (Total inserted: 7000)
Processing chunk 8...
Inserted 1000 records (Total inserted: 8000)
Processing chunk 9...
Inserted 1000 records (Total inserted: 9000)
Processing chunk 10...
Inserted 1000 records (Total inserted: 10000)
Processing chunk 11...
Inserted 1000 records (Total inserted: 11000)
Processing chunk 12...
Inserted 1000 records (Total inserted: 12000)
Processing chunk 13...
Inserted 1000 records (Total inserted: 13000)
Processing chunk 14...

In [1]:
import os
import pandas as pd
from pymongo import MongoClient, ASCENDING
from dotenv import load_dotenv

# Load environment variables from the .env file into the process environment.
# get_mongo_client() below reads MONGO_URI via os.getenv, so this must run first.
load_dotenv()

# MongoDB Atlas connection setup
def get_mongo_client():
    """Return a MongoClient for the URI stored in MONGO_URI.

    Raises:
        ValueError: if the MONGO_URI environment variable is unset or empty.
    """
    connection_string = os.environ.get("MONGO_URI")
    if connection_string:
        return MongoClient(connection_string)
    raise ValueError("MONGO_URI is not set in the .env file")

# Function to read CSV from a file path and upload to MongoDB
def upload_csv_to_mongo(file_path, db_name, collection_name):
    """Read a CSV file and insert every row as a document into MongoDB.

    Args:
        file_path: Path of the CSV file to read with pandas.
        db_name: Name of the target MongoDB database.
        collection_name: Name of the target collection inside ``db_name``.

    Returns:
        None. Progress is reported via print.

    Raises:
        Exception: if the CSV cannot be read, the MongoDB client cannot be
            set up, or the insert fails. The original error is attached as
            ``__cause__``.
    """
    # Step 1: Read the CSV file using pandas
    try:
        data = pd.read_csv(file_path)
        print(f"CSV file '{file_path}' read successfully.")
    except Exception as e:
        raise Exception(f"Error reading CSV file: {str(e)}") from e

    # Step 2: Convert the DataFrame to a list of dictionaries, one per row
    records = data.to_dict(orient='records')
    if not records:
        # insert_many rejects an empty document list, so a header-only CSV is
        # treated as "nothing to do" instead of opening a connection and failing.
        print(f"No records found in '{file_path}'; nothing to insert.")
        return

    client = None
    try:
        # Step 3: Connect to MongoDB and resolve the target collection
        try:
            client = get_mongo_client()
            collection = client[db_name][collection_name]
        except Exception as e:
            raise Exception(f"Error connecting to MongoDB: {str(e)}") from e

        # Step 4: Insert data into MongoDB collection
        try:
            result = collection.insert_many(records)
            print(f"Inserted {len(result.inserted_ids)} records into the '{collection_name}' collection.")
        except Exception as e:
            raise Exception(f"Error inserting records into MongoDB: {str(e)}") from e
    finally:
        # Close the client on every path once it has been created
        if client is not None:
            client.close()

# Usage example (Replace with actual paths and MongoDB details):
# insert the rows of DatasetMeals.csv into the "Meal" collection of the
# "cool-server" database.
upload_csv_to_mongo("../data-analytics/datasets/meals/DatasetMeals.csv","cool-server", "Meal")



CSV file '../data-analytics/datasets/meals/DatasetMeals.csv' read successfully.
Inserted 174 records into the 'Meal' collection.


In [2]:
# Usage example (Replace with actual paths and MongoDB details):
# insert the rows of FinalMeals.csv into the "Final_Meals" collection of the
# "cool-server" database. Requires upload_csv_to_mongo from the previous cell.
upload_csv_to_mongo("../data-analytics/datasets/meals/FinalMeals.csv","cool-server", "Final_Meals")

CSV file '../data-analytics/datasets/meals/FinalMeals.csv' read successfully.
Inserted 10468 records into the 'Final_Meals' collection.
