In [3]:
import getpass
import json
import os
import random
import urllib.parse
import uuid
from datetime import datetime, timedelta

import pymongo
from faker import Faker

# Shared Faker instance; generate_fake_data() uses it for names, addresses,
# random numbers and joining dates.
fake = Faker()

def generate_fake_data(num_records):
    """Generate synthetic employee records.

    Every record is a flat dict whose keys match the fields listed in
    ``generate_json_schema()``. ``email`` and ``emp_id`` are guaranteed to be
    unique within the returned list, because the target collection puts
    unique indexes on both fields.

    Args:
        num_records: Number of records to generate. Values <= 0 produce an
            empty list.

    Returns:
        list[dict]: The generated records.

    Raises:
        ValueError: If more than 1,000,000 records are requested; ``emp_id``
            is drawn without replacement from the range 0..999999.
    """
    designations = [
        {"name": "Data Scientist", "department": "IT", "skills": ["Python", "Machine Learning", "Data Analysis", "Deep Learning", "API", "Natural Language Processing", "Big Data", "Statistical Modeling"]},
        {"name": "Software Developer", "department": "IT", "skills": ["Java", "Python", "Web Development", "JavaScript", "SQL", "Git", "Agile Methodologies", "Docker"]},
        {"name": "QA Tester", "department": "IT", "skills": ["Testing", "Test Automation", "Bug Tracking", "Selenium", "JIRA", "CI/CD", "Agile Testing", "Performance Testing"]},
        {"name": "Business Analyst", "department": "IT", "skills": ["Data Analysis", "Requirement Gathering", "Documentation", "Business Intelligence", "User Stories", "UML", "SQL", "Tableau"]},
        {"name": "Network Engineer", "department": "IT", "skills": ["Networking", "Security", "Troubleshooting", "Cisco", "Firewalls", "TCP/IP", "VPN", "Wireless Networking"]},
        {"name": "HR Manager", "department": "HR", "skills": ["Recruitment", "Employee Relations", "Training", "Performance Management", "Compensation & Benefits", "HRIS", "Labor Law", "Talent Acquisition"]},
        {"name": "Finance Analyst", "department": "Finance", "skills": ["Financial Analysis", "Accounting", "Budgeting", "Financial Modeling", "Forecasting", "Investment Analysis", "Risk Management", "Taxation"]},
        {"name": "Marketing Specialist", "department": "Marketing", "skills": ["Market Research", "Digital Marketing", "Content Creation", "SEO", "Social Media Marketing", "Email Marketing", "Marketing Analytics", "Brand Management"]},
        {"name": "Operations Manager", "department": "Operations", "skills": ["Process Improvement", "Logistics", "Supply Chain Management", "Lean Manufacturing", "Six Sigma", "Inventory Management", "Project Management", "Quality Management"]},
        {"name": "Sales Executive", "department": "Sales", "skills": ["Sales Strategy", "Client Relationship Management", "Negotiation", "Salesforce", "Lead Generation", "Account Management", "Cold Calling", "Presentation Skills"]}
    ]

    count = max(0, num_records)
    emp_id_space = 10 ** 6  # same 0..999999 range as random_number(digits=6)
    if count > emp_id_space:
        raise ValueError(
            f"Cannot generate {count} records: only {emp_id_space} unique emp_id values exist"
        )
    # Sampling without replacement guarantees unique emp_id values; independent
    # draws collide often enough (birthday problem) to trip the unique index.
    emp_ids = random.sample(range(emp_id_space), count)
    seen_emails = set()
    today = datetime.now()

    data = []
    for emp_id in emp_ids:
        designation = random.choice(designations)
        skills = random.sample(designation['skills'], random.randint(3, len(designation['skills'])))
        relevant_experience = round(random.uniform(0, 20), 1)
        # Salary grows with experience, with a floor of 3 LPA.
        salary_in_lpa = max(3, round(relevant_experience / 2, 2))

        # Re-draw the identity until the derived email has not been used yet.
        while True:
            first_name = fake.first_name()
            last_name = fake.last_name()
            rand_number = fake.random_number(digits=random.randint(2, 3))
            email = f"{first_name.lower()}{last_name.lower()}{rand_number}@neosoft.com"
            if email not in seen_emails:
                seen_emails.add(email)
                break

        age = random.randint(20, 65)
        # Shift by whole calendar years so the age implied by dob equals `age`;
        # subtracting 365*age days drifts by one day per leap year.
        try:
            birth_date = today.replace(year=today.year - age)
        except ValueError:
            # Today is Feb 29 and the target year is not a leap year.
            birth_date = today.replace(year=today.year - age, day=28)
        dob = birth_date.strftime('%Y-%m-%d')

        record = {
            'id': str(uuid.uuid4()),
            'user_id': str(uuid.uuid4()),
            'first_name': first_name,
            'last_name': last_name,
            'email': email,
            'age': age,
            'emp_id': emp_id,
            'address': fake.address(),
            'department': designation['department'],
            'designation': designation['name'],
            'branch_code': fake.random_number(digits=4),
            'salary_in_lpa': salary_in_lpa,
            'dob': dob,
            'date_of_joining': fake.date_this_decade().strftime('%Y-%m-%d'),
            'relevant_experience': relevant_experience,
            # Total experience is never less than the relevant experience.
            'total_experience': round(random.uniform(relevant_experience, 20), 1),
            'skills': skills
        }
        data.append(record)
    return data

def generate_json_schema():
    """Return the JSON-Schema-style description of a single employee record.

    The result has two top-level keys: ``"properties"`` (field name -> type
    spec, in record order) and ``"required"`` (every field name, in the same
    order). No top-level ``"type"`` keyword is set.

    Returns:
        dict: A freshly built schema dictionary.
    """
    def typed(type_name, **extra):
        # Build a new spec dict per field so no two fields share an object.
        return {"type": type_name, **extra}

    properties = {
        "id": typed("string"),
        "user_id": typed("string"),
        "first_name": typed("string"),
        "last_name": typed("string"),
        "email": typed("string"),
        "age": typed("integer"),
        "emp_id": typed("integer"),
        "address": typed("string"),
        "department": typed("string"),
        "designation": typed("string"),
        "branch_code": typed("integer"),
        "salary_in_lpa": typed("number"),
        "dob": typed("string", format="date"),
        "date_of_joining": typed("string", format="date"),
        "relevant_experience": typed("number"),
        "total_experience": typed("number"),
        "skills": typed("array", items=typed("string")),
    }

    # Every declared property is required, in declaration order.
    return {"properties": properties, "required": list(properties)}

import urllib
def connect_to_mongodb(username, password, host="appdevchatwithdb.xqi9wkd.mongodb.net"):
    """Create a MongoClient for a ``mongodb+srv`` cluster.

    Both the username and the password are percent-escaped with
    ``urllib.parse.quote_plus`` before being placed in the URI, so reserved
    characters such as ``@``, ``:`` and ``/`` cannot corrupt it.

    Args:
        username: Database user name (unescaped).
        password: Database password (unescaped).
        host: Cluster host name; defaults to the cluster this notebook was
            written against.

    Returns:
        pymongo.MongoClient: Client built from the assembled connection URI.
    """
    escaped_username = urllib.parse.quote_plus(username)
    escaped_password = urllib.parse.quote_plus(password)
    uri = (
        f"mongodb+srv://{escaped_username}:{escaped_password}@{host}/"
        "?retryWrites=true&w=majority&appName=AppDevChatWithDB"
    )
    return pymongo.MongoClient(uri)

def create_collection_with_schema(db, collection_name, json_schema):
    """Index a collection and record its JSON schema.

    Creates a wildcard text index, unique indexes on ``id``, ``user_id``,
    ``email`` and ``emp_id``, and a plain index on ``skills``. The schema is
    stored in the ``json_schemas`` collection under ``collection_name``; an
    existing entry for the same name is replaced, so re-running the notebook
    keeps a single schema document per collection.

    Args:
        db: Database handle that supports ``db[name]`` collection lookup.
        collection_name: Name of the collection to index.
        json_schema: Schema dict to store alongside the collection name.

    Returns:
        The collection object ``db[collection_name]``.
    """
    collection = db[collection_name]

    # Wildcard text index so every string field is searchable.
    collection.create_index([("$**", pymongo.TEXT)])

    # Fields that must be unique across employees.
    for unique_field in ("id", "user_id", "email", "emp_id"):
        collection.create_index(unique_field, unique=True)

    collection.create_index("skills")

    # Upsert instead of insert: a plain insert adds a duplicate schema document
    # every time this function is called for the same collection.
    db["json_schemas"].replace_one(
        {"collection_name": collection_name},
        {"collection_name": collection_name, "schema": json_schema},
        upsert=True,
    )

    return collection

def store_data_in_collection(collection, fake_data):
    """Bulk-insert records into ``collection`` without touching the caller's data.

    Shallow copies of the records are inserted, so the ``_id`` values the
    driver adds to inserted documents do not end up in ``fake_data`` (which is
    later serialised with ``json.dump``). An empty input is a no-op, since
    ``insert_many`` rejects an empty document list.

    Args:
        collection: Object exposing ``insert_many(documents)``.
        fake_data: Iterable of record dicts to insert.

    Returns:
        None
    """
    documents = [dict(record) for record in fake_data]
    if not documents:
        return
    collection.insert_many(documents)


In [4]:
# The schema is static; build it first, then generate the synthetic records.
json_schema = generate_json_schema()

num_records = 1000
fake_data = generate_fake_data(num_records)

In [15]:
# Credentials come from the environment, with an interactive prompt as a
# fallback, so they are never stored in the notebook or its outputs.
# NOTE(review): the credentials that used to be hardcoded in this cell should be rotated.
mongo_username = os.environ.get("MONGODB_USERNAME") or input("MongoDB username: ")
mongo_password = os.environ.get("MONGODB_PASSWORD") or getpass.getpass("MongoDB password: ")
client = connect_to_mongodb(username=mongo_username, password=mongo_password)

In [19]:
# Database that receives the seeded employee documents.
db = client["sample_mflix"]

# Index the "employee" collection and record its schema, then load the records.
collection = create_collection_with_schema(
    db=db,
    collection_name="employee",
    json_schema=json_schema,
)
store_data_in_collection(collection=collection, fake_data=fake_data)


In [7]:
import os

# Directory that receives the exported schema and data files.
directory = 'datastore'

# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(directory, exist_ok=True)

with open(os.path.join(directory, 'json_schema.json'), 'w') as f:
    json.dump(json_schema, f, indent=4)

# Records that were already inserted into MongoDB carry an ObjectId under
# "_id", which json cannot serialise; export every field except that one.
exportable_records = [
    {key: value for key, value in record.items() if key != '_id'}
    for record in fake_data
]
with open(os.path.join(directory, 'employee_data.json'), 'w') as f:
    json.dump(exportable_records, f, indent=4)




In [53]:
# This cell connects with a second account; its credentials are read from
# their own environment variables (or prompted for) rather than being hardcoded.
# NOTE(review): the credentials that used to be hardcoded in this cell should be rotated.
secondary_username = os.environ.get("MONGODB_SECONDARY_USERNAME") or input("MongoDB username: ")
secondary_password = os.environ.get("MONGODB_SECONDARY_PASSWORD") or getpass.getpass("MongoDB password: ")
client = connect_to_mongodb(username=secondary_username, password=secondary_password)

In [60]:
import json
from pymongo import MongoClient

# Define MongoDB connection parameters
# mongo_uri = 'mongodb://localhost:27017/'
# db_name = 'your_database_name'
# Collection that populate_collection() writes to; the function reads this
# name as a module-level global rather than taking it as an argument.
collection_name = 'employee_collection'

def validate_data_with_schema(data, schema):
    """Validate a list of records against a JSON-Schema-style description.

    Every key listed under the schema's ``properties`` must be present in each
    record, and its value must match the declared JSON Schema ``type`` (a
    single name or a list of names). Properties without a ``type`` are only
    checked for presence. The property map is read from ``schema['items']``
    when the schema describes an array, otherwise from ``schema`` itself.

    Args:
        data: List of record dicts.
        schema: Schema dict with ``properties`` at the top level or under ``items``.

    Raises:
        ValueError: If ``data`` is not a list, a field is missing, or the
            schema declares an unsupported type name.
        TypeError: If a field's value does not match its declared type.
    """
    if not isinstance(data, list):
        raise ValueError("Data must be a list")

    # JSON Schema type names -> accepted Python types. isinstance() needs real
    # types, not the schema's type-name strings.
    python_types = {
        "string": (str,),
        "integer": (int,),
        "number": (int, float),
        "boolean": (bool,),
        "array": (list,),
        "object": (dict,),
        "null": (type(None),),
    }

    def matches(value, type_name, field):
        if type_name not in python_types:
            raise ValueError(f"Unsupported schema type '{type_name}' for field '{field}'")
        accepted = python_types[type_name]
        # bool is a subclass of int in Python but is not a JSON integer/number.
        if isinstance(value, bool) and bool not in accepted:
            return False
        return isinstance(value, accepted)

    properties = schema.get('items', schema)['properties']

    for record in data:
        for key, value_info in properties.items():
            if key not in record:
                raise ValueError(f"Missing field '{key}' in record: {record}")
            declared = value_info.get('type')
            if declared is None:
                continue
            type_names = declared if isinstance(declared, list) else [declared]
            if not any(matches(record[key], name, key) for name in type_names):
                raise TypeError(f"Invalid data type for field '{key}' in record: {record}")

    print("Data validation successful")

# # Example usage:
# data = generate_fake_data(10)  # Generate fake data
# schema = generate_json_schema()  # Generate schema
# validate_data_with_schema(data, schema)  # Validate data against schema


# Function to populate MongoDB collection
def populate_collection(json_file, schema_file, mongo_client, db_name):
    """Validate records and insert them into the ``collection_name`` collection.

    Despite their names, ``json_file`` and ``schema_file`` are used directly
    as in-memory objects; nothing is read from disk here.

    Args:
        json_file: Iterable of record dicts to insert.
        schema_file: Schema dict passed to ``validate_data_with_schema``.
        mongo_client: Client object supporting ``client[db_name]`` lookup.
        db_name: Name of the target database.

    Raises:
        ValueError, TypeError: Propagated from ``validate_data_with_schema``
            when a record does not satisfy the schema.
    """
    collection = mongo_client[db_name][collection_name]

    schema = schema_file
    records = list(json_file)

    # The validator defined in this cell takes the whole list of records, not
    # a single dict. Validating the full batch before the first insert also
    # means a bad record cannot leave a partially loaded collection.
    validate_data_with_schema(records, schema)

    for entry in records:
        collection.insert_one(entry)

    print("Data inserted successfully.")

# # Path to your JSON file and schema file
# json_file_path = 'path_to_your_json_file.json'
# schema_file_path = 'path_to_your_schema_file.json'

# Call the function to populate the collection
# populate_collection(json_file_path, schema_file_path)



In [68]:
import json
from pymongo import MongoClient

# Define MongoDB connection parameters
# mongo_uri = 'mongodb://localhost:27017/'
# db_name = 'your_database_name'
# Collection that populate_collection() writes to; the function reads this
# name as a module-level global rather than taking it as an argument.
collection_name = 'employee_collection'

def validate_data_with_schema(data, schema):
    """Validate one record against a JSON-Schema-style description.

    Every key listed under ``schema['properties']`` must be present in
    ``data``, and its value must match the declared JSON Schema ``type`` (a
    single name or a list of names such as ``["string", "null"]``). Properties
    without a ``type`` are only checked for presence. Keys in ``data`` that
    the schema does not mention are ignored.

    Args:
        data: The record to validate.
        schema: Schema dict with a top-level ``properties`` mapping.

    Raises:
        ValueError: If ``data`` is not a dict, a field is missing, or the
            schema declares an unsupported type name.
        TypeError: If a field's value does not match its declared type.
    """
    if not isinstance(data, dict):
        raise ValueError("Data must be a dictionary")

    # JSON Schema type names -> accepted Python types. These names ("string",
    # "integer", ...) are not Python globals, so they need an explicit table.
    python_types = {
        "string": (str,),
        "integer": (int,),
        "number": (int, float),
        "boolean": (bool,),
        "array": (list,),
        "object": (dict,),
        "null": (type(None),),
    }

    def matches(value, type_name, field):
        if type_name not in python_types:
            raise ValueError(f"Unsupported schema type '{type_name}' for field '{field}'")
        accepted = python_types[type_name]
        # bool is a subclass of int in Python but is not a JSON integer/number.
        if isinstance(value, bool) and bool not in accepted:
            return False
        return isinstance(value, accepted)

    for key, value_info in schema['properties'].items():
        if key not in data:
            raise ValueError(f"Missing field '{key}' in record: {data}")

        value_type = value_info.get('type')
        if value_type is None:
            continue

        # Normalise so a single type name and a list of names share one path.
        type_names = value_type if isinstance(value_type, list) else [value_type]
        if not any(matches(data[key], name, key) for name in type_names):
            raise TypeError(f"Invalid data type for field '{key}' in record: {data}")

    print("Data validation successful")


# Function to populate MongoDB collection
def populate_collection(json_data, schema, mongo_client, db_name):
    """Insert every record of ``json_data`` into the ``collection_name`` collection.

    Records are written one at a time with ``insert_one``.

    NOTE(review): per-record validation is disabled here (the
    ``validate_data_with_schema(entry, schema)`` call is commented out), so
    ``schema`` is accepted but currently unused.

    Args:
        json_data: Iterable of record dicts to insert.
        schema: Schema dict; not used while validation is disabled.
        mongo_client: Client object supporting ``client[db_name]`` lookup.
        db_name: Name of the target database.
    """
    target = mongo_client[db_name][collection_name]

    for document in json_data:
        # validate_data_with_schema(document, schema)
        target.insert_one(document)

    print("Data inserted successfully.")

# Example usage:
# Make sure to provide values for json_data, schema, MongoClient instance, and db_name
# populate_collection(json_data, schema, MongoClient(mongo_uri), db_name)


In [69]:
# Load the generated records into the "employee" database using the client
# created in the connection cell above.
populate_collection(
    json_data=fake_data,
    schema=json_schema,
    mongo_client=client,
    db_name="employee",
)

Data inserted successfully.
