#**🚀 30-Day Python Challenge: Let's do this.**

#Day 1: Variables & Data Types.

In [2]:
# User Story: As a data engineer, I need to define and inspect various pieces of data from a new log file
# so that I can understand its structure and prepare it for a data pipeline.

# 1. Primitive Data Types

# Integer: Used for counts, IDs, and other whole number values.
# In this case, it's the number of records processed in a batch.
batch_id = 101
total_records_processed = 57892
print(f"Variable: 'batch_id', Value: {batch_id}, Type: {type(batch_id)}")
print(f"Variable: 'total_records_processed', Value: {total_records_processed}, Type: {type(total_records_processed)}\n")

# Float: Used for numerical data with a decimal point, such as timestamps or sizes.
# Here, it's the duration of the data pipeline run in minutes.
pipeline_duration_minutes = 12.45
data_transfer_rate_mbps = 98.76
print(f"Variable: 'pipeline_duration_minutes', Value: {pipeline_duration_minutes}, Type: {type(pipeline_duration_minutes)}")
print(f"Variable: 'data_transfer_rate_mbps', Value: {data_transfer_rate_mbps}, Type: {type(data_transfer_rate_mbps)}\n")

# String: Used for text data, such as file paths, messages, or column names.
log_file_name = "data_ingestion_2023-10-26.log"
status_message = "Ingestion process completed successfully."
print(f"Variable: 'log_file_name', Value: '{log_file_name}', Type: {type(log_file_name)}")
print(f"Variable: 'status_message', Value: '{status_message}', Type: {type(status_message)}\n")

# Boolean: Represents a true or false value, often used as flags.
is_data_validated = True
has_failed_records = False
print(f"Variable: 'is_data_validated', Value: {is_data_validated}, Type: {type(is_data_validated)}")
print(f"Variable: 'has_failed_records', Value: {has_failed_records}, Type: {type(has_failed_records)}\n")

# NoneType: Represents the absence of a value. This is a common way to initialize a variable
# or indicate that a value is missing.
error_code = None
print(f"Variable: 'error_code', Value: {error_code}, Type: {type(error_code)}\n")


# 2. Collection Data Types

# List: An ordered and mutable collection of items. Great for storing a sequence of objects.
# In this scenario, it's a list of file paths to be processed.
files_to_process = ["/data/raw/file1.csv", "/data/raw/file2.csv", "/data/raw/file3.csv"]
print(f"Variable: 'files_to_process', Value: {files_to_process}, Type: {type(files_to_process)}")
print(f"First file in the list: {files_to_process[0]}\n")

# Tuple: An ordered and immutable collection of items. Use it for fixed sequences.
# This might be a fixed configuration like a database connection string parts.
database_credentials = ("mydb.hostname.com", 5432, "data_user")
print(f"Variable: 'database_credentials', Value: {database_credentials}, Type: {type(database_credentials)}")
print(f"Database hostname: {database_credentials[0]}\n")

# Dictionary: A collection of key-value pairs. Ideal for representing structured records.
# This represents metadata about a dataset.
dataset_metadata = {
    "name": "sales_data",
    "record_count": 1205600,
    "last_updated": "2023-10-26T14:30:00Z",
    "is_compressed": True
}
print(f"Variable: 'dataset_metadata', Value: {dataset_metadata}, Type: {type(dataset_metadata)}")
print(f"The number of records is: {dataset_metadata['record_count']}\n")

# Set: An unordered collection of unique items. Useful for removing duplicates.
# This could be a set of unique user IDs from a log.
unique_users = {"user_a", "user_b", "user_c", "user_b"}
print(f"Variable: 'unique_users', Value: {unique_users}, Type: {type(unique_users)}")
# Notice that 'user_b' only appears once in the set output.
print(f"Number of unique users: {len(unique_users)}\n")


# 3. Simple Operations and Type Conversions

# You can easily combine or convert between types.
# For example, converting a string to a float.
raw_record_size = "1024.50"
normalized_size = float(raw_record_size)
print(f"Normalized record size: {normalized_size}, Type: {type(normalized_size)}\n")

# Let's check a simple condition using a boolean.
if is_data_validated:
    print("Data validation passed. Ready for the next stage of the pipeline.")
else:
    print("Data validation failed. Halting pipeline.")

Variable: 'batch_id', Value: 101, Type: <class 'int'>
Variable: 'total_records_processed', Value: 57892, Type: <class 'int'>

Variable: 'pipeline_duration_minutes', Value: 12.45, Type: <class 'float'>
Variable: 'data_transfer_rate_mbps', Value: 98.76, Type: <class 'float'>

Variable: 'log_file_name', Value: 'data_ingestion_2023-10-26.log', Type: <class 'str'>
Variable: 'status_message', Value: 'Ingestion process completed successfully.', Type: <class 'str'>

Variable: 'is_data_validated', Value: True, Type: <class 'bool'>
Variable: 'has_failed_records', Value: False, Type: <class 'bool'>

Variable: 'error_code', Value: None, Type: <class 'NoneType'>

Variable: 'files_to_process', Value: ['/data/raw/file1.csv', '/data/raw/file2.csv', '/data/raw/file3.csv'], Type: <class 'list'>
First file in the list: /data/raw/file1.csv

Variable: 'database_credentials', Value: ('mydb.hostname.com', 5432, 'data_user'), Type: <class 'tuple'>
Database hostname: mydb.hostname.com

Variable: 'dataset_metad

#Day 2: Operators

In [3]:
# User Story: As a data engineer, I need to perform calculations and comparisons
# on a data batch to check its integrity and determine its final status.

# 1. Defining Our Data
# Imagine these values come from a data pipeline log.
records_in_batch = 1000
records_processed = 980
error_records = 20
processing_time_sec = 125

# 2. Arithmetic Operators

# Calculate the percentage of processed records.
# Note: The result will be a float.
processing_percentage = (records_processed / records_in_batch) * 100
print(f"Processing Percentage: {processing_percentage:.2f}%\n") # The .2f formats the float to 2 decimal places.

# Calculate the average time to process each record.
avg_time_per_record_sec = processing_time_sec / records_processed
print(f"Average time per record: {avg_time_per_record_sec:.4f} seconds\n")

# Use floor division to find how many full batches of 500 records were processed.
# This discards the remainder.
full_500_record_batches = records_processed // 500
print(f"Number of full 500-record batches processed: {full_500_record_batches}\n")

# Use modulus to find the number of remaining records.
remaining_records = records_processed % 500
print(f"Number of remaining records after full batches: {remaining_records}\n")


# 3. Comparison Operators

# Check if the number of processed records is equal to the expected amount.
expected_records = 980
is_processed_count_correct = (records_processed == expected_records)
print(f"Is processed count correct? {is_processed_count_correct}")

# Check if there are any error records.
has_errors = (error_records > 0)
print(f"Does the batch have errors? {has_errors}")

# Check if the processing time is within a 2-minute threshold (120 seconds).
is_within_time_limit = (processing_time_sec <= 120)
print(f"Is processing time within the limit? {is_within_time_limit}\n")


# 4. Logical Operators

# Combine conditions to make a final decision about the batch status.
# The `and` operator requires both conditions to be True.
is_batch_successful = is_processed_count_correct and not has_errors
print(f"Is the batch considered successful? {is_batch_successful}")

# The `or` operator is True if at least one condition is True.
is_critical_issue = has_errors or (processing_time_sec > 300)
print(f"Is there a critical issue with the batch? {is_critical_issue}")

Processing Percentage: 98.00%

Average time per record: 0.1276 seconds

Number of full 500-record batches processed: 1

Number of remaining records after full batches: 480

Is processed count correct? True
Does the batch have errors? True
Is processing time within the limit? False

Is the batch considered successful? False
Is there a critical issue with the batch? True


#Day 3 : Conditional Statements

In [4]:
# User Story: As a data engineer, I need to check the status of a data transfer
# and execute different actions based on whether it succeeded, failed, or is still in progress.

# 1. Defining Our Data
# Imagine these values are read from a pipeline monitoring system.
transfer_status = "succeeded"  # Can be "succeeded", "failed", or "in progress"
file_size_gb = 5.7
transfer_speed_mbps = 85.0

# 2. Basic 'if', 'elif', 'else' Statement

# We check the status to determine the next step in our pipeline.
if transfer_status == "succeeded":
    print("Transfer successful. Proceeding to data validation phase.")
elif transfer_status == "failed":
    print("Transfer failed. Notifying the on-call team and retrying the job.")
else:
    print("Transfer is still in progress. Monitoring for completion...")


# 3. Nested 'if' and Combining Conditions

# We can add more specific checks inside a condition.
if transfer_status == "succeeded":
    # Using a comparison and logical operator
    if file_size_gb > 5 and transfer_speed_mbps < 100:
        print("Large file transferred successfully, but speed was below optimal. Need to investigate.")
    else:
        print("Large file transferred successfully with optimal speed.")
elif transfer_status == "failed":
    # Another condition check
    if file_size_gb > 10:
        print("Large file transfer failed. Escalating issue to a senior engineer.")
    else:
        print("File transfer failed. Retrying with standard procedure.")
else:
    print("No further action needed at this time.")


# 4. Using Booleans and Conditional Expressions (Ternary Operator)

# A boolean flag can simplify conditional logic.
is_transfer_complete = transfer_status == "succeeded"
if is_transfer_complete:
    print("\nTransfer completed flag is True.")
else:
    print("\nTransfer completed flag is False.")

# A more concise way to write a simple conditional.
message = "Ready for processing" if is_transfer_complete else "Awaiting transfer completion"
print(f"Status message: {message}")


Transfer successful. Proceeding to data validation phase.
Large file transferred successfully, but speed was below optimal. Need to investigate.

Transfer completed flag is True.
Status message: Ready for processing


#Day 4 Loops: for & while


In [6]:
# User Story: As a data engineer, I need to perform a task a specific number of times,
# such as processing a set number of records or retrying a failed job.

# 1. 'for' Loop: Iterating a specific number of times

print("--- Using 'for' loop to process a specific number of records ---")
# This loop will run 5 times, with the 'record_number' variable going from 0 to 4.
for record_number in range(5):
    print(f"   -> Processing record number {record_number + 1}...")

    # We can use an if statement inside the loop to simulate a check.
    if record_number == 2:
        print("      -> A special condition was met for this record. Taking action.")

print("\nAll records processed using the 'for' loop.\n")


# 2. 'while' Loop: Looping based on a condition

# This is useful for monitoring a state, like waiting for a queue to be empty.
records_to_process = 5
processed_count = 0

print("--- Using 'while' loop to monitor a queue ---")
print(f"Initial records in queue: {records_to_process}")

while processed_count < records_to_process:
    processed_count += 1
    print(f"   -> Processed record {processed_count}. Records remaining: {records_to_process - processed_count}")
    # In a real pipeline, this would involve calling a function to process a record.

print("\nAll records have been processed. The queue is empty.")


--- Using 'for' loop to process a specific number of records ---
   -> Processing record number 1...
   -> Processing record number 2...
   -> Processing record number 3...
      -> A special condition was met for this record. Taking action.
   -> Processing record number 4...
   -> Processing record number 5...

All records processed using the 'for' loop.

--- Using 'while' loop to monitor a queue ---
Initial records in queue: 5
   -> Processed record 1. Records remaining: 4
   -> Processed record 2. Records remaining: 3
   -> Processed record 3. Records remaining: 2
   -> Processed record 4. Records remaining: 1
   -> Processed record 5. Records remaining: 0

All records have been processed. The queue is empty.


In [7]:
# User Story: As a data engineer, I need to automate a daily data ingestion process
# and build a retry mechanism for failed jobs.

# 1. 'for' Loop: Simulating a daily scheduled job

print("--- Using 'for' loop to simulate a daily ingestion schedule ---")
# We want to run a daily ingestion job for a period of 7 days.
# The 'day_number' variable will be an integer from 1 to 7.
for day_number in range(1, 8):
    print(f"   -> Starting ingestion job for Day {day_number}...")

    # We can use a simple conditional check to simulate a successful or failed run.
    # In a real pipeline, this would be determined by a function's return value.
    if day_number == 4:
        print(f"      -> Ingestion on Day {day_number} failed. Manual review required.")
    else:
        print(f"      -> Ingestion on Day {day_number} completed successfully.")

print("\nDaily ingestion schedule for the week has finished.\n")


# 2. 'while' Loop: Building a retry mechanism

# This is a common pattern for handling temporary issues, like a network outage.
# We will simulate a job that might fail and then retry up to a maximum number of attempts.
is_successful = False
retry_attempts = 0
max_attempts = 3

print("--- Using 'while' loop for a job retry mechanism ---")

# The loop continues as long as the job is not successful AND we haven't
# exceeded our maximum number of retries.
while not is_successful and retry_attempts < max_attempts:
    retry_attempts += 1
    print(f"   -> Attempting to connect to data source... (Attempt {retry_attempts} of {max_attempts})")

    # In a real scenario, a function call here would return True or False.
    # We'll simulate a failure on the first two attempts.
    if retry_attempts < max_attempts:
        print("      -> Connection failed. Retrying...")
    else:
        print("      -> Connection successful! Data transfer can begin.")
        is_successful = True

# After the loop, we can check the final status.
if is_successful:
    print("\nJob completed successfully after retries.")
else:
    print("\nJob failed after all retry attempts. Notifying engineer.")


--- Using 'for' loop to simulate a daily ingestion schedule ---
   -> Starting ingestion job for Day 1...
      -> Ingestion on Day 1 completed successfully.
   -> Starting ingestion job for Day 2...
      -> Ingestion on Day 2 completed successfully.
   -> Starting ingestion job for Day 3...
      -> Ingestion on Day 3 completed successfully.
   -> Starting ingestion job for Day 4...
      -> Ingestion on Day 4 failed. Manual review required.
   -> Starting ingestion job for Day 5...
      -> Ingestion on Day 5 completed successfully.
   -> Starting ingestion job for Day 6...
      -> Ingestion on Day 6 completed successfully.
   -> Starting ingestion job for Day 7...
      -> Ingestion on Day 7 completed successfully.

Daily ingestion schedule for the week has finished.

--- Using 'while' loop for a job retry mechanism ---
   -> Attempting to connect to data source... (Attempt 1 of 3)
      -> Connection failed. Retrying...
   -> Attempting to connect to data source... (Attempt 2 of 

In [8]:
# User Story: As a data engineer, I sometimes need to perform mathematical
# calculations on scalar values, such as calculating the number of possible
# combinations for a data batch (factorial) or generating a sequence for a
# specific data pattern (Fibonacci).

# --- Part 1: Calculating a Factorial ---
# A factorial is the product of all positive integers up to a number.
# For example, 5! = 5 * 4 * 3 * 2 * 1 = 120.
# We will use a 'for' loop to perform this calculation.

print("--- Calculating Factorial ---")
# The number for which we want to calculate the factorial.
number = 7

# We initialize a variable to hold the result.
factorial_result = 1

# We loop from the number down to 1. The 'range' function is exclusive of the end value.
for i in range(1, number + 1):
    factorial_result = factorial_result * i
    print(f"  -> Intermediate result for {i}: {factorial_result}")

print(f"\nThe factorial of {number} is: {factorial_result}\n")


# --- Part 2: Generating the Fibonacci Sequence ---
# The Fibonacci sequence is a series where each number is the sum of the two
# preceding ones, starting from 0 and 1. (0, 1, 1, 2, 3, 5, 8, ...)
# We will use a 'while' loop to generate this sequence up to a certain number of terms.

print("--- Generating Fibonacci Sequence ---")
# The number of terms we want to generate.
num_terms = 10

# Initialize the first two terms of the sequence.
a = 0
b = 1

# A counter to track how many terms we have printed.
count = 0

# The 'while' loop will continue as long as our count is less than the desired number of terms.
while count < num_terms:
    print(a, end=" ") # 'end=" "' keeps the output on the same line.

    # Calculate the next term.
    next_term = a + b

    # Update the values for the next iteration.
    a = b
    b = next_term

    # Increment the counter.
    count += 1
print("\n")


# --- Part 3: Using 'sum' for Aggregation ---
# User Story: A data engineer needs to calculate the total number of records
# processed across several parallel jobs.

print("--- Calculating Sum of Records ---")
# For now, we'll represent job records as individual numbers.
# Later, you will learn to use lists for this, but for now, we'll use a simple loop.
job_1_records = 150
job_2_records = 220
job_3_records = 95
job_4_records = 300

total_records = sum([job_1_records, job_2_records, job_3_records, job_4_records])
print(f"The total number of records processed is: {total_records}")
print("\n")


# --- Part 4: Using 'min' and 'max' for Data Range ---
# User Story: A data engineer needs to find the smallest and largest file sizes
# within a batch to identify anomalies.

print("--- Finding Min/Max File Size ---")
# Similar to the sum, we will use individual variables to simulate file sizes.
# The 'min()' and 'max()' functions can find the smallest and largest values.
file_size_1 = 1024
file_size_2 = 512
file_size_3 = 2048
file_size_4 = 1536

min_size = min(file_size_1, file_size_2, file_size_3, file_size_4)
max_size = max(file_size_1, file_size_2, file_size_3, file_size_4)

print(f"The minimum file size is: {min_size} KB")
print(f"The maximum file size is: {max_size} KB")


--- Calculating Factorial ---
  -> Intermediate result for 1: 1
  -> Intermediate result for 2: 2
  -> Intermediate result for 3: 6
  -> Intermediate result for 4: 24
  -> Intermediate result for 5: 120
  -> Intermediate result for 6: 720
  -> Intermediate result for 7: 5040

The factorial of 7 is: 5040

--- Generating Fibonacci Sequence ---
0 1 1 2 3 5 8 13 21 34 

--- Calculating Sum of Records ---
The total number of records processed is: 765


--- Finding Min/Max File Size ---
The minimum file size is: 512 KB
The maximum file size is: 2048 KB


# Calculator Application

In [9]:
# User Story: As a data engineer, I often need to perform quick calculations to
# validate data or check values in a pipeline. A simple, ad-hoc calculator
# helps me quickly get these results without a spreadsheet.

# --- Part 1: Getting User Input ---
# We use the 'input()' function to get data from the user.
# IMPORTANT: The input() function always returns a string. We must
# convert it to a number (float) to perform mathematical operations.

print("--- Simple Python Calculator ---")

# We use a try-except block to handle cases where the user enters non-numeric input.
try:
    # Get the first number from the user.
    num1_str = input("Enter the first number: ")
    num1 = float(num1_str)

    # Get the second number from the user.
    num2_str = input("Enter the second number: ")
    num2 = float(num2_str)

    # Get the desired operation.
    operation = input("Enter the operation (+, -, *, /): ")

    # --- Part 2: Performing the Calculation ---
    # We use conditional statements (if/elif/else) to choose the correct operation.
    if operation == '+':
        result = num1 + num2
        print(f"Result: {num1} + {num2} = {result}")
    elif operation == '-':
        result = num1 - num2
        print(f"Result: {num1} - {num2} = {result}")
    elif operation == '*':
        result = num1 * num2
        print(f"Result: {num1} * {num2} = {result}")
    elif operation == '/':
        # We need to handle division by zero separately to avoid an error.
        if num2 == 0:
            print("Error: Cannot divide by zero.")
        else:
            result = num1 / num2
            print(f"Result: {num1} / {num2} = {result}")
    else:
        # This handles any invalid operation input.
        print("Error: Invalid operation.")

# This block catches the error if the user input for the numbers cannot be converted to a float.
except ValueError:
    print("Error: Please enter valid numbers.")


--- Simple Python Calculator ---
Enter the first number: 45
Enter the second number: 69
Enter the operation (+, -, *, /): +
Result: 45.0 + 69.0 = 114.0


In [10]:
# User Story: As a data engineer, I need to perform basic data quality checks
# on a batch of incoming records. I need to iterate through each record,
# check if it meets certain criteria, and then perform a simple transformation
# or aggregation based on the results.

# This application demonstrates the core concepts learned through Day 4:
# - Variables and Data Types
# - Operators and Expressions
# - Conditional Statements (if/elif/else)
# - Loops (for)

# --- Part 1: Setting up the initial variables and parameters ---
# We'll simulate a data pipeline by setting up some key variables.
batch_size = 20  # Total number of records to process
threshold = 10   # A value used for a data quality check
valid_records = 0
invalid_records = 0

print(f"--- Data Quality Check: Processing a batch of {batch_size} records ---")
print(f"Threshold for valid data: > {threshold}\n")


# --- Part 2: Looping through the data records ---
# We use a 'for' loop to simulate iterating over each record in the batch.
# The 'range(1, batch_size + 1)' generates numbers from 1 to 20, representing
# our record IDs.
for record_id in range(1, batch_size + 1):
    # Simulate a value for each record. For this example, we'll use a simple calculation.
    # In a real pipeline, this value would be read from a file or database.
    # Here, the value is just the record ID itself.
    record_value = record_id

    # --- Part 3: Performing conditional checks ---
    # We use 'if/elif/else' to apply our data quality rules.
    if record_value > threshold:
        # If the value is above the threshold, it's considered valid.
        print(f"Record {record_id}: Value of {record_value} is VALID.")
        # We increment the count of valid records.
        valid_records = valid_records + 1
    elif record_value == threshold:
        # If the value is exactly the threshold, it's a special case.
        print(f"Record {record_id}: Value of {record_value} is a BORDERLINE case.")
    else:
        # If the value is below the threshold, it's considered invalid.
        print(f"Record {record_id}: Value of {record_value} is INVALID.")
        # We increment the count of invalid records.
        invalid_records = invalid_records + 1


# --- Part 4: Final Summary and Aggregation ---
# After the loop finishes, we print a summary of the processed batch.
# This part uses the aggregated variables and a final conditional statement.
print("\n--- Batch Processing Summary ---")
print(f"Total Records Processed: {valid_records + invalid_records}")
print(f"Valid Records: {valid_records}")
print(f"Invalid Records: {invalid_records}")

# Use a final check to see if the batch passed or failed quality standards.
if valid_records > invalid_records:
    print("\nCONCLUSION: The batch passed the quality check.")
else:
    print("\nCONCLUSION: The batch failed the quality check. Further investigation needed.")


--- Data Quality Check: Processing a batch of 20 records ---
Threshold for valid data: > 10

Record 1: Value of 1 is INVALID.
Record 2: Value of 2 is INVALID.
Record 3: Value of 3 is INVALID.
Record 4: Value of 4 is INVALID.
Record 5: Value of 5 is INVALID.
Record 6: Value of 6 is INVALID.
Record 7: Value of 7 is INVALID.
Record 8: Value of 8 is INVALID.
Record 9: Value of 9 is INVALID.
Record 10: Value of 10 is a BORDERLINE case.
Record 11: Value of 11 is VALID.
Record 12: Value of 12 is VALID.
Record 13: Value of 13 is VALID.
Record 14: Value of 14 is VALID.
Record 15: Value of 15 is VALID.
Record 16: Value of 16 is VALID.
Record 17: Value of 17 is VALID.
Record 18: Value of 18 is VALID.
Record 19: Value of 19 is VALID.
Record 20: Value of 20 is VALID.

--- Batch Processing Summary ---
Total Records Processed: 19
Valid Records: 10
Invalid Records: 9

CONCLUSION: The batch passed the quality check.


#Day 5 Tuples & Sets

In [11]:
# User Story: As a data engineer, I need to handle data that should not be
# changed (immutable), like database connection details, or filter for unique
# values in a dataset, like a list of distinct users.

# --- Part 1: Tuples - Immutable Data ---
# Tuples are ordered collections of items, but they are immutable, which means
# you cannot change, add, or remove items after creation. They are often used
# for fixed data, like coordinates or configuration settings.

print("--- Using Tuples for Immutable Data ---")
# Example: A tuple to store database connection information.
# The host, port, and user should not be accidentally changed.
db_connection = ("localhost", 5432, "data_pipeline_user")
print(f"Database connection details: {db_connection}")

# You can access elements by their index, just like a list.
print(f"Database host: {db_connection[0]}")
print(f"Database port: {db_connection[1]}")

# If you try to change a value, it will result in a TypeError.
# Try uncommenting the line below to see the error:
# db_connection[1] = 5433

# --- Part 2: Sets - Unique Data ---
# Sets are unordered collections of unique items. They are incredibly
# efficient for tasks like removing duplicate values or checking for
# membership.

print("\n--- Using Sets for Unique Data ---")
# Example: A list of user IDs from a log file, which may have duplicates.
user_ids_raw = ["u101", "u102", "u103", "u102", "u104", "u101"]
print(f"Original user IDs (with duplicates): {user_ids_raw}")

# To get the unique user IDs, we can simply convert the list to a set.
unique_user_ids = set(user_ids_raw)
print(f"Unique user IDs (as a set): {unique_user_ids}")

# You can also perform set operations. For example, to find users in one batch
# but not another, or to find the intersection of two sets of users.
batch_1_users = {"u101", "u102", "u103"}
batch_2_users = {"u102", "u104", "u105"}

# Find users common to both batches.
common_users = batch_1_users.intersection(batch_2_users)
print(f"Common users between batch 1 and 2: {common_users}")

# Find users who are in batch 1 but not in batch 2.
new_users = batch_1_users.difference(batch_2_users)
print(f"Users only in batch 1: {new_users}")


--- Using Tuples for Immutable Data ---
Database connection details: ('localhost', 5432, 'data_pipeline_user')
Database host: localhost
Database port: 5432

--- Using Sets for Unique Data ---
Original user IDs (with duplicates): ['u101', 'u102', 'u103', 'u102', 'u104', 'u101']
Unique user IDs (as a set): {'u102', 'u103', 'u104', 'u101'}
Common users between batch 1 and 2: {'u102'}
Users only in batch 1: {'u103', 'u101'}


#Day 5 : Tuples Practice

In [12]:
# User Story: As a data engineer, I have a batch of fixed metadata for
# each job run, such as the start time, end time, and status. This data
# should not change.

# --- Part 1: Immutable Data with Tuples ---
# A tuple is perfect for storing data that should remain constant.
print("--- Tuple Practice: Job Run Metadata ---")
# Example: A tuple representing a single job's metadata.
job_run_1 = ("job_id_abc", "2024-05-18T10:00:00Z", "SUCCESS")
job_run_2 = ("job_id_xyz", "2024-05-18T11:30:00Z", "FAILED")
print(f"Job 1 Metadata: {job_run_1}")
print(f"Job 2 Metadata: {job_run_2}")

# Accessing elements is straightforward using indexing.
print(f"Status of Job 2: {job_run_2[2]}")

# --- Part 2: Tuple Functions for Data Analysis ---
# Tuples have a few methods that are useful for querying information
# without changing the data.

print("\n--- Tuple Functions ---")
# Example: A tuple of error codes from a batch log.
error_codes = (200, 404, 500, 200, 403, 500, 500)
print(f"Error Codes Tuple: {error_codes}")

# count(): Counts how many times an item appears.
print(f"Number of 200 (success) codes: {error_codes.count(200)}")
print(f"Number of 500 (server error) codes: {error_codes.count(500)}")

# index(): Finds the first position of a specific item.
# This is useful for finding the first occurrence of an error.
try:
    first_server_error_index = error_codes.index(500)
    print(f"First 500 error occurred at index: {first_server_error_index}")
except ValueError:
    print("500 error was not found in the tuple.")


--- Tuple Practice: Job Run Metadata ---
Job 1 Metadata: ('job_id_abc', '2024-05-18T10:00:00Z', 'SUCCESS')
Job 2 Metadata: ('job_id_xyz', '2024-05-18T11:30:00Z', 'FAILED')
Status of Job 2: FAILED

--- Tuple Functions ---
Error Codes Tuple: (200, 404, 500, 200, 403, 500, 500)
Number of 200 (success) codes: 2
Number of 500 (server error) codes: 3
First 500 error occurred at index: 2


#Sets Practice

In [13]:
# User Story: As a data engineer, I need to identify unique user IDs from a
# large, raw log file. I also need to compare different log files to see which
# users they have in common or which are exclusive to one file.

# --- Part 1: Unique Data with Sets ---
# Sets are unordered collections that only store unique values. They are
# extremely efficient for removing duplicates.

print("--- Set Practice: Finding Unique Users ---")
# Example: A list of user IDs from a raw log, containing duplicates.
raw_user_log = ["user_1", "user_2", "user_1", "user_3", "user_4", "user_2"]
print(f"Raw user log (with duplicates): {raw_user_log}")

# Convert the list to a set to get a unique list of users.
unique_users = set(raw_user_log)
print(f"Unique users from the log: {unique_users}")

# --- Part 2: Set Operations for Comparisons ---
# Sets are powerful for comparing different collections of data.

print("\n--- Set Operations ---")
# Example: Two sets representing unique users from two different log files.
file_1_users = {"user_1", "user_2", "user_3", "user_5"}
file_2_users = {"user_3", "user_4", "user_5", "user_6"}

# intersection(): Find common users between the two files.
common_users = file_1_users.intersection(file_2_users)
print(f"Users found in both files (common users): {common_users}")

# union(): Combine all unique users from both files.
all_unique_users = file_1_users.union(file_2_users)
print(f"All unique users from both files: {all_unique_users}")

# difference(): Find users who are in the first set but not the second.
exclusive_to_file_1 = file_1_users.difference(file_2_users)
print(f"Users exclusive to File 1: {exclusive_to_file_1}")


--- Set Practice: Finding Unique Users ---
Raw user log (with duplicates): ['user_1', 'user_2', 'user_1', 'user_3', 'user_4', 'user_2']
Unique users from the log: {'user_2', 'user_4', 'user_3', 'user_1'}

--- Set Operations ---
Users found in both files (common users): {'user_5', 'user_3'}
All unique users from both files: {'user_4', 'user_3', 'user_2', 'user_5', 'user_1', 'user_6'}
Users exclusive to File 1: {'user_2', 'user_1'}


#Data Pipeline Mini Project

In [17]:
# --- User Story ---
# As a data engineer, I need to process raw call detail records (CDRs)
# from a telecom network to identify valid calls, calculate total call
# duration, and count the number of unique subscribers.

# The data is messy. I need to:
# 1. Remove any entries with missing or invalid data.
# 2. Calculate the total call duration in minutes for valid records.
# 3. Count the number of unique subscribers.
# 4. Generate a summary report with key metrics.

# --- Part 1: Raw Data & Initial Setup ---
# Our raw data is a list of dictionaries, representing call records.
# Some records are missing key information or have incorrect data types.
print("--- 1. Initializing Raw Call Records ---")
raw_call_records = [
    {"call_id": "C001", "subscriber_id": "S101", "duration_minutes": 5.5, "status": "COMPLETED"},
    {"call_id": "C002", "subscriber_id": "S102", "duration_minutes": 10.2, "status": "COMPLETED"},
    {"call_id": "C003", "subscriber_id": "S101", "duration_minutes": 3.0, "status": "COMPLETED"},
    {"call_id": "C004", "subscriber_id": "S103", "duration_minutes": None, "status": "COMPLETED"}, # Invalid: duration is None
    {"call_id": "C005", "subscriber_id": "S104", "duration_minutes": 15.0, "status": "FAILED"},
    {"call_id": "C006", "subscriber_id": None, "duration_minutes": 8.5, "status": "COMPLETED"},  # Invalid: subscriber_id is None
    {"call_id": "C007", "subscriber_id": "S105", "duration_minutes": 2.0, "status": "COMPLETED"},
    {"call_id": "C008", "subscriber_id": "S102", "duration_minutes": "Unknown", "status": "COMPLETED"} # Invalid: duration is a string
]
print(f"Total raw records: {len(raw_call_records)}\n")


# --- Part 2: Data Cleaning Function ---
# We'll use a function to clean the data. This demonstrates using
# functions, loops, and conditional statements.

def clean_data(data):
    """
    Cleans a list of call records by validating each entry.
    """
    cleaned_records = []
    invalid_records_count = 0

    # Use a for loop to iterate over each record in the data list.
    for record in data:
        # Check for valid conditions using 'if' and 'and' operators.
        # This checks if the 'duration_minutes' and 'subscriber_id' are present and valid.
        if (record.get("duration_minutes") is not None and
            record.get("subscriber_id") is not None and
            isinstance(record.get("duration_minutes"), (int, float))):
            cleaned_records.append(record)
        else:
            invalid_records_count += 1

    # Return the clean data and a count of invalid records.
    return cleaned_records, invalid_records_count

# --- Part 3: Data Processing Function ---
# This function calculates key metrics from the cleaned data.
# It uses lists, sets, and dictionaries.

def process_data(data):
    """
    Processes cleaned call records to generate a summary report.
    """
    total_call_duration = 0.0

    # Use a set to automatically handle unique subscriber IDs.
    unique_subscribers = set()

    # Loop through the cleaned data.
    for record in data:
        # Add the subscriber ID to our set to find unique subscribers.
        unique_subscribers.add(record['subscriber_id'])

        # Use an arithmetic operator to sum the call duration.
        total_call_duration += record['duration_minutes']

    # Create a dictionary to hold our final report.
    # This is a perfect use case for key-value pairs.
    report = {
        "valid_records_count": len(data),
        "unique_subscriber_count": len(unique_subscribers),
        "total_call_duration_minutes": total_call_duration,
        "average_call_duration": total_call_duration / len(data) if len(data) > 0 else 0
    }

    # Return the final report dictionary.
    return report

# --- Part 4: Main Program Execution ---
# This is where the pipeline runs, orchestrating the functions.
print("--- 2. Cleaning Data... ---")
cleaned_data, invalid_count = clean_data(raw_call_records)
print(f"Removed {invalid_count} invalid records.")
print(f"Remaining valid records: {len(cleaned_data)}\n")

print("--- 3. Processing Data and Generating Report ---")
final_report = process_data(cleaned_data)

# --- Part 5: Final Report ---
# Print the results in a readable format.
print("\n--- Final Telecom Report ---")
# Use a for loop to iterate through the report dictionary's items.
for metric, value in final_report.items():
    if isinstance(value, float):
        print(f"{metric.replace('_', ' ').title()}: {value:,.2f} minutes")
    else:
        print(f"{metric.replace('_', ' ').title()}: {value}")


--- 1. Initializing Raw Call Records ---
Total raw records: 8

--- 2. Cleaning Data... ---
Removed 3 invalid records.
Remaining valid records: 5

--- 3. Processing Data and Generating Report ---

--- Final Telecom Report ---
Valid Records Count: 5
Unique Subscriber Count: 4
Total Call Duration Minutes: 35.70 minutes
Average Call Duration: 7.14 minutes


#Day 6 : Modules & Packages

In [18]:
# --- User Story ---
# As a data engineer, I need to list all files in a specific directory
# and then read data from a CSV file.

# This requires using built-in Python modules.
# We will use the 'os' module to interact with the operating system,
# and the 'csv' module to handle comma-separated values.

# --- Part 1: Using the 'os' Module ---
# The 'os' module is a standard library for interacting with the operating system.
# It's crucial for any data engineer who works with files and directories.
import os

# Let's assume our data is in a sub-directory called 'temp_data'.
# We can create it if it doesn't exist.
directory_path = "temp_data"
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

# Print all files and directories in the current working directory.
print("--- Listing Contents of the Current Directory ---")
# The os.listdir() function returns a list of all entries.
contents = os.listdir(".")
print(f"Directory contents: {contents}")

# --- Part 2: Creating a Sample CSV File ---
# We need a file to read. We'll use a string for now, but in a real scenario,
# this would be an actual file on disk.
sample_csv_data = """id,name,value
1,alpha,100
2,beta,200
3,gamma,150
"""

# Let's save this data to a temporary file.
temp_file_path = os.path.join(directory_path, "sample_data.csv")
with open(temp_file_path, "w") as f:
    f.write(sample_csv_data)
print(f"\nCreated a temporary CSV file at: {temp_file_path}")

# --- Part 3: Using the 'csv' Module ---
# The 'csv' module is part of Python's standard library and is
# designed specifically for reading and writing CSV files.
import csv
import io # We use io.StringIO to treat our string data like a file.

# Use a 'with' statement to open and automatically close the file.
print("\n--- Reading Data from the CSV File ---")

# In a real-world scenario, you would open the file directly like this:
# with open('path/to/file.csv', 'r') as csv_file:
#     csv_reader = csv.reader(csv_file)

# For this example, we use io.StringIO to read from our in-memory string.
csv_reader = csv.reader(io.StringIO(sample_csv_data))

# A for loop is perfect for iterating over the rows in the CSV.
for row in csv_reader:
    # Each row is a list of strings.
    print(f"Row read: {row}")

# Clean up the temporary directory after we're done.
os.remove(temp_file_path)
os.rmdir(directory_path)
print("\nCleaned up temporary directory.")


--- Listing Contents of the Current Directory ---
Directory contents: ['.config', 'temp_data', 'sample_data']

Created a temporary CSV file at: temp_data/sample_data.csv

--- Reading Data from the CSV File ---
Row read: ['id', 'name', 'value']
Row read: ['1', 'alpha', '100']
Row read: ['2', 'beta', '200']
Row read: ['3', 'gamma', '150']

Cleaned up temporary directory.


#Day 7 : String & String Methods

In [19]:
# --- User Story ---
# As a data engineer, I've received a batch of unstructured log data.
# Each log entry contains a timestamp, log level, and a message.
# I need to clean up this raw data and extract specific pieces of
# information for analysis.

# We will use various string methods to accomplish this task.

# --- Part 1: Sample Log Data ---
raw_log_entry = "  [2023-10-26 08:30:15] INFO:  'User 123' processed 10 records.   "

# --- Part 2: Cleaning the String ---
# We need to remove any leading or trailing whitespace.
# The .strip() method is perfect for this.
cleaned_log_entry = raw_log_entry.strip()
print("--- Cleaning the Raw String ---")
print(f"Raw: '{raw_log_entry}'")
print(f"Cleaned: '{cleaned_log_entry}'\n")

# --- Part 3: Finding and Slicing ---
# We want to isolate the log level ('INFO').
# The .find() method returns the index of a substring.
# Slicing allows us to extract a part of the string.
start_index = cleaned_log_entry.find("INFO:")
end_index = cleaned_log_entry.find(":", start_index) + 1
log_level = cleaned_log_entry[start_index:end_index]
print("--- Finding & Slicing ---")
print(f"Extracted Log Level: '{log_level}'\n")

# --- Part 4: Splitting the String ---
# We want to separate the log entry into parts (e.g., timestamp, level, message).
# The .split() method is ideal for this.
# Let's split by the colon (':').
parts = cleaned_log_entry.split(':')
print("--- Splitting the String ---")
print(f"Split parts: {parts}")
# The third part contains the message, but it has extra space.
log_message = parts[2].strip()
print(f"Extracted Log Message: '{log_message}'\n")

# --- Part 5: Replacing and Joining ---
# Let's say we want to anonymize the user ID and standardize the string.
# The .replace() method helps us replace a substring.
anonymous_message = log_message.replace('User 123', 'ANONYMOUS_USER')
print("--- Replacing a Substring ---")
print(f"Anonymized Message: '{anonymous_message}'\n")

# Now, let's say we need to join the parts back together with a different separator.
# The .join() method is used for this.
# It takes an iterable (like a list) and concatenates its elements into a string.
new_separator = " | "
reconstructed_string = new_separator.join(parts)
print("--- Joining Strings ---")
print(f"Reconstructed String: '{reconstructed_string}'")


--- Cleaning the Raw String ---
Raw: '  [2023-10-26 08:30:15] INFO:  'User 123' processed 10 records.   '
Cleaned: '[2023-10-26 08:30:15] INFO:  'User 123' processed 10 records.'

--- Finding & Slicing ---
Extracted Log Level: 'INFO:'

--- Splitting the String ---
Split parts: ['[2023-10-26 08', '30', '15] INFO', "  'User 123' processed 10 records."]
Extracted Log Message: '15] INFO'

--- Replacing a Substring ---
Anonymized Message: '15] INFO'

--- Joining Strings ---
Reconstructed String: '[2023-10-26 08 | 30 | 15] INFO |   'User 123' processed 10 records.'


# Telecom Data Analysis

In [20]:
# --- User Story ---
# As a data engineer for a telecom company, I've received a batch of call records
# from the past hour. My task is to process this raw data to provide a quick
# summary for the business. This summary should include the total number of calls,
# the number of failed calls, and a list of the top callers.

# This project demonstrates the use of variables, data types, lists, dictionaries,
# loops, conditionals, functions, and sets.

# --- Part 1: Simulate Raw Data ---
# In a real-world scenario, this data would come from a database or a file.
# For this project, we'll simulate a list of dictionaries.
raw_call_records = [
    {
        'call_id': 'c101',
        'from': '555-1234',
        'to': '555-5678',
        'duration_seconds': 120,
        'status': 'success'
    },
    {
        'call_id': 'c102',
        'from': '555-1234',
        'to': '555-8888',
        'duration_seconds': 45,
        'status': 'success'
    },
    {
        'call_id': 'c103',
        'from': '555-9999',
        'to': '555-1111',
        'duration_seconds': 0,
        'status': 'failed'
    },
    {
        'call_id': 'c104',
        'from': '555-1234',
        'to': '555-2222',
        'duration_seconds': 90,
        'status': 'success'
    },
    {
        'call_id': 'c105',
        'from': '555-9999',
        'to': '555-4444',
        'duration_seconds': 30,
        'status': 'success'
    },
    {
        'call_id': 'c106',
        'from': '555-3333',
        'to': '555-5555',
        'duration_seconds': 0,
        'status': 'failed'
    }
]

# --- Part 2: Process the Data with a Function ---
# Encapsulating our logic in a function makes our code reusable.
def analyze_call_records(records):
    """
    Analyzes a list of call records to generate a summary report.
    """
    # Initialize variables to hold our metrics.
    total_calls = 0
    success_calls = 0
    failed_calls = 0
    total_duration = 0

    # Use a dictionary to count calls from each number.
    caller_counts = {}

    # We will use a set to find the number of unique callers.
    unique_callers = set()

    # Use a for loop to iterate over each record in the list.
    for call in records:
        total_calls += 1

        # Use conditional statements to check the status.
        if call['status'] == 'success':
            success_calls += 1
            total_duration += call['duration_seconds']
        elif call['status'] == 'failed':
            failed_calls += 1

        # Update the caller counts using a dictionary.
        caller = call['from']
        if caller in caller_counts:
            caller_counts[caller] += 1
        else:
            caller_counts[caller] = 1

        # Add the caller to the set of unique callers.
        unique_callers.add(caller)

    # Use a dictionary to structure our final report.
    report = {
        'total_calls': total_calls,
        'success_rate_percent': (success_calls / total_calls) * 100 if total_calls > 0 else 0,
        'failed_calls': failed_calls,
        'total_duration_minutes': total_duration / 60,
        'unique_callers': len(unique_callers),
        'caller_breakdown': caller_counts
    }

    return report

# --- Part 3: Run the Analysis and Print the Report ---
call_report = analyze_call_records(raw_call_records)

print("--- Telecom Call Analysis Report ---")
print(f"Total Calls Processed: {call_report['total_calls']}")
print(f"Total Failed Calls: {call_report['failed_calls']}")
print(f"Success Rate: {call_report['success_rate_percent']:.2f}%")
print(f"Total Talk Time: {call_report['total_duration_minutes']:.2f} minutes")
print(f"Number of Unique Callers: {call_report['unique_callers']}")

print("\n--- Caller Breakdown ---")
# Use a for loop to iterate over the items in the caller_breakdown dictionary.
for caller, count in call_report['caller_breakdown'].items():
    print(f"  Caller {caller}: {count} calls")


--- Telecom Call Analysis Report ---
Total Calls Processed: 6
Total Failed Calls: 2
Success Rate: 66.67%
Total Talk Time: 4.75 minutes
Number of Unique Callers: 3

--- Caller Breakdown ---
  Caller 555-1234: 3 calls
  Caller 555-9999: 2 calls
  Caller 555-3333: 1 calls


#Day 9: Lists & List Operations

In [14]:
# User Story: As a data engineer, I need to process a batch of customer orders.
# I need to add new orders, remove invalid ones, and perform quick checks on the
# data before it's moved to the next stage of the pipeline.

# --- Part 1: Initializing and Modifying Lists ---
# Lists are ordered, mutable collections of items. You can change them at
# any time, which makes them highly versatile.

print("--- Initializing and Modifying Lists ---")
# Example: A list of customer orders to be processed.
# Each order is represented by its unique ID.
customer_orders = ["ORD_101", "ORD_102", "ORD_103"]
print(f"Original list of customer orders: {customer_orders}")

# We received a new order. Let's add it to the list.
customer_orders.append("ORD_104")
print(f"List after appending a new order: {customer_orders}")

# A customer canceled an order. Let's remove it.
customer_orders.remove("ORD_102")
print(f"List after removing a canceled order: {customer_orders}")

# A special case: sometimes you need to add multiple items at once.
new_orders = ["ORD_105", "ORD_106"]
customer_orders.extend(new_orders)
print(f"List after extending with a new batch of orders: {customer_orders}")

# --- Part 2: Accessing and Slicing Lists ---
# Accessing elements is straightforward with indexing, and slicing
# allows you to work with a subset of your data.

print("\n--- Accessing and Slicing Lists ---")
# Access the first order in the list.
first_order = customer_orders[0]
print(f"The first order to be processed is: {first_order}")

# Access the last order using a negative index.
last_order = customer_orders[-1]
print(f"The last order is: {last_order}")

# Use slicing to get a range of orders. This is useful for
# processing data in smaller chunks.
processed_batch = customer_orders[1:4]  # Gets items from index 1 up to (but not including) 4
print(f"A batch of orders for processing: {processed_batch}")

# --- Part 3: List Functions & Other Operations ---
# There are many built-in functions and methods that are essential for
# common data engineering tasks.

print("\n--- List Functions and Operations ---")
# Get the total number of orders in the list.
print(f"Total number of orders to process: {len(customer_orders)}")

# Check if a specific order exists in the list.
is_order_103_present = "ORD_103" in customer_orders
print(f"Is order 'ORD_103' in the list? {is_order_103_present}")

# Find the index of an item.
try:
    index_of_order_104 = customer_orders.index("ORD_104")
    print(f"Order 'ORD_104' is located at index: {index_of_order_104}")
except ValueError:
    print("Order 'ORD_104' was not found.")


--- Initializing and Modifying Lists ---
Original list of customer orders: ['ORD_101', 'ORD_102', 'ORD_103']
List after appending a new order: ['ORD_101', 'ORD_102', 'ORD_103', 'ORD_104']
List after removing a canceled order: ['ORD_101', 'ORD_103', 'ORD_104']
List after extending with a new batch of orders: ['ORD_101', 'ORD_103', 'ORD_104', 'ORD_105', 'ORD_106']

--- Accessing and Slicing Lists ---
The first order to be processed is: ORD_101
The last order is: ORD_106
A batch of orders for processing: ['ORD_103', 'ORD_104', 'ORD_105']

--- List Functions and Operations ---
Total number of orders to process: 5
Is order 'ORD_103' in the list? True
Order 'ORD_104' is located at index: 2


#Day 10: Dictionaries & Dictionary Methods.

In [15]:
# User Story: As a data engineer, I've received a batch of log entries.
# I need to parse each log entry to extract specific information like the
# event type, server ID, and timestamp and store it in a structured format.

# --- Part 1: Creating and Accessing Dictionaries ---
# Dictionaries store data in key-value pairs. Keys must be unique and
# immutable (like strings, numbers, or tuples).

print("--- Creating and Accessing Dictionaries ---")
# Example: A dictionary representing a single log entry.
log_entry = {
    "timestamp": "2024-05-18T12:00:00Z",
    "event_type": "Data_Ingestion_Start",
    "server_id": "server-101",
    "status": "SUCCESS"
}

# Access a value using its key.
print(f"The event type is: {log_entry['event_type']}")
print(f"The timestamp is: {log_entry['timestamp']}")

# You can also add new key-value pairs to the dictionary.
log_entry["run_duration_ms"] = 1500
print(f"Log entry with new data: {log_entry}")

# --- Part 2: Dictionary Methods ---
# Dictionaries have useful methods for interacting with their keys and values.

print("\n--- Exploring Dictionary Methods ---")
# Get a list of all keys using the .keys() method.
keys = log_entry.keys()
print(f"All keys in the log entry: {list(keys)}")

# Get a list of all values using the .values() method.
values = log_entry.values()
print(f"All values in the log entry: {list(values)}")

# Check if a key exists in the dictionary using the 'in' operator.
# This is a critical check for preventing errors.
is_run_duration_present = "run_duration_ms" in log_entry
print(f"Is 'run_duration_ms' key present? {is_run_duration_present}")

is_user_id_present = "user_id" in log_entry
print(f"Is 'user_id' key present? {is_user_id_present}")

# The .get() method is a safe way to access a value. It returns None if
# the key is not found, preventing a KeyError.
status = log_entry.get("status")
print(f"The status is: {status}")

user_id = log_entry.get("user_id", "NOT_FOUND")
print(f"The user ID is: {user_id}")

# --- Part 3: Deleting Data ---
# You can remove key-value pairs from a dictionary using del.
print("\n--- Deleting Dictionary Items ---")
print(f"Original dictionary before deletion: {log_entry}")
del log_entry["status"]
print(f"Dictionary after deleting 'status': {log_entry}")


--- Creating and Accessing Dictionaries ---
The event type is: Data_Ingestion_Start
The timestamp is: 2024-05-18T12:00:00Z
Log entry with new data: {'timestamp': '2024-05-18T12:00:00Z', 'event_type': 'Data_Ingestion_Start', 'server_id': 'server-101', 'status': 'SUCCESS', 'run_duration_ms': 1500}

--- Exploring Dictionary Methods ---
All keys in the log entry: ['timestamp', 'event_type', 'server_id', 'status', 'run_duration_ms']
All values in the log entry: ['2024-05-18T12:00:00Z', 'Data_Ingestion_Start', 'server-101', 'SUCCESS', 1500]
Is 'run_duration_ms' key present? True
Is 'user_id' key present? False
The status is: SUCCESS
The user ID is: NOT_FOUND

--- Deleting Dictionary Items ---
Original dictionary before deletion: {'timestamp': '2024-05-18T12:00:00Z', 'event_type': 'Data_Ingestion_Start', 'server_id': 'server-101', 'status': 'SUCCESS', 'run_duration_ms': 1500}
Dictionary after deleting 'status': {'timestamp': '2024-05-18T12:00:00Z', 'event_type': 'Data_Ingestion_Start', 'serve