# In [5]:
# Import necessary libraries
import pandas as pd
import sqlite3
import os

# Step 1: Extract Data (From CSV)
def extract_data(csv_file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Parameters:
        csv_file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that path.

    Raises:
        FileNotFoundError: If nothing exists at ``csv_file_path``.
    """
    # Guard clause: fail early with a descriptive message when the path is absent
    if not os.path.exists(csv_file_path):
        raise FileNotFoundError(f"The file {csv_file_path} does not exist.")

    frame = pd.read_csv(csv_file_path)
    print(f"Data extracted successfully from {csv_file_path}")
    return frame

# Step 2: Transform Data (Cleaning and Processing)
def _strip_text_column(col):
    """Return *col* with surrounding whitespace removed from its string values."""
    if pd.api.types.is_object_dtype(col.dtype):
        # Object columns may mix strings with other values; the .str accessor
        # would replace every non-string with NaN, so strip element by element
        # and keep the object dtype.
        stripped = [v.strip() if isinstance(v, str) else v for v in col]
        return pd.Series(stripped, index=col.index, name=col.name, dtype=object)
    if pd.api.types.is_string_dtype(col.dtype):
        # Dedicated string dtypes hold only strings, so the vectorised
        # accessor is safe and preserves the dtype.
        return col.str.strip()
    return col


def transform_data(data):
    """
    Function to transform data - Cleaning and Processing

    Steps, in order:
      1. Drop every row that contains at least one missing value.
      2. Lowercase the column names (non-string labels are left as they are).
      3. Strip leading/trailing whitespace from string values in text columns;
         non-string values in mixed object columns are kept unchanged.

    Parameters:
        data: The pandas DataFrame to clean. It is not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    print("Starting transformation process...")

    # Remove rows with missing data (dropna returns a new frame)
    data_cleaned = data.dropna()

    # Convert column names to lowercase; only string labels have .lower()
    data_cleaned = data_cleaned.rename(
        columns=lambda label: label.lower() if isinstance(label, str) else label
    )

    # Remove leading/trailing whitespace from string fields, column by column
    data_cleaned = data_cleaned.apply(_strip_text_column)

    print("Data transformation complete.")
    return data_cleaned

# Step 3: Load Data (Into SQLite Database)
def load_data(data, db_name, table_name):
    """
    Function to load data into an SQLite database.

    Parameters:
        data: Object exposing ``to_sql`` (a pandas DataFrame) holding the rows to store.
        db_name: Path of the SQLite database file, passed to ``sqlite3.connect``.
        table_name: Name of the destination table. Because ``if_exists='replace'``
            is used, an existing table with this name is replaced.

    Raises:
        Whatever ``sqlite3.connect`` or ``data.to_sql`` raises; the connection
        is closed before the exception propagates.
    """
    # Connect to SQLite database (it will create a new database if it doesn't exist)
    conn = sqlite3.connect(db_name)
    try:
        print(f"Loading data into {table_name} table in the {db_name} database...")

        # Load the cleaned data into the database, without the DataFrame index
        data.to_sql(table_name, conn, if_exists='replace', index=False)

        # Commit only when the write succeeded
        conn.commit()
    finally:
        # Always release the connection, even if the write failed
        conn.close()

    print(f"Data loaded successfully into {db_name} database, table: {table_name}")

# Step 4: ETL Pipeline Function
def etl_pipeline(csv_file_path, db_name, table_name):
    """
    Run the three ETL stages in order: extract, transform, load.

    Parameters:
        csv_file_path: Path of the CSV file handed to ``extract_data``.
        db_name: SQLite database name handed to ``load_data``.
        table_name: Destination table name handed to ``load_data``.
    """
    # Extract: read the raw rows from the CSV file
    raw_frame = extract_data(csv_file_path)

    # Transform, then Load: the cleaned frame goes straight into the database
    load_data(transform_data(raw_frame), db_name, table_name)

# Build a small sample dataset so the pipeline can be exercised end to end
sample_csv_path = 'sample_data.csv'
sample_data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 22, 35, 32],
    'City': ['New York', 'Paris', 'Berlin', 'London'],
}
sample_df = pd.DataFrame(sample_data)

# Write the sample to disk (without the index column) for extract_data to read
sample_df.to_csv(sample_csv_path, index=False)
print(f"Sample CSV file created at: {sample_csv_path}")

# Pipeline configuration
csv_file_path = sample_csv_path  # CSV produced above
db_name = "datawarehouse.db"     # SQLite database file
table_name = "cleaned_data"      # Destination table

# Run extract -> transform -> load with that configuration
etl_pipeline(csv_file_path=csv_file_path, db_name=db_name, table_name=table_name)


# Output:
# Sample CSV file created at: sample_data.csv
# Data extracted successfully from sample_data.csv
# Starting transformation process...
# Data transformation complete.
# Loading data into cleaned_data table in the datawarehouse.db database...
# Data loaded successfully into datawarehouse.db database, table: cleaned_data
