<a href="https://colab.research.google.com/github/newGitHub1234/Project_1_ETL/blob/main/Project_1_ETL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import glob
import pandas as pd
import xml.etree.ElementTree as ET
import datetime
import csv
import json
import os
import logging
import pytz

# Getting the IST timezone

ist_timezone = pytz.timezone('Asia/Kolkata')

# Converting current time to IST

current_time_ist = datetime.datetime.now(ist_timezone)

# Format the time with IST

formatted_time_ist = current_time_ist.strftime('%Y-%m-%d %H:%M:%S.%f %Z%z')

# Main folder path

folder_path = '/content/Project_ETL'

# Configuring logging

logging.basicConfig(filename='/content/log_file.txt', level=logging.INFO,
                    format=f'{formatted_time_ist} - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S', force=True)

# Function to extract data from CSV file

def extract_csv(file_path):
    logging.info(f"Extraction - Starting to process CSV file: {file_path}")
    data = []
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            data.append(row)
    logging.info(f"Extraction - Finished processing CSV file: {file_path}")
    return pd.DataFrame(data)

# Function to extract data from JSON file

def extract_json(file_path):
    logging.info(f"Extraction - Starting to process JSON file: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    logging.info(f"Extraction - Finished processing JSON file: {file_path}")
    #print("json data", data)
    return pd.DataFrame(data)

# Function to extract data from XML file

def extract_xml(file_path):
    logging.info(f"Extraction - Starting to process XML file: {file_path}")
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []
    for elem in root:
        data.append({child.tag: child.text for child in elem})
    logging.info(f"Extraction - Finished processing XML file: {file_path}")
    return pd.DataFrame(data)

# Function to Process the files with extensions and Transform extracted Data

def process_files(folder_path, filename):

    file_path = os.path.join(folder_path, filename)
    file_extension = filename.split('.')[-1].lower()

    try:
        if file_extension == 'csv':
            print(f"Processing CSV file: {filename}")
            data = extract_csv(file_path)
        elif file_extension == 'json':
            print(f"Processing JSON file: {filename}")
            data = extract_json(file_path)
        elif file_extension == 'xml':
            print(f"Processing XML file: {filename}")
            data = extract_xml(file_path)
        else:
            print(f"Unsupported file type: {filename}")
            return pd.DataFrame()

        return data

    except (pd.errors.EmptyDataError, json.JSONDecodeError, ET.ParseError) as e:
        print(f"Warning: Error processing file '{filename}': {e}")
        return pd.DataFrame()


# Function to process the files and combine data

def process_all_files_in_folder(folder_path):
    logging.info(f"Processing - Starting to process all files in folder: {folder_path}")

    all_data_frames = []

    file_paths = glob.glob(os.path.join(folder_path, '*'))

    for file_path in file_paths:
        filename = os.path.basename(file_path)
        df = process_files(folder_path, filename)
        if not df.empty:
            all_data_frames.append(df)

    if all_data_frames:
        combined_data = pd.concat(all_data_frames, ignore_index=True)
    else:
        print("Warning: No valid data files found in the folder.")
        combined_data = pd.DataFrame()

    logging.info(f"Processing - Finished processing all files in folder: {folder_path}")
    return combined_data

# Processing all the files in the folder

combined_df = process_all_files_in_folder(folder_path)

# Converting combined_df to pandas dataframe

df = pd.DataFrame(combined_df)

# Conversion function

def convert_units(df):

    # Convert height from inches to meters

    df['Height_meters'] = pd.to_numeric(df['height']) * 0.0254

    # Convert weight from pounds to kilograms

    df['Weight_kg'] = pd.to_numeric(df['weight']) * 0.453592

    return df

# Converting the units

df_converted = convert_units(df)

# Saving the final output, combined data Dataframe to a CSV file

output_file_path = '/content/transformed_data.csv'
df_converted.to_csv(output_file_path, index=False)
logging.info(f"Loading - Transformed data saved to: {output_file_path}")


Processing JSON file: source1.json
Processing XML file: source2.xml
Processing CSV file: transformed_data.csv
Unsupported file type: log_file.txt
Processing CSV file: source2.csv
Processing JSON file: source2.json
Processing XML file: source1.xml
Processing XML file: source3.xml
Processing CSV file: source3.csv
Processing CSV file: source1.csv
Processing JSON file: source3.json
