In [45]:
import pandas as pd
import glob
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from datetime import datetime
import os

import warnings
warnings.filterwarnings('ignore')

In [46]:
# Directory that is searched for the input *.csv, *.json and *.xml files.
_path = './data/'
# File that log_progress() appends its timestamped messages to.
log_file = 'log_file.csv'
# Output file name handed to load() for the transformed data.
target_file = 'transformed_data.csv'

# Task 1 : Extraction

In [47]:
def extract_from_csv(filepath):
  """Read one CSV file into a DataFrame tagged with its source type.

  The file is read with ``pd.read_csv`` using default options. Any of the
  columns ``name``, ``height``, ``weight`` and ``from`` that the file does not
  provide is appended (filled with NaN) after the file's own columns, and the
  ``from`` column is then set to ``'.csv'`` for every row.

  Args:
    filepath: Path of the CSV file to read.

  Returns:
    DataFrame with a fresh 0..n-1 index holding the file's rows.
  """
  expected_columns = ['name', 'height', 'weight', 'from']
  df = pd.read_csv(filepath).reset_index(drop=True)

  # Add the missing schema columns directly. Concatenating with an empty
  # all-object placeholder frame achieves the same, but lets the placeholder
  # influence the result dtypes (pandas deprecated ignoring empty frames).
  absent = [column for column in expected_columns if column not in df.columns]
  df = df.reindex(columns=list(df.columns) + absent)
  df['from'] = '.csv'

  return df

def extract_from_xml(filepath):
  """Parse one XML file of person records into a DataFrame.

  Every direct child of the document root is treated as one record and must
  contain ``name``, ``height`` and ``weight`` child elements; ``height`` and
  ``weight`` are converted with ``float``.

  Args:
    filepath: Path of the XML file to parse.

  Returns:
    DataFrame with the columns ``name``, ``height``, ``weight`` and ``from``
    (always ``'.xml'``), one row per record in document order. A root element
    without children yields an empty frame with the same columns.

  Raises:
    AttributeError: If a record lacks one of the three child elements.
    TypeError: If a ``height`` or ``weight`` element has no text.
    ValueError: If ``height`` or ``weight`` text is not a valid number.
  """
  columns = ['name', 'height', 'weight', 'from']
  root = ET.parse(filepath).getroot()

  # Collect plain dicts and build the frame once: calling pd.concat inside the
  # loop re-copies every earlier row on each iteration.
  records = [
    {
      'name': person.find('name').text,
      'height': float(person.find('height').text),
      'weight': float(person.find('weight').text),
      'from': '.xml',
    }
    for person in root
  ]

  return pd.DataFrame(records, columns=columns)

def extract_from_json(filepath):
  """Read one JSON file into a DataFrame tagged with its source type.

  The file is read with ``pd.read_json`` using default options. Any of the
  columns ``name``, ``height``, ``weight`` and ``from`` that the file does not
  provide is appended (filled with NaN) after the file's own columns, and the
  ``from`` column is then set to ``'.json'`` for every row.

  Args:
    filepath: Path of the JSON file to read.

  Returns:
    DataFrame with a fresh 0..n-1 index holding the file's rows.
  """
  expected_columns = ['name', 'height', 'weight', 'from']
  df = pd.read_json(filepath).reset_index(drop=True)

  # Add the missing schema columns directly. Concatenating with an empty
  # all-object placeholder frame achieves the same, but lets the placeholder
  # influence the result dtypes (pandas deprecated ignoring empty frames).
  absent = [column for column in expected_columns if column not in df.columns]
  df = df.reindex(columns=list(df.columns) + absent)
  df['from'] = '.json'

  return df

def extract(datapath):
  """Collect every CSV, JSON and XML file in a directory into one DataFrame.

  Files are discovered with ``glob`` (``*.csv``, then ``*.json``, then
  ``*.xml``) directly inside ``datapath`` and read with the matching
  ``extract_from_*`` function.

  Args:
    datapath: Directory to search. This argument is what gets searched; the
      function does not fall back to any module-level path.

  Returns:
    DataFrame with a fresh 0..n-1 index. Rows of the file that was read last
    come first (XML rows before JSON rows before CSV rows), which is the order
    callers of this function have always received. If no file matches, an
    empty frame with the columns ``name``, ``height``, ``weight`` and ``from``
    is returned.
  """
  frames = []

  # Get csv files using glob
  for csvfilepath in glob.glob(os.path.join(datapath, '*.csv')):
    frames.append(extract_from_csv(csvfilepath))

  # Get json files using glob
  for jsonfilepath in glob.glob(os.path.join(datapath, '*.json')):
    frames.append(extract_from_json(jsonfilepath))

  # Get xml files using glob
  for xmlfilepath in glob.glob(os.path.join(datapath, '*.xml')):
    frames.append(extract_from_xml(xmlfilepath))

  if not frames:
    return pd.DataFrame(columns=['name', 'height', 'weight', 'from'])

  # Keep the established newest-first row order, but concatenate only once
  # instead of re-copying the accumulated rows for every file.
  frames.reverse()
  return pd.concat(frames, ignore_index=True)


# Task 2 : Transformation

In [48]:
def transform(data):
  """Return a de-duplicated copy of ``data`` with rescaled measurements.

  Fully identical rows are dropped and the index is renumbered from 0. The
  ``height`` column is multiplied by 0.0254 and the ``weight`` column by
  0.45359237, each rounded to two decimals. These are the inch-to-metre and
  pound-to-kilogram factors, so the input is presumably in inches and pounds
  (TODO confirm against the data source).

  Args:
    data: DataFrame with at least ``height`` and ``weight`` columns. It is
      not modified.

  Returns:
    A new DataFrame with the same columns as ``data``.
  """
  deduplicated = data.drop_duplicates(ignore_index=True)

  return deduplicated.assign(
    height=lambda frame: (frame.height * 0.0254).round(2),
    weight=lambda frame: (frame.weight * 0.45359237).round(2),
  )

# Task 3 : Load

In [49]:
def load(data, target_file: str):
  """Write ``data`` to a CSV file without the index column.

  Args:
    data: DataFrame to save.
    target_file: Path of the CSV file to create or overwrite. The file named
      by this argument is the one written, not a module-level default.
  """
  data.to_csv(target_file, index=False)

# Task 4 : Logging

In [50]:
def log_progress(message):
    """Append one ``timestamp,message`` line to the log file.

    The line is written to the file named by the module-level ``log_file``.
    The timestamp is the current local time formatted as
    Year-AbbreviatedMonthName-Day-Hour:Minute:Second.

    Args:
        message: Text to record. It is written verbatim after the comma.
    """
    # %b is the documented directive for the abbreviated month name. The
    # previously used %h is a C-library alias that Python does not document
    # and that is not accepted on every platform.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    with open(log_file, "a") as f:
        f.write(timestamp + ',' + message + '\n')

# Testing

In [55]:
# Run the whole pipeline once (extract -> transform -> load). Each phase is
# bracketed by log_progress() calls, so the order of the statements below is
# the order of the lines appended to the log file.

# Log the initialization of the ETL process
log_progress("ETL Job Started")

# Log the beginning of the Extraction process
log_progress("Extract phase Started")
extracted_data = extract(_path)

# Log the completion of the Extraction process
log_progress("Extract phase Ended")

# Log the beginning of the Transformation process
log_progress("Transform phase Started")
transformed_data = transform(extracted_data)
# Show the transformed frame in the cell output for a visual check
print("Transformed Data")
print(transformed_data)

# Log the completion of the Transformation process
log_progress("Transform phase Ended")

# Log the beginning of the Loading process
log_progress("Load phase Started")
load(transformed_data, target_file)

# Log the completion of the Loading process
log_progress("Load phase Ended")

# Log the completion of the ETL process
log_progress("ETL Job Ended")

Transformed Data
     name  height  weight   from
0   simon    1.72   50.97   .xml
1   jacob    1.70   54.73   .xml
2   cindy    1.69   57.81   .xml
3    ivan    1.72   51.77   .xml
4    jack    1.74   55.93  .json
5     tom    1.77   64.18  .json
6   tracy    1.78   61.90  .json
7    john    1.72   50.97  .json
8    alex    1.67   51.25   .csv
9    ajay    1.82   61.91   .csv
10  alice    1.76   69.41   .csv
11   ravi    1.73   64.56   .csv
12    joe    1.72   65.45   .csv
