In [1]:
import glob #used for selecting files
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime

In [2]:
tmpfile = 'temp.tmp' #store all extracted data
logfile = 'logfile.txt' #store event logs
targetfile='transformed_data.csv' #store transformed data

# Extract

## CSV extract function

In [3]:
def extract_from_csv(file_to_process):
    dataframe= pd.read_csv(file_to_process)
    return dataframe

## JSON extract function

In [4]:
def extract_from_json(file_to_process):
    dataframe=pd.read_json(file_to_process, lines=True)
    return dataframe

## XML extract function

In [5]:
def extract_from_xml(file_to_process):
    dataframe=pd.DataFrame(columns=['name', 'height', 'weight'])
    tree=ET.parse(file_to_process)
    root=tree.getroot()
    for person in root:
        name=person.find('name').text
        height=float(person.find('height').text)
        weight=float(person.find('weight').text)
        dataframe=dataframe.append({'name': name, 'height': height, 'weight': weight}, ignore_index=True)
    return dataframe

# Total Extract function

In [6]:
def extract():
    extracted_data=pd.DataFrame(columns=['name','height','weight'])
    #extract all csv files:
    for csvfile in glob.glob('*.csv'):
        extracted_data=extracted_data.append(extract_from_csv(csvfile),ignore_index=True)
    
    #extract all json files:
    for jsonfile in glob.glob('*.json'):
        extracted_data=extracted_data.append(extract_from_json(jsonfile),ignore_index=True)
    
    #extract all xml files:
    for xmlfile in glob.glob('*.xml'):
        extracted_data=extracted_data.append(extract_from_xml(xmlfile),ignore_index=True)
    
    return extracted_data

# Transform
- Convert height from inches to milimeter
- Convert weight from pound to kilograms

In [7]:
def transform(data):
    data['height']=round(data.height*0.0254,2)
    data['weight']=round(data.weight*0.45359237,2)
    return data

# Load

In [8]:
def load(targetfile, data_to_load):
    data_to_load.to_csv(targetfile)

# Logging

In [9]:
def log(message):
    timestamp_format= '%Y-%m-%d - %H:%M:%S'
    now=datetime.now()
    timestamp=now.strftime(timestamp_format)
    with open('logfile.txt','a') as f:
        f.write(timestamp + ', ' + message + '\n')

# Running ETL process

In [10]:
log("ETL Job started")

In [11]:
log("Extract phase started")
extracted_data=extract()

In [12]:
log("Extract phase ended")
extracted_data

Unnamed: 0.1,name,height,weight,Unnamed: 0
0,alex,65.78,112.99,
1,ajay,71.52,136.49,
2,alice,69.40,153.03,
3,ravi,68.22,142.34,
4,joe,67.79,144.30,
...,...,...,...,...
73,ivan,67.62,114.14,
74,simon,67.90,112.37,
75,jacob,66.78,120.67,
76,cindy,66.49,127.45,


In [13]:
log("Transform phase started")
transformed_data=transform(extracted_data)
log("Transform phase ended")
transformed_data.head()

Unnamed: 0.1,name,height,weight,Unnamed: 0
0,alex,1.67,51.25,
1,ajay,1.82,61.91,
2,alice,1.76,69.41,
3,ravi,1.73,64.56,
4,joe,1.72,65.45,


In [14]:
log("Load phase started")
load(targetfile,transformed_data)
log("load phase ended")

In [15]:
log("ETL Job ended")