In [1]:
# generate sample data
import pandas as pd
import random


# Generate sample data: one row per student, one score column per subject.
# Sizes and score bounds are named once so they can be tuned in one place.
N_STUDENTS = 1000000
N_SUBJECTS = 7
SCORE_MIN, SCORE_MAX = 600, 1000
SEED = 42

# A dedicated, seeded generator makes the sample identical on every
# Restart & Run All and leaves the global `random` state untouched.
rng = random.Random(SEED)

data = {
    'roll_num': range(1, N_STUDENTS + 1),
    'name': [f'student {i}' for i in range(1, N_STUDENTS + 1)],
}

# Columns are filled in order (sub1, sub2, ...); each score is a uniform
# draw from [SCORE_MIN, SCORE_MAX] rounded to two decimals.
for subject_idx in range(1, N_SUBJECTS + 1):
    data[f'sub{subject_idx}'] = [
        round(rng.uniform(SCORE_MIN, SCORE_MAX), 2) for _ in range(N_STUDENTS)
    ]


In [2]:
# Build the DataFrame from the column dictionary generated above
df = pd.DataFrame(data=data)

In [3]:
# Persist the frame to disk; index=False keeps the row index out of the file
csv_path = 'students.csv'
df.to_csv(path_or_buf=csv_path, index=False)
print("Sample CSV file 'students.csv' created!")

Sample CSV file 'students.csv' created!


In [4]:
# ETL pipeline implementation

import duckdb
import pandas as pd
import time

# Extract: load the student data from the CSV file and time the read.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and may jump.
start_time = time.perf_counter()
df = pd.read_csv('students.csv')
extract_time = time.perf_counter() - start_time
print(f"Data extracted in {extract_time:.2f} seconds.")

Data extracted in 0.42 seconds.


In [5]:
# Preview the first five rows of the extracted frame
df.head(n=5)

Unnamed: 0,roll_num,name,sub1,sub2,sub3,sub4
0,1,student 1,992.89,897.64,964.23,881.64
1,2,student 2,978.02,802.29,621.58,629.92
2,3,student 3,958.62,939.72,895.76,723.03
3,4,student 4,638.21,856.87,611.92,834.12
4,5,student 5,641.62,612.04,659.39,829.27


In [6]:
# Open a connection to an in-memory DuckDB database
conn = duckdb.connect(':memory:')

In [7]:
# Transform: add each student's mean score across the seven subject columns.
subject_cols = ['sub1', 'sub2', 'sub3', 'sub4', 'sub5', 'sub6', 'sub7']

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and may jump.
start_time = time.perf_counter()
# axis=1 averages across the columns of each row (one value per student).
df['average_score'] = df[subject_cols].mean(axis=1)
transform_time = time.perf_counter() - start_time
print(f"Data transformed in {transform_time:.2f} seconds.")

Data transformed in 0.08 seconds.


In [8]:
# Load: materialise the DataFrame as the DuckDB table `students`.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and may jump.
start_time = time.perf_counter()
# The SQL refers to `df` by name; DuckDB resolves it to the pandas DataFrame
# bound to that variable. OR REPLACE keeps this cell re-runnable: a plain
# CREATE TABLE fails with "table already exists" on the second execution.
conn.execute("CREATE OR REPLACE TABLE students AS SELECT * FROM df")
load_time = time.perf_counter() - start_time
print(f"Data loaded into DuckDB in {load_time:.2f} seconds.")

Data loaded into DuckDB in 0.14 seconds.


In [9]:
# Performance stats: the ETL total is the sum of the three stage timings
total_time = extract_time + transform_time
total_time += load_time
print(f"Total ETL time: {total_time:.2f} seconds.")

Total ETL time: 0.64 seconds.


In [10]:
# Analyze: fetch every student whose average score exceeds 850 and time it.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and may jump.
start_time = time.perf_counter()
# fetchall() returns the matching rows as a list of tuples, so the measured
# time covers both running the query and materialising the rows in Python.
result = conn.execute("SELECT * FROM students WHERE average_score > 850").fetchall()
query_time = time.perf_counter() - start_time
print(f"Data queried in {query_time:.2f} seconds")

Data queried in 0.19 seconds


In [11]:
# Show a sample of the query result
print("Query Result:")
preview_rows = result[:10]  # only the first 10 rows, for brevity
for row in preview_rows:
    print(row)

Query Result:
(1, 'student 1', 992.89, 897.64, 964.23, 881.64, 934.1)
(3, 'student 3', 958.62, 939.72, 895.76, 723.03, 879.2825)
(8, 'student 8', 976.13, 801.04, 794.07, 833.61, 851.2125000000001)
(14, 'student 14', 931.81, 749.59, 833.81, 936.23, 862.86)
(19, 'student 19', 998.72, 916.76, 822.72, 896.08, 908.5699999999999)
(21, 'student 21', 860.06, 720.85, 912.66, 993.71, 871.8199999999999)
(22, 'student 22', 886.68, 887.91, 974.11, 971.2, 929.9749999999999)
(27, 'student 27', 946.24, 971.62, 919.18, 961.74, 949.6949999999999)
(34, 'student 34', 878.94, 848.41, 762.31, 944.7, 858.5899999999999)
(42, 'student 42', 685.55, 863.44, 931.17, 920.71, 850.2175)


In [12]:
# Gather every timing measured above into one mapping of label -> seconds
stat_labels = ('Extract Time', 'Transform Time', 'Load Time', 'Query Time', 'Total ETL Time')
stat_values = (extract_time, transform_time, load_time, query_time, total_time)
performance_stats = dict(zip(stat_labels, stat_values))


In [13]:
# Report each collected timing on its own line, to two decimals
print("\nPerformance Stats:")
for key in performance_stats:
    value = performance_stats[key]
    print(f"{key}: {value:.2f} seconds")


Performance Stats:
Extract Time: 0.42 seconds
Transform Time: 0.08 seconds
Load Time: 0.14 seconds
Query Time: 0.19 seconds
Total ETL Time: 0.64 seconds
