In [1]:
import pandas as pd
import sqlite3
import os

In [7]:
# Input CSV locations, all under ../data relative to the working directory
user_path, transaction_path, product_path = (
    os.path.join('..', 'data', csv_name)
    for csv_name in (
        'USER_TAKEHOME.csv',
        'TRANSACTION_TAKEHOME.csv',
        'PRODUCTS_TAKEHOME.csv',
    )
)

In [11]:
# SQLite file that receives the cleaned tables (created in the working directory)
db_path = 'takehome.db'

In [13]:
# Read the three raw CSV extracts into DataFrames (users, transactions, products)
df_user, df_transaction, df_product = (
    pd.read_csv(csv_path)
    for csv_path in (user_path, transaction_path, product_path)
)


In [23]:
# Clean User Data
# Reduce both timestamp columns to plain calendar dates; values that cannot
# be parsed are coerced to NaT instead of raising.
for date_col in ('CREATED_DATE', 'BIRTH_DATE'):
    df_user[date_col] = pd.to_datetime(df_user[date_col], errors='coerce').dt.date

# Remove exact duplicate rows once the dates have been normalized
df_user.drop_duplicates(inplace=True)

In [25]:
# Display the user frame after cleaning (the notebook renders a truncated preview)
df_user

Unnamed: 0,ID,CREATED_DATE,BIRTH_DATE,STATE,LANGUAGE,GENDER
0,5ef3b4f17053ab141787697d,2020-06-24,2000-08-11,CA,es-419,female
1,5ff220d383fcfc12622b96bc,2021-01-03,2001-09-24,PA,en,female
2,6477950aa55bb77a0e27ee10,2023-05-31,1994-10-28,FL,es-419,female
3,658a306e99b40f103b63ccf8,2023-12-26,NaT,NC,en,
4,653cf5d6a225ea102b7ecdc2,2023-10-28,1972-03-19,PA,en,female
...,...,...,...,...,...,...
99995,61fc06d41febf771966da8fa,2022-02-03,1992-03-16,CA,en,female
99996,6391e7ef90ad5449ec5f782d,2022-12-08,1993-09-23,MO,en,female
99997,637d5efdd6f2a49c49934dcb,2022-11-22,1983-04-19,RI,en,female
99998,5f0de23b05d8a6147dc0cafa,2020-07-14,1995-06-09,DE,en,female


In [27]:
# Clean Transactions Data
# Drop exact duplicate rows first, then reduce both timestamp columns to
# calendar dates (values that cannot be parsed are coerced to NaT).
df_transaction.drop_duplicates(inplace=True)
df_transaction['PURCHASE_DATE'] = pd.to_datetime(df_transaction['PURCHASE_DATE'], errors='coerce').dt.date
df_transaction['SCAN_DATE'] = pd.to_datetime(df_transaction['SCAN_DATE'], errors='coerce').dt.date

# Store barcodes as text, but keep missing barcodes as real nulls.
# A bare astype(str) turns NaN into the literal string 'nan', which is then
# written to SQLite as text and would equi-join against any other 'nan' row.
# Non-missing values keep exactly the text astype(str) produces, so they
# still match the product table's BARCODE text.
barcode_present = df_transaction['BARCODE'].notna()
df_transaction['BARCODE'] = df_transaction['BARCODE'].astype(str).where(barcode_present)
# NOTE(review): FINAL_QUANTITY shows the text 'zero' and FINAL_SALE shows blanks
# in the preview; both columns are left untouched here -- confirm whether
# downstream SQL expects them as text before converting to numeric.

In [29]:
# Display the transaction frame after cleaning (the notebook renders a truncated preview)
df_transaction

Unnamed: 0,RECEIPT_ID,PURCHASE_DATE,SCAN_DATE,STORE_NAME,USER_ID,BARCODE,FINAL_QUANTITY,FINAL_SALE
0,0000d256-4041-4a3e-adc4-5623fb6e0c99,2024-08-21,2024-08-21,WALMART,63b73a7f3d310dceeabd4758,15300014978.0,1.00,
1,0001455d-7a92-4a7b-a1d2-c747af1c8fd3,2024-07-20,2024-07-20,ALDI,62c08877baa38d1a1f6c211a,,zero,1.49
2,00017e0a-7851-42fb-bfab-0baa96e23586,2024-08-18,2024-08-19,WALMART,60842f207ac8b7729e472020,78742229751.0,1.00,
3,000239aa-3478-453d-801e-66a82e39c8af,2024-06-18,2024-06-19,FOOD LION,63fcd7cea4f8442c3386b589,783399746536.0,zero,3.49
4,00026b4c-dfe8-49dd-b026-4c2f0fd5c6a1,2024-07-04,2024-07-05,RANDALLS,6193231ae9b3d75037b0f928,47900501183.0,1.00,
...,...,...,...,...,...,...,...,...
49995,b5cd61a9-8033-4913-a5c4-fb3f65e3a321,2024-08-21,2024-08-31,TARGET,6154bcf098f885648de2f299,85239110669.0,2.00,1.18
49996,e1b2f634-c9ad-4152-b662-4b22efc25862,2024-08-11,2024-08-11,STOP & SHOP,60aa809f188b926b2244c974,46100400555.0,1.00,2.00
49997,b07ef8dd-e444-40a2-819b-f74a3e5f1ae7,2024-07-11,2024-07-11,WALMART,60bd26e83dc3b13a15c5f4e7,646630019670.0,1.00,20.96
49998,42475141-bef4-4df2-aa37-72577e2512bb,2024-06-18,2024-06-18,MARKET BASKET,6169912fac47744405af62b7,41800501519.0,1.00,3.00


In [31]:
# Clean Product Data
# Remove exact duplicate rows.
df_product.drop_duplicates(inplace=True)

# Store barcodes as text, but keep missing barcodes as real nulls.
# A bare astype(str) turns NaN into the literal string 'nan', which is then
# written to SQLite as text and would equi-join against any other 'nan' row.
# Non-missing values keep exactly the text astype(str) produces, so they
# still match the transaction table's BARCODE text.
barcode_present = df_product['BARCODE'].notna()
df_product['BARCODE'] = df_product['BARCODE'].astype(str).where(barcode_present)

In [33]:
# Display the product frame after cleaning (the notebook renders a truncated preview)
df_product

Unnamed: 0,CATEGORY_1,CATEGORY_2,CATEGORY_3,CATEGORY_4,MANUFACTURER,BRAND,BARCODE
0,Health & Wellness,Sexual Health,Conductivity Gels & Lotions,,,,796494407820.0
1,Snacks,Puffed Snacks,Cheese Curls & Puffs,,,,23278011028.0
2,Health & Wellness,Hair Care,Hair Care Accessories,,PLACEHOLDER MANUFACTURER,ELECSOP,461817824225.0
3,Health & Wellness,Oral Care,Toothpaste,,COLGATE-PALMOLIVE,COLGATE,35000466815.0
4,Health & Wellness,Medicines & Treatments,Essential Oils,,MAPLE HOLISTICS AND HONEYDEW PRODUCTS INTERCHA...,MAPLE HOLISTICS,806810850459.0
...,...,...,...,...,...,...,...
845547,Health & Wellness,Topical Muscle & Joint Relief Treatments,Braces & Wraps,,,,722301569399.0
845548,Snacks,Cookies,,,"TREEHOUSE FOODS, INC.",LOFTHOUSE,41820818468.0
845549,Snacks,Candy,Confection Candy,,HARIBO GMBH & CO KG,HARIBO,100167154940.0
845550,Snacks,Nuts & Seeds,Hazelnuts,,DOUBLE-COLA CO,JUMBO,75390755960.0


In [35]:
# Open the SQLite database at db_path (sqlite3 creates the file if it does not
# exist yet); this connection is what the table export writes through.
conn = sqlite3.connect(database=db_path)
# Cursor on the same connection; it is not used in the cells shown here.
cursor = conn.cursor()

In [37]:
# Write each cleaned frame to its own SQLite table. An existing table with the
# same name is replaced, and the DataFrame index is not stored as a column.
for table_name, frame in (
    ('user', df_user),
    ('transaction', df_transaction),
    ('product', df_product),
):
    frame.to_sql(table_name, conn, if_exists='replace', index=False)
print(f"✅ Tables created successfully in {db_path}")

✅ Tables created successfully in takehome.db
