In [44]:
import gdown
import os

def download_from_gdrive(share_url: str) -> str:
    """Download a shared Google Drive file and move it into ``data_dumper/``.

    The file ID is read from either the ``.../d/<id>/...`` path form or the
    ``...?id=<id>`` query form of a share link, and the file is fetched with
    ``gdown`` through the ``https://drive.google.com/uc?id=<id>`` endpoint.

    Args:
        share_url: Google Drive share link.

    Returns:
        Path of the file inside ``data_dumper``, or ``""`` when
        ``gdown.download`` does not report a downloaded path.

    Raises:
        ValueError: If no file ID can be found in ``share_url``.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(share_url)
    segments = parsed.path.split("/")
    if "d" in segments[:-1]:
        # ".../file/d/<id>/view" -> the segment right after "d" is the ID.
        file_id = segments[segments.index("d") + 1]
    else:
        # ".../open?id=<id>" and ".../uc?id=<id>" carry the ID in the query.
        file_id = parse_qs(parsed.query).get("id", [""])[0]
    if not file_id:
        raise ValueError(f"Could not find a Google Drive file ID in {share_url!r}")

    direct_url = f"https://drive.google.com/uc?id={file_id}"
    os.makedirs("data_dumper", exist_ok=True)
    downloaded_path = gdown.download(url=direct_url, output=None, quiet=False)
    if not downloaded_path:
        return ""

    new_path = os.path.join("data_dumper", os.path.basename(downloaded_path))
    # os.replace overwrites a stale copy from an earlier run; os.rename raises
    # on Windows when the destination already exists.
    os.replace(downloaded_path, new_path)
    return new_path


In [45]:
import os
import requests

def download_from_url(url: str, timeout: float = 30.0) -> str:
    """Download ``url`` into ``data_dumper/`` and return the saved path.

    The local filename is the last component of the URL path, with any query
    string or fragment dropped and percent-escapes decoded.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        Path of the saved file inside ``data_dumper``.

    Raises:
        ValueError: If the URL path does not end in a filename.
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    from urllib.parse import unquote, urlparse

    # basename() after decoding keeps an encoded "/" from escaping data_dumper.
    filename = os.path.basename(unquote(urlparse(url).path))
    if not filename:
        raise ValueError(f"Cannot derive a filename from {url!r}")

    os.makedirs("data_dumper", exist_ok=True)
    save_path = os.path.join("data_dumper", filename)

    response = requests.get(url, timeout=timeout)
    # Fail before writing so an HTTP error page is never saved as data.
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)

    return save_path


In [68]:
import pandas as pd
import os

def load_all_data_to_df(github_urls, gdrive_links, system_file):
    """Read every listed source as a header-less CSV and stack the rows.

    Remote sources are downloaded into ``data_dumper/``, parsed, and the
    downloaded copy is deleted again (even when parsing fails). Local files
    are read in place and never deleted.

    Args:
        github_urls: Iterable of direct-download URLs, fetched with
            ``download_from_url``.
        gdrive_links: Iterable of Google Drive share links, fetched with
            ``download_from_gdrive``.
        system_file: Iterable of paths to files already on disk.

    Returns:
        One DataFrame with the 14 named columns and a fresh 0..n-1 index,
        ordered as: URL sources, Drive sources, local files. When no source
        is given, an empty DataFrame with those columns.

    Raises:
        FileNotFoundError: If a downloader returns an empty path.
    """
    os.makedirs("data_dumper", exist_ok=True)

    columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
               'restecg', 'thalach', 'exang', 'oldpeak',
               'slope', 'ca', 'thal', 'target']

    # NOTE(review): values are parsed as-is; if the source files use a marker
    # such as "?" for missing values, those columns load as strings — confirm
    # against the data and consider passing na_values to read_csv.
    def _read(path):
        """Parse one comma-separated file that has no header row."""
        return pd.read_csv(path, sep=",", names=columns)

    def _fetch_read_discard(downloader, source):
        """Download ``source``, parse it, and always delete the local copy."""
        path = downloader(source)
        if not path:
            # download_from_gdrive signals failure with an empty string.
            raise FileNotFoundError(f"Download failed for {source!r}")
        try:
            return _read(path)
        finally:
            if os.path.exists(path):
                os.remove(path)

    all_dfs = []

    for url in github_urls or ():
        all_dfs.append(_fetch_read_discard(download_from_url, url))

    for link in gdrive_links or ():
        all_dfs.append(_fetch_read_discard(download_from_gdrive, link))

    for file in system_file or ():
        all_dfs.append(_read(file))

    if not all_dfs:
        # pd.concat raises on an empty list; return a typed-empty frame instead.
        return pd.DataFrame(columns=columns)

    return pd.concat(all_dfs, ignore_index=True)


In [73]:
# Sources for the processed.*.data heart-disease files: one direct URL,
# two Google Drive share links, and one file already on disk.
github_urls = [
    "https://github.com/anastasiatraverse/Heart-Disease-Prediction/raw/master/preprocessing%20UCI/processed.cleveland.data",
]
gdrive_links = [
    "https://drive.google.com/file/d/1IecRQAFiVpYaBtf4vptaPJGRR7QG_efL/view?usp=sharing",
    "https://drive.google.com/file/d/1m0IYCxR0Cn4NeVmDJmbQKwkLn4k7uW1U/view?usp=drive_link",
]
# Build the path with os.path.join: the previous backslash literal contained
# the invalid escape sequence "\p" and only resolved on Windows.
system_file = [os.path.join("system_data", "processed.hungarian.data")]
df = load_all_data_to_df(github_urls, gdrive_links, system_file)


Downloading...
From: https://drive.google.com/uc?id=1IecRQAFiVpYaBtf4vptaPJGRR7QG_efL
To: d:\heart_disease\processed.va.data
100%|██████████| 6.94k/6.94k [00:00<00:00, 781kB/s]
Downloading...
From: https://drive.google.com/uc?id=1m0IYCxR0Cn4NeVmDJmbQKwkLn4k7uW1U
To: d:\heart_disease\processed.switzerland.data
100%|██████████| 4.23k/4.23k [00:00<00:00, 4.15MB/s]


In [87]:
import os
from getpass import getpass

import psycopg2
from psycopg2.extras import execute_values

# Connection settings are read from the standard libpq environment variables,
# falling back to the local defaults this notebook used before. The password
# is never stored in the notebook: it comes from PGPASSWORD or a prompt.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
    database=os.environ.get("PGDATABASE", "heart_dwh"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
)

# execute_values expands the single %s into one VALUES tuple per row.
insert_sql = """
    INSERT INTO train_data (
        age, sex, cp, trestbps, chol, fbs,
        restecg, thalach, exang, oldpeak,
        slope, ca, thal, target
    ) VALUES %s
"""

# Plain tuples in column order, one per row of df.
rows = list(df.itertuples(index=False, name=None))

try:
    # "with conn" commits when the block succeeds and rolls back on error;
    # it does not close the connection, hence the explicit finally.
    with conn:
        with conn.cursor() as cursor:
            if rows:
                execute_values(cursor, insert_sql, rows)
finally:
    conn.close()



✅ Data inserted into test_data successfully!
