In [1]:
import tarfile
import gzip
import shutil
import os
from tqdm import tqdm
import pandas as pd
from datetime import datetime
from io import BytesIO
import gdown

In [2]:
# NOTE(review): SAMPLE is not referenced in any of the visible cells —
# presumably a sample size consumed further down; confirm or remove.
SAMPLE = 100

def extract_csv_from_drive(link, idx):
    """Download a .tar.gz archive from Google Drive and unpack its gzipped CSVs.

    The archive is saved as ``data_{idx}.tar.gz`` in the current working
    directory. Every member whose name ends in ``.csv.gz`` is extracted
    (flattened to its base name, with ``:`` replaced by ``_``) into
    ``extracted_data/data_{idx}``, and then decompressed into
    ``final_csv_files/data_{idx}`` as a plain ``.csv`` file.

    Args:
        link: Google Drive share URL of the archive (passed to ``gdown`` with
            ``fuzzy=True``).
        idx: Identifier used to name the downloaded archive and its
            per-archive output folders.
    """
    file_path = f"data_{idx}.tar.gz"
    extract_dir = f"extracted_data/data_{idx}"
    csv_dir = f"final_csv_files/data_{idx}"

    # Create the output folders here so the function does not depend on the
    # caller having prepared them beforehand.
    os.makedirs(extract_dir, exist_ok=True)
    os.makedirs(csv_dir, exist_ok=True)

    gdown.download(link, file_path, quiet=False, fuzzy=True)

    # Interpreters that support extraction filters warn when none is given;
    # request the "data" filter where available and fall back to the legacy
    # call on older interpreters.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    with tarfile.open(file_path, "r:gz") as tar:
        for member in tqdm(tar.getmembers()):
            if member.name.endswith(".csv.gz"):
                # ':' is replaced and directory components are dropped so the
                # file lands directly inside extract_dir.
                safe_name = member.name.replace(":", "_")
                member.name = os.path.basename(safe_name)
                tar.extract(member, extract_dir, **extract_kwargs)

    for file in tqdm(os.listdir(extract_dir)):
        if file.endswith(".csv.gz"):
            gz_file_path = os.path.join(extract_dir, file)
            # Strip only the trailing ".gz"; str.replace would also remove any
            # ".gz" occurring earlier in the file name.
            csv_file_path = os.path.join(csv_dir, file[: -len(".gz")])
            with gzip.open(gz_file_path, "rb") as gz_file, open(csv_file_path, "wb") as csv_file:
                shutil.copyfileobj(gz_file, csv_file)

In [None]:
# Earlier batch of archives, kept for reference:
# drive_links = [
#     "https://drive.google.com/file/d/1UrIllwm2FIt-AEIPPPf0jS_9UU8pnNK6/view?usp=sharing",
#     "https://drive.google.com/file/d/12OHCjEK0E__7PUsfhkCNtLJHIjt_PLLg/view?usp=sharing",
#     "https://drive.google.com/file/d/1D-iPGMbbra6VRZ-qpBChd98TLdxdtRih/view?usp=sharing",
#     "https://drive.google.com/file/d/1QwRH6U6f0U_fwb4FSeQ7cD5rfE_D1fh4/view?usp=sharing",
# ]

# Folder that receives the downloaded archives.
download_folder = "downloaded_data"

# Google Drive share links of the archives currently being processed.
drive_links = [
    "https://drive.google.com/file/d/1qleN-t1xdRy0xwDiK6A-eRxjL4sZZMoo/view",
    "https://drive.google.com/file/d/13aenLqZxxGYC45vjV4poqI5ReYqx20R_/view",
]

os.makedirs(download_folder, exist_ok=True)

In [7]:
# Root folders for the extracted .csv.gz files and the decompressed CSVs.
os.makedirs("extracted_data", exist_ok=True)
os.makedirs("final_csv_files", exist_ok=True)

# Numbering starts at 5, as in the original cell. Each link gets its own
# index: with a fixed idx every archive would be downloaded to the same
# data_5.tar.gz and unpacked into the same folders, overwriting and mixing
# the contents of different archives.
first_idx = 5
for idx, link in enumerate(drive_links, start=first_idx):
    os.makedirs(f"extracted_data/data_{idx}", exist_ok=True)
    os.makedirs(f"final_csv_files/data_{idx}", exist_ok=True)
    extract_csv_from_drive(link, idx)

Downloading...
From (original): https://drive.google.com/uc?id=1qleN-t1xdRy0xwDiK6A-eRxjL4sZZMoo
From (redirected): https://drive.google.com/uc?id=1qleN-t1xdRy0xwDiK6A-eRxjL4sZZMoo&confirm=t&uuid=9df99ce2-368e-4359-be7e-40b00764b3ed
To: c:\Users\piyus\Downloads\PowderData\Working Directory\data_5.tar.gz
100%|██████████| 635M/635M [00:35<00:00, 17.7MB/s] 
  tar.extract(member, f"extracted_data/data_{idx}")
100%|██████████| 1076/1076 [00:05<00:00, 179.72it/s]
100%|██████████| 1076/1076 [00:29<00:00, 36.25it/s]


In [None]:
def get_time_of_day(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Args:
        hour: Hour value to classify.

    Returns:
        "Morning" for 6 <= hour < 12, "Afternoon" for 12 <= hour < 18,
        "Evening" for 18 <= hour < 24, and "Night" for any other value.
    """
    # (inclusive start, exclusive end, label) for each named band.
    bands = (
        (6, 12, "Morning"),
        (12, 18, "Afternoon"),
        (18, 24, "Evening"),
    )
    for start, end, label in bands:
        if start <= hour < end:
            return label
    # Anything not covered by a band above falls through to "Night".
    return "Night"

In [None]:
# Download every archive, extract it, and load each CSV it contains into one
# combined DataFrame tagged with location, day and time of day.
dataframes = []

# Interpreters that support extraction filters warn when none is given;
# request the "data" filter where available so a downloaded archive cannot
# write outside download_folder.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

for idx, link in enumerate(drive_links):
    tar_file_path = os.path.join(download_folder, f"data_{idx}.tar.gz")
    # The links are ".../file/d/<id>/view" share URLs, so gdown needs
    # fuzzy=True to resolve the file ID (same as extract_csv_from_drive).
    gdown.download(link, tar_file_path, quiet=False, fuzzy=True)

    with tarfile.open(tar_file_path, "r:gz") as tar:
        tar.extractall(download_folder, **extract_kwargs)

    # NOTE(review): assumes each archive unpacks into a top-level folder named
    # data_{idx}, and that it holds plain ".csv" files — confirm against the
    # archive layout.
    extracted_folder = os.path.join(download_folder, f"data_{idx}")
    for root, _, files in os.walk(extracted_folder):
        # Iterate over the files of each directory; the original referenced
        # `file` without ever looping over `files`.
        for file in files:
            if not file.endswith(".csv"):
                continue
            file_path = os.path.join(root, file)
            df = pd.read_csv(file_path)
            # The text between the first '-' and the following '.' in the
            # file name is parsed as an integer timestamp.
            timestamp = int(file.split('-')[1].split('.')[0])
            # fromtimestamp() without a tz argument yields local time.
            local_dt = datetime.fromtimestamp(timestamp)
            day = local_dt.strftime('%Y-%m-%d')
            time_of_day = get_time_of_day(local_dt.hour)
            df["Location_ID"] = idx
            df["Day"] = day
            df["Time_of_Day"] = time_of_day
            dataframes.append(df)

# pd.concat raises an opaque ValueError on an empty list; fail with a message
# that points at the actual cause instead.
if not dataframes:
    raise ValueError(
        f"No .csv files were found under {download_folder!r}; nothing to concatenate."
    )

main_df = pd.concat(dataframes, ignore_index=True)