In [None]:
import os

# Work from the project root so that `src.*` imports and the relative `data/...`
# paths used further down resolve. The guard keeps the cell idempotent: an
# unconditional chdir("..") would climb one more level on every re-run.
if not os.path.isdir("src"):
    os.chdir("..")

In [None]:
import polars as pl
import tempfile
from tqdm import tqdm
import requests
import geopandas as gpd
from src.data_pull import DataPull
from concurrent.futures import ThreadPoolExecutor, as_completed

# Shared DataPull helper; later cells call it to fetch the state and county shapes.
dp = DataPull()

In [None]:
# The result is shown as cell output but not stored in a variable.
# NOTE(review): presumably this is run for a side effect (e.g. fetching/caching
# the state shapes) — confirm against DataPull.
dp.pull_states_shapes()

In [None]:
gdf = dp.pull_county_shapes()

# Codes matched against the "fips" column; presumably state-level FIPS codes for
# areas excluded from the pull — TODO confirm. ("69" used to be listed twice.)
remove_list_sates = ["66", "69", "60", "09", "15", "02"]
# Single county id excluded via the "geo_id" column.
remove_list_counties = ["46102"]

gdf = gdf[~gdf["fips"].isin(remove_list_sates)]
gdf = gdf[~gdf["geo_id"].isin(remove_list_counties)]

county_list = gdf["geo_id"].tolist()
# Show a short preview instead of dumping every id into the notebook output.
county_list[:10]

In [None]:
# Quick visual check of the county shapes that remain after filtering.
gdf.plot()

In [None]:
# Map each QCEW API endpoint that still has to be fetched to the parquet path
# it will be stored under (one entry per county / year / quarter).
url_dict = {}
for county in county_list:
    for year in range(2014, 2025):  # 2014 through 2024 inclusive
        for qtr in range(1, 5):  # quarters 1 to 4
            file_path = f"data/test/us-qcew-{year}-{qtr}-{county}.parquet"
            url = f"http://data.bls.gov/cew/data/api/{year}/{qtr}/area/{county}.csv"
            # Only queue files that are not on disk yet, so reruns resume.
            if not os.path.exists(file_path):
                url_dict[url] = file_path
len(url_dict)

In [None]:
# Leftover literal: appears to be the len(url_dict) result from an earlier run
# (3179 counties x 11 years x 4 quarters = 139876) — NOTE(review): safe to delete?
139876

In [None]:
def pull_file(
    url: str, filename: str, verify: bool = True, timeout: float = 60.0
) -> None:
    """Stream the resource at ``url`` into ``filename`` with a progress bar.

    Args:
        url: Address of the resource to download.
        filename: Local path the response body is written to; an existing
            file is overwritten.
        verify: Forwarded to ``requests.get``; ``False`` disables TLS
            certificate verification.
        timeout: Seconds to wait for the server to connect or to send data
            before the request is aborted.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    chunk_size = 10 * 1024 * 1024  # read the body in 10 MiB pieces

    with requests.get(
        url, stream=True, verify=verify, timeout=timeout
    ) as response:
        # Without this check an error page (404, 500, ...) would be written to
        # disk as if it were the requested data.
        response.raise_for_status()

        # No content-length header -> None, which tqdm renders as an
        # open-ended counter instead of a bogus 0-byte total.
        total_size = int(response.headers.get("content-length", 0)) or None

        with tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc="Downloading",
        ) as bar:
            with open(filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip keep-alive chunks that carry no data
                        file.write(chunk)
                        bar.update(len(chunk))

In [None]:
def pull_qcew_file(url: str, filename: str, verify: bool = True) -> None:
    """Download one QCEW CSV and store it as a parquet file at ``filename``.

    The raw CSV is kept in the system temp directory under a name derived
    from ``filename``, so a later call for the same target can reuse it
    instead of downloading again. The parquet file is only written once the
    CSV has been read and validated, so ``filename`` never holds CSV data.

    Args:
        url: Address of the CSV to download.
        filename: Destination path of the parquet file.
        verify: Forwarded to ``pull_file`` (TLS certificate verification).

    Raises:
        ValueError: If the downloaded CSV parses to fewer than five columns.
    """
    import hashlib  # local import: only needed to name the cached CSV

    # hash() on a str is salted per interpreter process, so it cannot name a
    # cache file that should survive a kernel restart; a digest is stable.
    cache_key = hashlib.sha256(filename.encode("utf-8")).hexdigest()
    temp_filename = os.path.join(tempfile.gettempdir(), f"{cache_key}.csv")

    if not os.path.exists(temp_filename):
        try:
            pull_file(url=url, filename=temp_filename, verify=verify)
        except Exception:
            # Do not leave a truncated CSV behind to be mistaken for a
            # finished download on the next call.
            if os.path.exists(temp_filename):
                os.remove(temp_filename)
            raise

    try:
        df = pl.read_csv(temp_filename, ignore_errors=True)
        if len(df.columns) < 5:
            print(filename)
            raise ValueError("File Did not download correctly")
    except Exception:
        # Drop the unusable CSV so a retry downloads it afresh.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)
        raise

    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.write_parquet(filename)

In [None]:
# Scratch check: build a temp-directory CSV path from the hash of one target filename.
filename = "data/test/us-qcew-2024-4-31039.parquet"
# NOTE: hash() of a str is salted per interpreter process unless PYTHONHASHSEED
# is fixed, so the displayed path differs after every kernel restart.
f"{tempfile.gettempdir()}/{hash(filename)}.csv"

In [None]:
# NOTE(review): hard-coded absolute path. The numeric name looks like a
# hash()-derived temp file from an earlier session, so it presumably will not
# exist on a fresh kernel or another machine — confirm or remove this cell.
pl.read_csv("/tmp/624073244870629600.csv")

In [None]:
# Try the download on a single county/quarter (county 31039, 2024 Q4).
pull_qcew_file(url="http://data.bls.gov/cew/data/api/2024/4/area/31039.csv", 
filename="data/test/us-qcew-2024-4-31039.parquet")

In [None]:
# Pull every quarter from 2014 through 2024 for one county.
# pull_qcew_file takes a URL and a destination path (not year/qtr/county
# keywords), so both are built here with the same templates as url_dict.
test_county = "31039"
for year in range(2014, 2025):
    for qtr in range(1, 5):
        pull_qcew_file(
            url=f"http://data.bls.gov/cew/data/api/{year}/{qtr}/area/{test_county}.csv",
            filename=f"data/test/us-qcew-{year}-{qtr}-{test_county}.parquet",
        )

In [None]:
def batch_download(file_map: dict, max_workers: int = 4, verify: bool = True):
    """Run ``pull_qcew_file`` for every entry of ``file_map`` on a thread pool.

    Args:
        file_map: Mapping of download URL to destination filename.
        max_workers: Number of worker threads in the pool.
        verify: Passed through to ``pull_qcew_file`` for each download.

    A failing download does not stop the batch; the error is printed together
    with the URL and the remaining downloads continue.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which (url, filename) pair each future belongs to so that
        # failures can be reported against the right URL.
        pending = {}
        for url, filename in file_map.items():
            task = pool.submit(pull_qcew_file, url, filename, verify)
            pending[task] = (url, filename)

        # Handle results in completion order rather than submission order.
        for finished in as_completed(pending):
            url, _ = pending[finished]
            try:
                finished.result()
            except Exception as e:
                print(f"Failed to download {url}: {e}")


# Download everything queued in url_dict, using four worker threads.
batch_download(url_dict, max_workers=4)
