In [1]:
import polars as pl
from task1.dog_data_analysis import normalize_breed_names, licenses_by_breed, extract_unique_breeds, get_top_names, \
    get_licenses_by_date_range

bronze_data_path:str = 'data/bronze/2017.csv'
silver_data_path:str = 'data/silver/2017.csv'

# Load lazy CSV data
bronze_dog_data:pl.LazyFrame = pl.scan_csv(bronze_data_path, separator=',', has_header=True)
silver_dog_data:pl.LazyFrame = pl.scan_csv(silver_data_path, separator=',', has_header=True)
print(bronze_dog_data.head(2))

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SLICE[offset: 0, len: 2]
  Csv SCAN [data/bronze/2017.csv]
  PROJECT */7 COLUMNS


In [2]:
# Part 1: Normalize Breeds 
normalized_data = normalize_breed_names(bronze_dog_data)
normalized_data.write_csv(silver_data_path, include_header=True)
print("Normalized Data:", normalized_data.head(2))

Normalized Data: shape: (2, 8)
┌─────────────┬─────────────┬─────────────┬─────────┬──────────┬─────────┬────────────┬────────────┐
│ LicenseType ┆ Breed       ┆ Color       ┆ DogName ┆ OwnerZip ┆ ExpYear ┆ ValidDate  ┆ Normalized │
│ ---         ┆ ---         ┆ ---         ┆ ---     ┆ ---      ┆ ---     ┆ ---        ┆ Breed      │
│ str         ┆ str         ┆ str         ┆ str     ┆ i64      ┆ i64     ┆ str        ┆ ---        │
│             ┆             ┆             ┆         ┆          ┆         ┆            ┆ str        │
╞═════════════╪═════════════╪═════════════╪═════════╪══════════╪═════════╪════════════╪════════════╡
│ Dog         ┆ BICHON      ┆ WHITE       ┆ CHLOE   ┆ 15090    ┆ 2017    ┆ 12/15/2016 ┆ bichonfris │
│ Individual  ┆ FRISE       ┆             ┆         ┆          ┆         ┆ 9:58       ┆ e          │
│ Spayed      ┆             ┆             ┆         ┆          ┆         ┆            ┆            │
│ Female      ┆             ┆             ┆         ┆       

In [3]:
# Find Unique Breeds
unique_breeds = extract_unique_breeds(normalized_data)
print("Unique Breeds (SQL):", unique_breeds)
unique_breeds.write_csv("./data/silver/unique_breeds.csv", include_header=True)

Unique Breeds (SQL): shape: (265, 1)
┌───────────────────┐
│ NormalizedBreed   │
│ ---               │
│ str               │
╞═══════════════════╡
│ bichonfrise       │
│ chihuahua         │
│ labmix            │
│ dachshund         │
│ terrier           │
│ …                 │
│ boerboel          │
│ pumi              │
│ amblack&tancoonho │
│ carolinadog       │
│ jindo             │
└───────────────────┘


In [4]:
# Part 2: Counting Licenses by License Type
license_counts = licenses_by_breed(silver_dog_data)
print("License Counts by License Type and Breed (SQL):\n", license_counts)

License Counts by License Type and Breed (SQL):
 shape: (1_332, 3)
┌─────────────────────┬─────────────────────────────────┬──────────────┐
│ Breed               ┆ LicenseType                     ┆ LicenseCount │
│ ---                 ┆ ---                             ┆ ---          │
│ str                 ┆ str                             ┆ u32          │
╞═════════════════════╪═════════════════════════════════╪══════════════╡
│ MIXED               ┆ Dog Individual Spayed Female    ┆ 726          │
│ MIXED               ┆ Dog Individual Neutered Male    ┆ 666          │
│ LABRADOR RETRIEVER  ┆ Dog Individual Spayed Female    ┆ 445          │
│ LAB MIX             ┆ Dog Individual Spayed Female    ┆ 436          │
│ LAB MIX             ┆ Dog Individual Neutered Male    ┆ 423          │
│ …                   ┆ …                               ┆ …            │
│ COLLIE MIX          ┆ Dog Individual Male             ┆ 1            │
│ GR SWISS MTN DOG    ┆ Dog Individual Female           ┆

In [5]:
top_names = get_top_names(silver_dog_data, 5),
print("License Counts by License Type and Breed (SQL):\n", top_names)

License Counts by License Type and Breed (SQL):
 (shape: (5, 2)
┌─────────┬────────────┐
│ DogName ┆ name_count │
│ ---     ┆ ---        │
│ str     ┆ u32        │
╞═════════╪════════════╡
│ BELLA   ┆ 342        │
│ BUDDY   ┆ 257        │
│ MAX     ┆ 209        │
│ BAILEY  ┆ 203        │
│ LUCY    ┆ 189        │
└─────────┴────────────┘,)


In [6]:
start_date = "2016-01-01"
end_date = "2016-12-31"
licenses = get_licenses_by_date_range(silver_dog_data, start_date, end_date)
print("Licenses in range of ValidDate:\n", licenses.head(5))

Licenses in range of ValidDate:
 shape: (5, 2)
┌───────────────┬─────────────────────┐
│ Breed         ┆ ParsedDate          │
│ ---           ┆ ---                 │
│ str           ┆ datetime[μs]        │
╞═══════════════╪═════════════════════╡
│ COCKAPOO      ┆ 2016-11-30 08:50:00 │
│ GER SHEPHERD  ┆ 2016-11-30 08:51:00 │
│ BEAGLE        ┆ 2016-11-30 08:52:00 │
│ DACHSHUND MIX ┆ 2016-11-30 08:52:00 │
│ BLOODHOUND    ┆ 2016-11-30 08:53:00 │
└───────────────┴─────────────────────┘
