# Chapter 5: Data Class Builders

This chapter covers three data class builders. A data class is a class that is essentially a collection of fields, with little or no extra functionality.

The data class builders are:
* collections.namedtuple
* typing.NamedTuple
* @dataclasses.dataclass

In [1]:
# simple example
class Coordinate:
    """Plain class holding a latitude/longitude pair.

    Only ``__init__`` is defined, so instances keep the ``__repr__`` and the
    identity-based ``__eq__`` inherited from ``object``.
    """

    def __init__(self, lat, lon):
        self.lat, self.lon = lat, lon

In [2]:
moscow = Coordinate(55.76, 37.62)
moscow

<__main__.Coordinate at 0x25b4bf9fe50>

In [None]:
loc = Coordinate(55.76, 37.62)
# the __eq__ inherited from object compares object identity, not field values,
# so two separate instances are unequal; the fields must be compared explicitly
loc == moscow

False

In [4]:
(loc.lat, loc.lon) == (moscow.lat, moscow.lon)

True

## collections.namedtuple

In [7]:
# namedtuple builds a subclass of tuple with the name and fields you specify
from collections import namedtuple

# first argument: the class name shown in reprs; second: the field names, space-separated
CoordinateNamed = namedtuple('Coordinate', 'lat lon')

moscow = CoordinateNamed(55.756, 37.617)
loc = CoordinateNamed(55.756, 37.617)
# the generated class is a tuple subclass, and instances with equal fields compare equal
issubclass(CoordinateNamed, tuple), moscow == loc

(True, True)

## typing.NamedTuple

In [None]:
# typing.NamedTuple does the same but with type annotations
import typing

CoordinateNamed2 = typing.NamedTuple('Coordinate', [('lat', float), ('lon', float)])
# the fields can also be given as keyword arguments:
#     CoordinateNamed2 = typing.NamedTuple('Coordinate', lat=float, lon=float)
# which lets you expand a mapping of field names to types with **kwargs.
# NOTE: the keyword-argument form is deprecated since Python 3.13; prefer the
# list of (name, type) pairs above or the class syntax
issubclass(CoordinateNamed2, tuple), typing.get_type_hints(CoordinateNamed2)

(True, {'lat': float, 'lon': float})

In [11]:
# you can also construct it as a class
from typing import NamedTuple

class Coordinate(NamedTuple):
    """Geographic coordinate in decimal degrees.

    Fields:
        lat: latitude; non-negative values are rendered as north, negative as south.
        lon: longitude; non-negative values are rendered as east, negative as west.
    """

    lat: float
    lon: float

    def __str__(self):  # overrides the str() inherited from tuple
        """Return the coordinate formatted like ``'55.8°N, 37.6°E'``."""
        ns = 'N' if self.lat >= 0 else 'S'
        # positive longitudes lie east of the prime meridian
        we = 'E' if self.lon >= 0 else 'W'

        return f'{abs(self.lat):.1f}°{ns}, {abs(self.lon):.1f}°{we}'

In [13]:
issubclass(Coordinate, tuple)

True

## @dataclass

In [None]:
# dataclass decorator reads variable annotations and automatically generates methods
from dataclasses import dataclass

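# the class body is exactly the same as in the NamedTuple version; the only difference is
# that the class is built by the @dataclass decorator instead of a NamedTuple base class
# the resulting class is a subclass of object, not of tuple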
@dataclass(frozen=True)  # frozen=True makes instances read-only, so they behave like an immutable tuple
class CoordinateDecorated:
    """Geographic coordinate in decimal degrees, built with @dataclass.

    Fields:
        lat: latitude; non-negative values are rendered as north, negative as south.
        lon: longitude; non-negative values are rendered as east, negative as west.
    """

    lat: float
    lon: float

    def __str__(self):
        """Return the coordinate formatted like ``'55.8°N, 37.6°E'``."""
        ns = 'N' if self.lat >= 0 else 'S'
        # positive longitudes lie east of the prime meridian
        we = 'E' if self.lon >= 0 else 'W'

        return f'{abs(self.lat):.1f}°{ns}, {abs(self.lon):.1f}°{we}'


In [14]:
# example of namedtuple

City = namedtuple('City', 'name country population coordnates')
City._fields

('name', 'country', 'population', 'coordnates')

In [15]:
Coordinate = namedtuple('Coordinate', 'lat lon')
delhi_data = ('Delhi NCR', 'IN', 21.935, Coordinate(28.613889, 77.208889))

# _make builds an instance from an iterable; equivalent to City(*delhi_data)
delhi = City._make(delhi_data)

In [18]:
# _asdict lets you get the fields as a dict, and perhaps serialize to JSON
import json
json.dumps(delhi._asdict())

'{"name": "Delhi NCR", "country": "IN", "population": 21.935, "coordnates": [28.613889, 77.208889]}'

## Type annotations

In [None]:
# Type hints are ways to declare the expected type of function arguments, return values, variables, and attributes

# Type hints are not enforced by the compiler or by the interpreter at runtime. They exist for human readers, IDEs, and static type checkers

import typing
class Coordinate(typing.NamedTuple):
    """Latitude/longitude pair whose fields both default to 0."""

    # field syntax is `var_name: type = default_value`
    lat: float = 0
    lon: float = 0

# neither argument is a float, yet no error is raised: annotations are not checked at runtime
bruh = Coordinate('Bla', None)
bruh

Coordinate(lat='Bla', lon=None)

In [None]:
# the dataclass decorator takes many arguments

# this is its signature:
# @dataclass(*, init=True, repr=True, eq=True, order=False, unsafe_hash=False, frozen=False)
# order=True generates __lt__, __le__, __gt__, and __ge__; it raises ValueError if eq=False, and TypeError if the class already defines any of those comparison methods

In [None]:
# Field options

# One field option is providing a default value.
# Mutable defaults are risky because one default object would be shared by every instance; @dataclass rejects them and offers default_factory, which builds a fresh object for each instance
from dataclasses import dataclass, field
@dataclass
class ClubMember:
    """Club member with a name and an optional list of guests."""

    name: str
    # `guests: list = []` would raise ValueError and tell you to use default_factory;
    # default_factory=list builds a new empty list for every instance instead
    guests: list = field(default_factory=list)
    # further field options, left disabled as an example: default to False and omit from __repr__
    # athlete: bool = field(default=False, repr=False)

jeff = ClubMember(name="Jeff E.")
# guests can be passed positionally; "Bill C." is created with the default empty list
jeff.guests.extend([ClubMember("Don T.", ["The Prince"]), ClubMember("Bill C.")])

sean = ClubMember(name="Sean C.")
# sean's guest list is still empty: default_factory gave each instance its own list
sean.name, sean.guests, jeff.name, jeff.guests

('Sean C.',
 [],
 'Jeff E.',
 [ClubMember(name='Don T.', guests=['The Prince']),
  ClubMember(name='Bill C.', guests=[])])

## __post_init__

In [34]:
# __post_init__ does things after __init__, ex. validation or computing field values
from typing import ClassVar

@dataclass
class HackerClubMember(ClubMember):
    """Club member with a handle that must be unique across all instances."""

    # ClassVar marks this as a class attribute shared by all instances, so
    # @dataclass does not turn it into an instance field. A bare
    # `all_handles = set()` without an annotation would also be a class
    # attribute, but it would carry no type hint.
    all_handles: ClassVar[set[str]] = set()
    handle: str = ''

    def __post_init__(self):
        """Fill in a default handle, then register it, rejecting duplicates."""
        taken = type(self).all_handles

        # an empty handle (the default) falls back to the member's first name
        if self.handle == '':
            self.handle = self.name.split()[0]

        # handles must be unique across the whole class
        if self.handle in taken:
            raise ValueError(f'handle {self.handle!r} already exists')

        # record the handle in the shared set
        taken.add(self.handle)

grace = HackerClubMember(name="Grace H.")
grace.handle

'Grace'

In [35]:
grace2 = HackerClubMember(name="Grace E.")

ValueError: handle 'Grace' already exists

In [40]:
# grace2 = HackerClubMember(name="Grace E.", handle="Egrace")
grace2.handle

'Egrace'

In [39]:
HackerClubMember.all_handles

{'Egrace', 'Grace'}

In [43]:
# init-only fields are arguments passed to __init__ that are not instance fields
from dataclasses import InitVar

@dataclass
class C:
    """Example of an init-only argument used to derive a field value."""

    i: int
    j: int = None
    # InitVar: accepted by __init__ and forwarded to __post_init__, but not an instance field
    database: InitVar[int] = None  # stand-in for InitVar[DatabaseType]

    def __post_init__(self, database):
        """Derive ``j`` from ``database`` when ``j`` was not supplied."""
        if self.j is not None or database is None:
            return
        self.j = database ** 2

c = C(10, database=3)
c

C(i=10, j=9)

## Example: Dublin Core Resource Record

In [44]:
# example: Dublin Core Resource Record

from dataclasses import dataclass, field, fields
from typing import Optional
from enum import Enum, auto
import datetime

class ResourceType(Enum):
    """Kinds of media resource; auto() assigns each member's value."""

    BOOK = auto()
    EBOOK = auto()
    VIDEO = auto()  # an Enum gives a closed, type-safe set of choices

@dataclass
class Resource:
    """Media resource description"""

    identifier: str
    title: str = '<untitled>'
    creators: list[str] = field(default_factory=list)
    date: Optional[datetime.date] = None
    type: ResourceType = ResourceType.BOOK
    description: str = ''
    language: str = ''
    subjects: list[str] = field(default_factory=list)

    def __repr__(self):
        """Return a multi-line repr with one indented ``name = value,`` line per field."""
        pad = ' ' * 4
        body = [
            # !r formats the value with repr() rather than str()
            f'{pad}{spec.name} = {getattr(self, spec.name)!r},'
            for spec in fields(self)
        ]
        return '\n'.join([f'{type(self).__name__}(', *body, ')'])


desc = 'Improving the design of existing code'

book = Resource('978-0-13-475759-9', 'Refactoring, 2nd Edition',
                ['Martin Fowler', 'Kent Beck'], datetime.date(2018, 11, 19),
                ResourceType.BOOK, desc, 'EN', 
                ['computer programming', 'OOP'])

book

Resource(
    identifier = '978-0-13-475759-9',
    title = 'Refactoring, 2nd Edition',
    creators = ['Martin Fowler', 'Kent Beck'],
    date = datetime.date(2018, 11, 19),
    type = <ResourceType.BOOK: 1>,
    description = 'Improving the design of existing code',
    language = 'EN',
    subjects = ['computer programming', 'OOP'],
)

## Code Smells

Code smells are patterns that may indicate refactoring is needed.

Data classes are one example: you might look at one and then ask yourself what behavior belongs in the class. Then you start refactoring to move that behavior into the data class.

The idea of OOP is to put behavior and data together in a single unit (the class). If a class is used but has no significant behavior of its own, then the code
that deals with its instances is probably scattered (and possibly duplicated) all over the place, which makes maintenance hard.

That's why you should refactor it to place the behavior inside the class and reduce redundancy.

### When does it make sense to have simple data classes?

#### Data Classes as Scaffolding
When starting out, a data class can serve as a simple initial implementation of a class. Over time the class should gain its own methods, but at the beginning it is fine to leave it as plain data.

#### Data Classes as Intermediate Representations
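Perhaps you're about to export records to JSON, or you just imported something from somewhere else. In that case, data classes can hold the records; treat the instances as immutable. Convert an instance to a dict with `_asdict()` (named tuples) or `dataclasses.asdict()` (dataclasses), and rebuild one by calling the constructor with the dict expanded with `**`.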

## Pattern matching with data classes

In [47]:
# Pattern matching with data classes

class City(typing.NamedTuple):
    """City record; the field order is also the positional order used by class patterns."""

    continent: str
    name: str
    country: str

cities = [ 
    City('Asia', 'Tokyo', 'JP'), 
    City('Asia', 'Delhi', 'IN'), 
    City('North America', 'Mexico City', 'MX'), 
    City('North America', 'New York', 'US'), 
    City('South America', 'São Paulo', 'BR'), 
]

def match_asian_cities():
    results = []
    for city in cities:
        match city:
            case City(continent='Asia'): # match all cities with continent Asia
                results.append(city)
    
    return results

def match_asian_countries():
    results = []
    for city in cities:
        match city:
            case City(continent='Asia', country=cc): # cc is bound to the country field of the matched city
                results.append(cc)
    
    return results

# you can also do the above functions with positional class patterns like
# City('Asia') or City('Asia', _, cc). this is accomplished with __match_args__

match_asian_cities(), match_asian_countries(), City.__match_args__

([City(continent='Asia', name='Tokyo', country='JP'),
  City(continent='Asia', name='Delhi', country='IN')],
 ['JP', 'IN'],
 ('continent', 'name', 'country'))

## Chapter Summary

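This chapter covered three data class builders: `collections.namedtuple`, `typing.NamedTuple`, and `@dataclasses.dataclass`. With each of them you can get the fields, convert to dicts/JSON, set defaults, and implement your own methods or override existing ones.

Type annotations declare the expected types of fields, variables, and function signatures, but they are not enforced at runtime.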

Data and the functions that touch it should be together in the same class. Classes with no logic could mean the logic is in the wrong place.