# Data model to Data schema
------

## Datamodel

In [None]:
from base64 import b64encode
from IPython.display import Image, display
from json_ntv import Ntv, MermaidConnec

In [18]:
# Entities of the diagram: entity name -> list of attribute rows.
# Each row is [type, attribute name] with an optional third item
# (here either the 'PK' marker or a free-text note).
country_entities = {
    'COUNTRY': [
        ['string', 'country', 'PK'],
        ['string', 'code', 'code is unique'],
    ],
    'REGION': [
        ['string', 'region', 'PK'],
        ['number', 'population'],
    ],
}

# One row per relationship between two entities.
# NOTE(review): presumably [entity, cardinality, kind, cardinality, entity, label]
# -- confirm against the json_ntv MermaidConnec documentation.
country_relationships = [
    ['REGION', 'exactly one', 'identifying', 'one or more', 'COUNTRY', 'brings_together'],
]

# Full specification passed to MermaidConnec.diagram(); the single top-level key
# carries the diagram title followed by the ':$erDiagram' suffix.
country = {
    'country and region:$erDiagram': {
        'entity': country_entities,
        'relationship': country_relationships,
    }
}

# Build the diagram source from the specification (a string, presumably Mermaid
# text) and display the image rendered by the mermaid.ink web service.
diag = MermaidConnec.diagram(country)
# The encoded diagram is embedded in the URL path, so:
# - encode as UTF-8 so that non-ASCII labels do not raise UnicodeEncodeError;
# - use the URL-safe base64 alphabet ('-' and '_' instead of '+' and '/'),
#   otherwise a '/' or '+' in the payload corrupts the URL.
diag_b64 = b64encode(diag.encode("utf-8"), altchars=b"-_").decode("ascii")
display(Image(url="https://mermaid.ink/img/" + diag_b64))

## Table schema

The default relationship between an attribute and the PK of the entity is "derived".
The derived relationships with the "primary key" of the Table Schema are implicit (the "primary key" is not duplicated).

The deduced Data Schema is as follows:

```json
"schema": {
  "fields": [
    {"name": "country",    "type": "string"},
    {"name": "region",     "type": "string"},
    {"name": "code",       "type": "string"},
    {"name": "population", "type": "number"}
  ],
  "primaryKey": "country",
  "relationships": [
      { "fields" : [ "region", "population"], "description" : "attributes",      "link" : "derived" },
      { "fields" : [ "region", "country"],    "description" : "brings_together", "link" : "derived" }
  ]
 }
 ```

The indication that the country code is unique for a country reinforces the relationship between "code" and "country" (it was "derived" and is now "coupled").
So this relationship is added in the schema.
To be consistent with the Data-model we can add the relationship between entities (but this constraint will always be True).

```json
"schema": {
  "fields": [
    {"name": "country",    "type": "string"},
    {"name": "region",     "type": "string"},
    {"name": "code",       "type": "string"},
    {"name": "population", "type": "number"}
  ],
  "primaryKey": "country",
  "relationships": [
      { "fields" : [ "country", "code"],      "description" : "attributes",      "link" : "coupled" },
      { "fields" : [ "region", "population"], "description" : "attributes",      "link" : "derived" },
      { "fields" : [ "country", "region"],    "description" : "brings_together", "link" : "derived" }
  ]
 }
 ```

## Example : before check

In [21]:
import pandas as pd
import ntv_pandas as npd

| country | region         | code  | population |
|---------|----------------|-------|------------|
| France  | European Union | FR    | 449        |
| Spain   | European Union | ES    | 48         |
| Estonia | European Union | ES    | 449        |
| Nigeria | Africa         | NI    | 1460       |

In [24]:
# Column-oriented sample data, one list per column.
# Note that Estonia carries the same code as Spain ('ES') and that Spain's
# population differs from the other 'European Union' rows -- presumably the
# inconsistencies this "before check" example is meant to expose.
example1 = dict(
    country=['France', 'Spain', 'Estonia', 'Nigeria'],
    region=['European Union', 'European Union', 'European Union', 'Africa'],
    code=['FR', 'ES', 'ES', 'NI'],
    population=[449, 48, 449, 1460],
)
ex1 = pd.DataFrame(data=example1)

In [31]:
# Analyse the DataFrame through the ntv_pandas accessor, then report the
# relation found for each pair of fields named in the schema.
ana1 = ex1.npd.analysis()

rel_country_code = ana1.get_relation('country', 'code')
rel_region_population = ana1.get_relation('region', 'population')
rel_country_region = ana1.get_relation('country', 'region')

# Only the coupling type is printed for country/code; the other two pairs
# also print their parent_child attribute.
print("country - code : ", rel_country_code.typecoupl)
print("region - population : ",
      rel_region_population.typecoupl,
      rel_region_population.parent_child)
print("country - region : ",
      rel_country_region.typecoupl,
      rel_country_region.parent_child)

country - code :  derived
region - population :  derived False
country - region :  derived True


## Example : after corrections

| country | region         | code  | population |
|---------|----------------|-------|------------|
| France  | European Union | FR    | 449        |
| Spain   | European Union | ES    | 449        |
| Estonia | European Union | EE    | 449        |
| Nigeria | Africa         | NI    | 1460       |

In [32]:
# Column-oriented sample data, one list per column.
# Every country has its own code and all 'European Union' rows share the
# same population value.
example1 = dict(
    country=['France', 'Spain', 'Estonia', 'Nigeria'],
    region=['European Union', 'European Union', 'European Union', 'Africa'],
    code=['FR', 'ES', 'EE', 'NI'],
    population=[449, 449, 449, 1460],
)
ex1 = pd.DataFrame(data=example1)

In [33]:
# Analyse the DataFrame through the ntv_pandas accessor, then report the
# relation found for each pair of fields named in the schema.
ana1 = ex1.npd.analysis()

rel_country_code = ana1.get_relation('country', 'code')
rel_region_population = ana1.get_relation('region', 'population')
rel_country_region = ana1.get_relation('country', 'region')

# Only the coupling type is printed for country/code; the other two pairs
# also print their parent_child attribute.
print("country - code : ", rel_country_code.typecoupl)
print("region - population : ",
      rel_region_population.typecoupl,
      rel_region_population.parent_child)
print("country - region : ",
      rel_country_region.typecoupl,
      rel_country_region.parent_child)

country - code :  coupled
region - population :  coupled True
country - region :  derived True
