## Normalization on Yaml Files

Following is an implementation of normalization on `.yaml` files.

### Imports

In [1]:
import sys
sys.path.insert(0, '..')

In [2]:
import os
import logging

import yaml
import pandas as pd
import flat_table as norm

In [3]:
logging.basicConfig(level=logging.WARNING)

### Data

The data used here is from [unitedstates/congress-legislators](https://github.com/unitedstates/congress-legislators) repository.

- legislators-historical.yml
- executive.yml').read
- committees-current.yml

Download and put them into `data` folder.

In [4]:
def load_data(filepath):
    with open(filepath) as f:
        contents = yaml.load(f, Loader=yaml.SafeLoader)
    return contents

In [5]:
source = load_data('../data/legislators-historical.yml')

In [6]:
source2 = load_data('../data/executive.yml')
source3 = load_data('../data/committees-current.yml')

### Analysis

A seri can be transformed into:

    1. Expanded into rows
    2. Expendad to columns
    3. Kept as is
   
    * Each individual seri has parent df name, self name, and self.
    * if parent and self has the same name ==> it is an object
    * if parent and 
    

#### Source 1

In [7]:
df1 = pd.DataFrame(source)
df1_in = norm.mapper(df1)
df1_in.shape

(68, 4)

In [8]:
df1.shape

(11982, 7)

In [9]:
df1_in.tail()

Unnamed: 0,parent,child,type,obj
63,leadership_roles,leadership_roles.title,str,0 NaN 1 NaN 2 NaN 3 ...
64,.,family,list,0 ...
65,,family,dict,0 ...
66,family,family.relation,str,0 NaN 1 NaN 2 NaN 3 ...
67,family,family.name,str,0 NaN 1 ...


In [10]:
df1.sample()

Unnamed: 0,id,name,bio,terms,other_names,leadership_roles,family
11576,"{'bioguide': 'M000472', 'thomas': '00773', 'go...","{'first': 'John', 'middle': 'M.', 'last': 'McH...","{'birthday': '1948-09-29', 'gender': 'M'}","[{'type': 'rep', 'start': '1993-01-05', 'end':...",,,


In [11]:
df_norm = norm.normalize(df1_in, is_mapper=True)

In [12]:
df_norm.shape

(275401, 54)

In [13]:
df_norm.columns

Index(['index', 'id.maplight', 'id.opensecrets', 'id.ballotpedia', 'id.lis',
       'id.votesmart', 'id.cspan', 'id.thomas', 'id.house_history_alternate',
       'id.house_history', 'id.google_entity_id', 'id.wikidata',
       'id.wikipedia', 'id.icpsr', 'id.govtrack', 'id.bioguide', 'id.fec',
       'id.bioguide_previous', 'name.official_full', 'name.suffix',
       'name.nickname', 'name.middle', 'name.last', 'name.first', 'bio.gender',
       'bio.birthday', 'terms.rss_url', 'terms.state_rank', 'terms.office',
       'terms.contact_form', 'terms.fax', 'terms.phone', 'terms.address',
       'terms.url', 'terms.how', 'terms.district', 'terms.party',
       'terms.class', 'terms.state', 'terms.end', 'terms.start', 'terms.type',
       'terms.party_affiliations.party', 'terms.party_affiliations.end',
       'terms.party_affiliations.start', 'other_names.last',
       'other_names.middle', 'other_names.end', 'leadership_roles.end',
       'leadership_roles.start', 'leadership_roles.chamb

#### Source 2

In [14]:
df2 = pd.DataFrame(source2)[:1]
df2_in = norm.mapper(df2)
df2_in.shape

(17, 4)

In [15]:
df2.shape

(1, 4)

In [16]:
df2_in.head()

Unnamed: 0,parent,child,type,obj
0,.,id,dict,"0 {'bioguide': 'W000178', 'govtrack': 41135..."
1,id,id.icpsr_prez,int,"0 99869 Name: id.icpsr_prez, dtype: object"
2,id,id.govtrack,int,"0 411351 Name: id.govtrack, dtype: object"
3,id,id.bioguide,str,"0 W000178 Name: id.bioguide, dtype: object"
4,.,name,dict,"0 {'first': 'George', 'last': 'Washington'}..."


In [17]:
norm.normalize(df2, expand_dicts=False, is_mapper=False)

Unnamed: 0,index,id,name,bio,terms.how,terms.party,terms.end,terms.start,terms.type
0,0,"{'bioguide': 'W000178', 'govtrack': 411351, 'i...","{'first': 'George', 'last': 'Washington'}","{'birthday': '1732-02-22', 'gender': 'M'}",election,no party,1793-03-04,1789-04-30,prez
1,0,"{'bioguide': 'W000178', 'govtrack': 411351, 'i...","{'first': 'George', 'last': 'Washington'}","{'birthday': '1732-02-22', 'gender': 'M'}",election,no party,1797-03-04,1793-03-04,prez


#### Source 3

In [18]:
df3 = pd.DataFrame(source3)
df3_in = norm.mapper(df3)
df3_in.shape

(22, 4)

In [19]:
df3.shape

(49, 16)

In [20]:
df3_in.head()

Unnamed: 0,parent,child,type,obj
0,.,type,str,0 house 1 house 2 house 3 ...
1,.,name,str,0 House Committee on Ag...
2,.,url,str,0 https://agriculture.h...
3,.,minority_url,str,0 https://republicans-agriculture.h...
4,.,thomas_id,str,0 HSAG 1 HSAP 2 HSAS 3 HSBA 4 ...


In [21]:
norm.normalize(df3_in)

Unnamed: 0,index,parent,child,type,obj
0,0,.,type,str,0 house 1 house 2 house 3 ...
1,1,.,name,str,0 House Committee on Ag...
2,2,.,url,str,0 https://agriculture.h...
3,3,.,minority_url,str,0 https://republicans-agriculture.h...
4,4,.,thomas_id,str,0 HSAG 1 HSAP 2 HSAS 3 HSBA 4 ...
5,5,.,house_committee_id,str,0 AG 1 AP 2 AS 3 BA 4 ...
6,6,.,subcommittees,list,"0 [{'name': 'Conservation and Forestry', '..."
7,7,,subcommittees,dict,"0 {'name': 'Conservation and Forestry', 't..."
8,8,subcommittees,subcommittees.wikipedia,str,0 ...
9,9,subcommittees,subcommittees.phone,str,0 (202) 225-2171 0 (202) 225-2171 0 ...
