# Data Dictionary

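Build the data dictionary files: variable descriptions taken from the codebook, and the code-to-label mappings for the categorical columns.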

In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context for figure scaling.
sns.set_context('notebook')
# Metapack's Jupyter initialisation hook.
# NOTE(review): its exact effect is not visible in this notebook — confirm against metapack.
mp.jupyter.init()



In [2]:
# Open the source package that this notebook belongs to; `pkg` is used by the
# cells below to resolve references, read the codebook and update the schema.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()

# NOTE(review): presumably puts the package directory on sys.path so that the
# package-local `pylib` module imported below can be found — confirm.
pkg.set_sys_path()

from pylib.codebook import get_codebook_text

# Display the package as the cell output.
pkg

In [3]:
# Resolve the 'gss_2021_src' package reference down to a local filesystem path.
fn = pkg.reference('gss_2021_src').resolved_url.get_resource().get_target().fspath
# Load the Stata file into a DataFrame (pandas warns below about falling back
# to latin-1 for strings that are not valid utf-8).
df = pd.read_stata(fn)
# Store the survey year as a plain integer column.
df['year'] = df['year'].astype(int)
df.head()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df = pd.read_stata(fn)


Unnamed: 0,year,id,wrkstat,hrs1,hrs2,evwork,wrkslf,wrkgovt,occ10,prestg10,...,biblenv,postlifenv,kidssolnv,uscitznnv,fucitznnv,fepolnv,scibnftsv,abanyg,fileversion,vietdraft
0,2021,1,working full time,36.0,,,someone else,,receptionists and information clerks,38.0,...,inspired word,no,somewhat worse,,,,about equal (phone mode only: volunteered),no,7221.32,
1,2021,2,working full time,45.0,,,someone else,,advertising and promotions managers,57.0,...,,,,,,,benefits greater,,7221.32,
2,2021,3,working part time,16.0,,,someone else,,miscellaneous assemblers and fabricators,35.0,...,,,,,,disagree,,yes,7221.32,
3,2021,4,working part time,27.0,,,self-employed,,childcare workers,35.0,...,,,,,,,,,7221.32,
4,2021,6,working full time,42.0,,,someone else,,insurance claims and policy processing clerks,38.0,...,inspired word,yes,,a u.s. citizen,,disagree,,yes,7221.32,


In [4]:
# Get the variable descriptions out of the codebook

def parse_codebook_descriptions(lines):
    """Extract ``{VARIABLE_NAME: description}`` from the codebook text lines.

    The codebook is scanned line by line with a small state machine:

    * ``Variable: NAME`` starts a new variable with an empty description.
    * ``Label: text`` starts the description; every following non-empty line
      is a continuation of it.
    * A line starting with ``LABEL``, ``VALUE COUNT`` or ``Notes:`` ends the
      description and closes the current variable.

    Args:
        lines: iterable of codebook text lines.

    Returns:
        dict mapping each variable name, as written in the codebook, to its
        description. The pieces of a description are joined with single
        spaces; a variable with no label text maps to ``''``.
    """
    parts_by_var = {}
    var_name = None
    in_label = False

    for raw in lines:
        line = raw.strip()

        if line.startswith('Variable:'):
            # partition() keeps everything after the first colon, so a name
            # line is never rejected for containing a second ': '.
            var_name = line.partition(':')[2].strip() or None
            in_label = False
            if var_name is not None:
                parts_by_var[var_name] = []

        elif line.startswith('Label:'):
            # A label is only meaningful while a variable is open.
            in_label = var_name is not None
            text = line.partition(':')[2].strip()
            if in_label and text:
                parts_by_var[var_name].append(text)

        elif line.startswith(('LABEL', 'VALUE COUNT', 'Notes:')):
            # Start of the value table or the notes: the description is over.
            in_label = False
            var_name = None

        elif in_label and line:
            parts_by_var[var_name].append(line)

    return {name: ' '.join(parts) for name, parts in parts_by_var.items()}


lines = get_codebook_text(pkg)

d = parse_codebook_descriptions(lines)

# Column names in the dataset are lower case, the codebook uses upper case.
variables_df = pd.DataFrame(
    [dict(column=k.lower(), desc=v) for k, v in d.items()],
    columns=['column', 'desc'],
)
variables_df.head()

Unnamed: 0,column,desc
0,wrkstat,"Last week were you working full time, part ti..."
1,cohort,Birth cohort of respondent.
2,zodiac,ASTROLOGICAL SIGN OF RESPONDENT
3,hrs1,"IF WORKING, FULL OR PART TIME: How many hours..."
4,hrs2,"IF WITH A JOB, BUT NOT AT WORK: How many hour..."


In [5]:
from pandas.api.types import is_categorical_dtype

def cat_map(s):
    """Return a ``{code: label}`` dict for the categorical Series ``s``.

    Codes and values are paired row by row instead of enumerating
    ``s.cat.categories``; that is slower, but it also picks up the code
    associated with NaN values, which the enumeration would miss.
    """
    codes = s.cat.codes
    return {code: label for code, label in zip(codes, s)}

# A categorical column whose number of categories exceeds this fraction of the
# number of rows is treated as an identifier, not as a coded answer.
ID_CATEGORY_RATIO = .5

rows = []
for c in df.columns:
    s = df[c]

    # isinstance() replaces the deprecated pandas.api.types.is_categorical_dtype
    if not isinstance(s.dtype, pd.CategoricalDtype):
        continue

    # Multiplying instead of dividing keeps an empty frame from raising
    # ZeroDivisionError.
    if len(s.cat.categories) > ID_CATEGORY_RATIO * len(s):
        # Convert identifier columns from categories to ints
        df[c] = s.cat.codes.astype(int)
    else:
        # One output row per (code, label) pair seen in the column
        for k, v in cat_map(s).items():
            rows.append({
                'column': c,
                'label': v,
                'code': k
            })

# Explicit columns keep the frame's shape stable even when no rows were collected
labels_df = pd.DataFrame(rows, columns=['column', 'label', 'code'])
labels_df.head()
        

Unnamed: 0,column,label,code
0,wrkstat,working full time,0
1,wrkstat,working part time,1
2,wrkstat,retired,4
3,wrkstat,other,7
4,wrkstat,in school,5


In [8]:
# Show the variable descriptions table (pandas truncates the middle rows).
variables_df

Unnamed: 0,column,desc
0,wrkstat,"Last week were you working full time, part ti..."
1,cohort,Birth cohort of respondent.
2,zodiac,ASTROLOGICAL SIGN OF RESPONDENT
3,hrs1,"IF WORKING, FULL OR PART TIME: How many hours..."
4,hrs2,"IF WITH A JOB, BUT NOT AT WORK: How many hour..."
...,...,...
997,sample,The GSS has employed various sampling procedu...
998,oversamp,Weight for Black oversamples.
999,phase,
1000,spaneng,Interviews Conducted in Spanish or English


In [None]:
# Just saving the variables because we can. The data will always be accessed from the metadata
# Paths are relative to the notebook's working directory. to_csv() is called
# with its defaults, so the DataFrame index is written as the first column.
variables_df.to_csv('../data/variables.csv')
labels_df.to_csv('../data/labels.csv')

In [None]:
# Update the column descriptions
#
# Fill in the 'Description' of every column of the 'gss_2021' schema that does
# not have one yet, using the text parsed from the codebook. Descriptions that
# are already set are left alone, and the package is written only when
# something actually changed.

# column name -> description, as parsed from the codebook
descs = dict(zip(variables_df['column'], variables_df['desc']))
changes = 0
st = pkg.resource('gss_2021').schema_term

for c in st.children:
    col_name = c.name

    desc_t = c.get_or_new_child('Description')

    # Never overwrite an existing description
    if desc_t.value:
        continue

    # Columns missing from the codebook, or with an empty label, get nothing:
    # writing None or '' would count as a change without adding information.
    new_desc = (descs.get(col_name) or '').strip()
    if new_desc:
        changes += 1
        desc_t.value = new_desc


print(changes, 'changes')
if changes:
    pkg.write()


In [None]:
# Final check of the variable descriptions table.
variables_df

In [None]:
# Final check of the first rows of the code/label table.
labels_df.head()