# Data Dictionary

Build the data ditionary files

In [63]:
%load_ext autoreload
%autoreload 2
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

from pathlib import Path


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()

pkg.set_sys_path()

from pylib.codebook import get_codebook_text

pkg

In [3]:
root_dir = Path(pkg.package_url.fspath)
data_dir = root_dir / 'data'
cache_dir = root_dir / 'cache'

if not cache_dir.exists():
    cache_dir.mkdir()

In [4]:
fn = pkg.reference('gss_2021_src').resolved_url.get_resource().get_target().fspath
df = pd.read_stata(fn)
df['year'] = df['year'].astype(int)
df.head()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df = pd.read_stata(fn)


Unnamed: 0,year,id,wrkstat,hrs1,hrs2,evwork,wrkslf,wrkgovt,occ10,prestg10,...,biblenv,postlifenv,kidssolnv,uscitznnv,fucitznnv,fepolnv,scibnftsv,abanyg,fileversion,vietdraft
0,2021,1,working full time,36.0,,,someone else,,receptionists and information clerks,38.0,...,inspired word,no,somewhat worse,,,,about equal (phone mode only: volunteered),no,7221.32,
1,2021,2,working full time,45.0,,,someone else,,advertising and promotions managers,57.0,...,,,,,,,benefits greater,,7221.32,
2,2021,3,working part time,16.0,,,someone else,,miscellaneous assemblers and fabricators,35.0,...,,,,,,disagree,,yes,7221.32,
3,2021,4,working part time,27.0,,,self-employed,,childcare workers,35.0,...,,,,,,,,,7221.32,
4,2021,6,working full time,42.0,,,someone else,,insurance claims and policy processing clerks,38.0,...,inspired word,yes,,a u.s. citizen,,disagree,,yes,7221.32,


In [5]:
# Get the variable descriptions out of the codebook
from tqdm.auto import tqdm 

lines = get_codebook_text(pkg)

d = {}
pp = 0
state = None
for l in tqdm(lines):
    l = l.strip()
    
    if l.startswith('Variable:'):
        state = 'var'
        _, var_name = l.split(': ')
       
        #if var_name == 'WIDOWED':
        #    pp = 30
    
        d[var_name] = ''
        
    elif l.startswith('Label:'):
        state = 'label'
        try:
            _, text = l.split(': ', 1)
            d[var_name] += ' '+text.strip()
        except ValueError:
            pass
        except KeyError:
            #print('Err:', l)
            pass
                 
    elif l.startswith('LABEL') or l.startswith('VALUE COUNT') or l.startswith('Notes:') :
        state = None
        var_name = None
        
    elif state == 'label':
        d[var_name] += ' '+l.strip()
 
    if pp > 0:
        pp -= 1
        print(state, l)
        
variables_df = pd.DataFrame([ dict(column=k.lower(), desc=v) for k, v in d.items() ])
variables_df.sample(10).head(10)

Codebook Path /Volumes/SSD_Extern/metapack/gss.norc.org/Documents/stata/2021_stata.zip_d/GSS 2021 Codebook.pdf


  0%|          | 0/117697 [00:00<?, ?it/s]

Unnamed: 0,column,desc
615,gender8,Gender of eighth person
367,spanself,If this interview had only been available in ...
512,raceacs14,Other Pacific Islander
934,evpaidsx,Thinking about the time since your 18th birth...
280,absingle,If she is not married and does not want to ma...
256,wksup,"At work, [do you/does your spouse] supervise ..."
285,pillok,"Do you strongly agree, agree, disagree, or st..."
989,rgroomed,HOW WELL-GROOMED IS THE RESPONDENT?
572,relate1,(FIRST PERSON) Relationship of first person t...
40,mawrkslf,"At this job, was [mother/stepmother/female re..."


In [19]:
from more_itertools import windowed

# create overlapping chunks of variables the overlap is 20% of the size of the block.
chunk_size = 10
step = chunk_size-5
df_chunks = list(windowed(variables_df.iterrows(), chunk_size, step=step))
print(len(variables_df), len(variables_df)/chunk_size, len(df_chunks))

def make_desc(chunk):
    return '\n'.join([f"{r[1].column}: {r[1].desc} " for r in chunk])

1002 100.2 200


In [20]:
import  pylib.descriptions as desc
from pylib.openai import openai_one_completion
import json

prompt_templ = Path(desc.__file__).parent.joinpath('rewrite_descriptions.txt').read_text()
descriptions = make_desc(df_chunks[1])
prompt = prompt_templ.format(descriptions=descriptions)

with open('prompt.txt', 'w') as f:
    f.write(prompt)

r = openai_one_completion(prompt)
for l in r.splitlines(): # Just check that they parse.
    json.loads(l)


In [21]:
print(r)

{"variable": "evwork", "desc": "IF RETIRED, IN SCHOOL, KEEPING HOUSE, OR OTHER: Did you ever work for as long as one year?" }
{"variable": "wrkslf", "desc": "Are you self employed or do you work for someone else?" }
{"variable": "occ10", "desc": "Respondent's occupation" }
{"variable": "prestg10", "desc": "Prestige of respondent's occupation" }
{"variable": "prestg105plus", "desc": "Respondent's occupational prestige score using threshold method" }
{"variable": "indus10", "desc": "Respondent's industry" }
{"variable": "marital", "desc": "Are you currently married, widowed, divorced, separated, or have you never been married?" }
{"variable": "martype", "desc": "Code type of marriage" }
{"variable": "divorce", "desc": "If currently married or widowed: Have you ever been divorced or legally separated?" }
{"variable": "widowed", "desc": "If currently married, separated, or divorced: Have you ever been widowed?" }


In [23]:

from tenacity import retry, stop_after_attempt

@retry(stop=stop_after_attempt(5))
def run_rewrite_descriptions(chunks):
    from pylib.openai import openai_one_completion
    import  pylib.descriptions as desc
    from pathlib import Path
    import pickle

    cache_file = cache_dir/'rw.cache'
    
    prompt_templ = Path(desc.__file__).parent.joinpath('rewrite_descriptions.txt').read_text()

    if cache_file.exists():
        with cache_file.open('rb') as f:
            print("Loading cached responses")
            c = pickle.load(f)
            responses = c['responses']
            last_chunk_index = c['last_chunk_index']
    else:
        responses = []
        last_chunk_index = 0 
    
    print("last_chunk_index", last_chunk_index)

    for i, chunk in enumerate(tqdm(chunks)):
        
        if i < last_chunk_index:
            continue

        descriptions = make_desc(chunk)
        prompt = prompt_templ.format(descriptions=descriptions)

        try:
            r = openai_one_completion(prompt)
            responses.append(r)
        except Exception as e:
            print("Openai error", e)
            raise # Let the retry decorator handle it

        for l in r.splitlines(): # Just check that they parse.
            try:
                json.loads(l)
            except Exception as e:
                print("Chunk", i, "Can't parse response", l,e)

        sv = {
            'last_chunk_index': i,
            'chunks': chunks,
            'responses': responses
        }
    
        with cache_file.open('wb') as f:
            pickle.dump(sv, f)
    
run_rewrite_descriptions(df_chunks)

Loading cached responses
last_chunk_index 1


  0%|          | 0/200 [00:00<?, ?it/s]

Can't parse response {"variable": "major1",  Expecting property name enclosed in double quotes: line 1 column 24 (char 23)
Can't parse response "desc":"What was your major or field of study when you received your (respondent's college degree) degree? If the respondent received more than one graduate level degree, ask about the highest degree obtained (e.g. a Ph.D. rather than an M.B.A)."} Extra data: line 1 column 7 (char 6)
Can't parse response { "variable": "racdif3",  Expecting property name enclosed in double quotes: line 1 column 26 (char 25)
Can't parse response "desc":  Extra data: line 1 column 7 (char 6)
Can't parse response "Do you think that on the average Negroes/Blacks/African-Americans have worse jobs, income, and housing than white people because most of them don't have the chance for education that it takes to rise out of poverty?" } Extra data: line 1 column 219 (char 218)
Can't parse response {"variable": Expecting value: line 1 column 13 (char 12)
Openai error Reques

  0%|          | 0/200 [00:00<?, ?it/s]

Can't parse response {"variable": "wordh", "desc":  We would like to know something about how people go about guessing words they do not know. On this card are listed some words - you may know some of them, and you may not Expecting value: line 1 column 32 (char 31)
Can't parse response { Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Can't parse response   "variable": "spsei10inc", Extra data: line 1 column 13 (char 12)
Can't parse response   "desc": "Percentage of $45K+ earners in SPOCC10 based on ACS 2010." Extra data: line 1 column 9 (char 8)
Can't parse response } Expecting value: line 1 column 1 (char 0)
Can't parse response { Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Can't parse response   "variable": "cosei10", Extra data: line 1 column 13 (char 12)
Can't parse response   "desc": "Respondent's partner's socioeconomic index (2010)." Extra data: line 1 column 9 (char 8)
Can't parse response } Expecting value: line 1 c

  0%|          | 0/200 [00:00<?, ?it/s]

Loading cached responses
last_chunk_index 198


  0%|          | 0/200 [00:00<?, ?it/s]

Loading cached responses
last_chunk_index 198


  0%|          | 0/200 [00:00<?, ?it/s]

RetryError: RetryError[<Future at 0x181e6cf40 state=finished raised TypeError>]

In [54]:
import json
import pickle

cache_file = cache_dir/'rw.cache'
with cache_file.open('rb') as f:
    print("Loading cached responses")
    c = pickle.load(f)
    responses = c['responses']
    last_chunk_index = c['last_chunk_index'] 

# Preload dict with original description
d = {}
for idx, r in variables_df.iterrows():
    d[r.column] = [r.desc.strip()]
    
for i, r in enumerate(c['responses']):
    for l in r.splitlines(): # Just check that they parse.
            try:
                e = json.loads(l)
                desc = e['desc'].strip()
                if desc != d[e['variable']][-1]:
                    d[e['variable']].append(e['desc'])
            except Exception as e:
                print("Chunk", i, "Can't parse response", l,e)

Loading cached responses
Chunk 11 Can't parse response {"variable": "major1",  Expecting property name enclosed in double quotes: line 1 column 24 (char 23)
Chunk 11 Can't parse response "desc":"What was your major or field of study when you received your (respondent's college degree) degree? If the respondent received more than one graduate level degree, ask about the highest degree obtained (e.g. a Ph.D. rather than an M.B.A)."} Extra data: line 1 column 7 (char 6)
Chunk 66 Can't parse response { "variable": "racdif3",  Expecting property name enclosed in double quotes: line 1 column 26 (char 25)
Chunk 66 Can't parse response "desc":  Extra data: line 1 column 7 (char 6)
Chunk 66 Can't parse response "Do you think that on the average Negroes/Blacks/African-Americans have worse jobs, income, and housing than white people because most of them don't have the chance for education that it takes to rise out of poverty?" } Extra data: line 1 column 219 (char 218)
Chunk 85 Can't parse respon

In [55]:
# Coalesce. Arbitrarily pick the last description
dl_rows = [ {'column':k,  'desc':v[-1]} for k, v in d.items() ]
variables_updated_df = pd.DataFrame(dl_rows)

# Just saving the variables because we can. The data will always be accessed from the metadata
variables_updated_df.to_csv('../data/variables.csv')
variables_updated_df.sample(10)

Unnamed: 0,column,desc
14,widowed,"If currently married, separated, or divorced, ..."
564,spsei10educ,Percentage of some college education in SPOCC1...
435,ethregion34,From what country or countries did your ancest...
552,dateintv,Date of interview.
927,matesex,Was one of the partners your husband or wife o...
829,trmedia,"On a scale of 0 to 10, how much do you persona..."
193,relitenv,Would the respondent call themselves a strong ...
80,adults,Number of members over 17 years old.
96,vote16,"In 2016, you remember that Hillary Clinton ran..."
557,pasei10,Respondent's father's socioeconomic index (2010).


In [60]:
descs['martype']

'Code type of marriage.'

In [65]:
# Update the column descriptions

descs = {r.column:r.desc for idx, r in variables_updated_df.iterrows() }
changes = 0
st = pkg.resource('gss_2021').schema_term

for c in st.children:
    col_name = c.name

    desc_t = c.get_or_new_child('Description')

    if desc_t.value != descs.get(col_name):
        changes += 1
        desc_t.value = descs.get(col_name)


print(changes, 'changes')
if changes:
    pkg.write()

679 changes


In [67]:
from pandas.api.types import is_categorical_dtype

def cat_map(s):
    """ Produce a dict of categorical mappings. 
    The faster alternative, dict( enumerate(df[c].cat.categories ) ), misses the codes assocated with NaNs"""
    
    return dict( zip( s.cat.codes, s ) )

rows = []
for c in df.columns:
    s = df[c]
    
    # Convert identifier columns from categories to ints
    if is_categorical_dtype(s.dtype):
        if float(len(s.cat.categories)) / len(s) > .5:
            df[c] = df[c].cat.codes.astype(int)
        else:
            for k, v in cat_map(df[c]).items():
                rows.append({
                    'column': c,
                    'label': v, 
                    'code': k
                })
            
labels_df = pd.DataFrame(rows)
labels_df.head()

labels_df.to_csv('../data/_labels.csv')