In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context.
sns.set_context('notebook')
# Metapack's Jupyter initialisation hook; what it configures is not visible here.
mp.jupyter.init()


In [2]:
# Open the metapack package this notebook works on. The commented-out line is
# the alternative opener; the source-package variant is the one in use.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()

# NOTE(review): presumably extends sys.path so that package-local modules
# (e.g. `pylib`, imported further down) can be found — confirm against metapack.
pkg.set_sys_path()

# Display the package as the cell's output.
pkg

In [39]:
# Resolve the package's 'gss_2021_src' reference to a local file path and load
# it as a Stata file.
# NOTE(review): pandas reports a fallback to latin-1 decoding for this file
# (see the warning in the cell output); string values should be spot-checked.
fn = pkg.reference('gss_2021_src').resolved_url.get_resource().get_target().fspath
df = pd.read_stata(fn)
# Store the survey year as a plain integer column.
df['year'] = df['year'].astype(int)
df.head()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df = pd.read_stata(fn)


Unnamed: 0,year,id,wrkslf,wrkgovt,occ10,prestg10,indus10,marital,martype,divorce,...,relitennv,biblenv,postlifenv,kidssolnv,uscitznnv,fucitznnv,fepolnv,scibnftsv,abanyg,fileversion
0,2021,1,someone else,,receptionists and information clerks,38.0,offices of dentists,married,,no,...,not very strong,inspired word,no,somewhat worse,,,,about equal (phone mode only: volunteered),no,7221.3
1,2021,2,someone else,,advertising and promotions managers,57.0,advertising and related services,divorced,,,...,no religion,,,,,,,benefits greater,,7221.3
2,2021,3,someone else,,miscellaneous assemblers and fabricators,35.0,furniture and home furnishings stores,never married,,,...,,,,,,,disagree,,yes,7221.3
3,2021,4,self-employed,,childcare workers,35.0,child day care services,widowed,,no,...,,,,,,,,,,7221.3
4,2021,6,someone else,,insurance claims and policy processing clerks,38.0,insurance carriers and related activities,never married,,,...,not very strong,inspired word,yes,,a u.s. citizen,,disagree,,yes,7221.3


year              int64
id             category
wrkslf         category
wrkgovt        category
occ10          category
                 ...   
fucitznnv      category
fepolnv        category
scibnftsv      category
abanyg         category
fileversion    category
Length: 735, dtype: object

In [5]:
# Local path of the package's 'gss_2021_codes' reference (the codebook file).
# Note that `fn` is the same name the data-loading cell uses for the Stata
# file path, so its meaning depends on which cell ran last.
fn = pkg.reference('gss_2021_codes').resolved_url.get_resource().get_target().fspath
fn

PosixPath('/Volumes/SSD_Extern/metapack/gss.norc.org/Documents/stata/2021_stata.zip_d/GSS 2021 Codebook.pdf')

In [6]:
# Project-local helper; how it extracts the text is defined in pylib.codebook,
# which is not shown in this notebook.
from pylib.codebook import get_codebook_text

# Codebook text as a sequence of lines (it is iterated line by line below).
lines = get_codebook_text(pkg)

# Number of lines; the trailing comment is a leftover slice used for eyeballing.
len(lines) # lines[100:200]


117697

In [7]:
# Get the variable descriptions out of the codebook.
#
# The codebook text is scanned line by line with a small state machine:
#   * 'Variable: NAME'                    starts a new entry in `d`
#   * 'Label: TEXT'                       starts the description; the lines
#                                         that follow continue it
#   * 'LABEL', 'VALUE COUNT' or 'Notes:'  end the current entry
# The result, `d`, maps variable name -> accumulated description text.
d = {}
pp = 0  # debugging aid: set > 0 to echo that many of the following lines
state = None
var_name = None  # defined up front so a stray 'Label:' cannot raise NameError

for l in lines:
    l = l.strip()

    if l.startswith('Variable:'):
        # Split on the first ': ' only, so a name or trailing text that itself
        # contains ': ' cannot break the unpacking.
        _, _, name = l.partition(': ')
        name = name.strip()
        if name:
            state = 'var'
            var_name = name
            d[var_name] = ''
        else:
            # A header without a name gives nothing to attach a label to.
            state = None
            var_name = None

    elif l.startswith('Label:'):
        state = 'label'
        # Everything after the first ': ' is description text. A bare 'Label:'
        # line has none; its text arrives on the continuation lines instead.
        _, _, text = l.partition(': ')
        text = text.strip()
        if var_name is not None and text:
            d[var_name] += ' ' + text

    elif l.startswith('LABEL') or l.startswith('VALUE COUNT') or l.startswith('Notes:'):
        state = None
        var_name = None

    elif state == 'label' and var_name is not None:
        # Continuation of a multi-line description.
        d[var_name] += ' ' + l

    if pp > 0:
        pp -= 1
        print(state, l)
        


In [8]:
# One row per codebook variable: lower-cased variable name plus its description.
variables_df = pd.DataFrame(
    [{'column': name.lower(), 'desc': text} for name, text in d.items()]
)
variables_df.head()

Unnamed: 0,column,desc
0,wrkstat,"Last week were you working full time, part ti..."
1,cohort,Birth cohort of respondent.
2,zodiac,ASTROLOGICAL SIGN OF RESPONDENT
3,hrs1,"IF WORKING, FULL OR PART TIME: How many hours..."
4,hrs2,"IF WITH A JOB, BUT NOT AT WORK: How many hour..."


In [9]:
from pandas.api.types import is_categorical_dtype

def cat_map(s):
    """Build a ``{code: label}`` dict for a categorical Series.

    Codes and values are paired row by row, so the code used for missing
    values is included too. The faster alternative,
    ``dict(enumerate(s.cat.categories))``, misses the codes associated with NaNs.
    """
    mapping = {}
    for code, label in zip(s.cat.codes, s):
        mapping[code] = label
    return mapping

# Walk every column of `df` and handle the categorical ones:
#   * identifier-like columns (more than half as many categories as rows)
#     are replaced in place by their integer category codes;
#   * all other categoricals contribute one (column, label, code) record per
#     distinct code to `rows`, from which `labels_df` is built.
rows = []
for c in df.columns:
    s = df[c]

    # `is_categorical_dtype` is deprecated in pandas; an isinstance check
    # against `pd.CategoricalDtype` is the documented replacement.
    if not isinstance(s.dtype, pd.CategoricalDtype):
        continue

    # Convert identifier columns from categories to ints.
    # Written as a multiplication so an empty frame cannot divide by zero.
    if len(s.cat.categories) > .5 * len(s):
        df[c] = s.cat.codes.astype(int)
    else:
        for k, v in cat_map(s).items():
            rows.append({
                'column': c,
                'label': v,
                'code': k
            })

labels_df = pd.DataFrame(rows)
labels_df.head()
        

Unnamed: 0,column,label,code
0,wrkslf,someone else,1
1,wrkslf,self-employed,0
2,wrkslf,,-1
3,wrkgovt,,-1
4,occ10,receptionists and information clerks,265


In [77]:
# Now that we've extracted the labels, convert to integers
# Every column still stored as a pandas categorical is replaced, in place, by
# its integer category codes.
# NOTE(review): this mutates `df`; re-running the label-extraction cell after
# this one would find no categorical columns left to extract labels from.
cat_cols =df.select_dtypes(include='category').columns

for c in cat_cols:
    df[c] = df[c].cat.codes

df.head()

Unnamed: 0,year,id,wrkslf,wrkgovt,occ10,prestg10,indus10,marital,martype,divorce,...,relitennv,biblenv,postlifenv,kidssolnv,uscitznnv,fucitznnv,fepolnv,scibnftsv,abanyg,fileversion
0,2021,0,1,-1,265,21,196,0,-1,1,...,1,1,1,3,-1,-1,-1,1,1,0
1,2021,1,1,-1,3,40,179,2,-1,-1,...,3,-1,-1,-1,-1,-1,-1,0,-1,0
2,2021,2,1,-1,341,18,108,4,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,0,0
3,2021,3,0,-1,223,18,208,1,-1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
4,2021,4,1,-1,282,21,166,4,-1,-1,...,1,1,0,-1,0,-1,1,-1,0,0


In [37]:
# Look at the labels that were extracted for the 'age' column.
labels_df[labels_df.column == 'age']

def label_type(labels):
    """Infer the narrowest Python type that can represent every label.

    Args:
        labels: Iterable of label values (strings, numbers, NaN, ...).

    Returns:
        ``int`` if every label converts with ``int()``, otherwise ``float`` if
        every label converts with ``float()``, otherwise ``str``. An empty
        input yields ``int`` because no label fails the conversion.
    """
    def is_num(c, v):
        # Only genuine conversion failures mean "not a number"; anything else
        # (KeyboardInterrupt, SystemExit, ...) must propagate.
        try:
            c(v)
            return True
        except (TypeError, ValueError, OverflowError):
            return False

    # Materialise once so a one-shot iterable can be scanned by both passes.
    labels = list(labels)

    if all(is_num(int, e) for e in labels):
        return int
    if all(is_num(float, e) for e in labels):
        return float
    return str
    


Unnamed: 0,column,label,code
2950,age,65.0,47
2951,age,60.0,42
2952,age,,-1
2953,age,33.0,15
2954,age,20.0,2
...,...,...,...
3018,age,77.0,59
3019,age,83.0,65
3020,age,19.0,1
3021,age,86.0,68


In [92]:
def isnan(v):
    """Return whether ``v`` is NaN according to ``math.isnan``.

    Values that ``math.isnan`` rejects with a ``TypeError`` (strings, ``None``,
    ...) are reported as not-NaN instead of raising.
    """
    from math import isnan as math_isnan

    try:
        result = math_isnan(v)
    except TypeError:
        result = False
    return result
    
def make_label_dict(g):
    """Map each code in ``g`` to its label, substituting 'NA' for NaN labels.

    ``g`` is a frame with ``code`` and ``label`` columns, e.g. the rows of
    ``labels_df`` that belong to one column.
    """
    mapping = {}
    for _, row in g.iterrows():
        label = row.label
        mapping[row.code] = 'NA' if isnan(label) else label
    return mapping

# Try the code -> label mapping on a single column ('divorce') before applying
# it to the whole frame.
# NOTE(review): despite its name, `age_df` holds the 'divorce' labels.
age_df = labels_df[labels_df.column == 'divorce']
# NOTE(review): this rebinds `d`, the name the codebook-parsing cell uses for
# the variable descriptions; re-running the `variables_df` cell after this one
# would read this label dict instead.
d = make_label_dict(age_df)

t = df.copy()

# Rename the categories of the copied column via `d`; the result is only
# displayed, not stored.
t['divorce'].astype('category').cat.rename_categories(d)


0       no
1       NA
2       NA
3       no
4       NA
        ..
4027    NA
4028    no
4029    no
4030    NA
4031    no
Name: divorce, Length: 4032, dtype: category
Categories (3, object): ['NA', 'yes', 'no']

In [93]:
def convert_to_categorical(df, labels_df):
    """Return a copy of ``df`` whose coded columns carry their text labels.

    For every column named in ``labels_df``, the column is cast to a
    categorical and its categories are renamed through that column's
    code -> label mapping (built by ``make_label_dict``). ``df`` itself is
    not modified.

    Args:
        df: Frame whose labelled columns hold category codes.
        labels_df: Frame with ``column``, ``code`` and ``label`` columns.

    Returns:
        The relabelled copy. A column that cannot be converted is left
        unchanged and reported on stdout; the conversion is best-effort.
    """
    t = df.copy()

    for col_name, g in labels_df.groupby('column'):
        d = make_label_dict(g)
        try:
            t[col_name] = t[col_name].astype('category').cat.rename_categories(d)
        except Exception as e:
            # Deliberately best-effort: report the failing column with its
            # mapping and carry on with the remaining columns.
            print(col_name, d, e)

    return t

# Show the relabelled frame; the returned copy is displayed but not kept.
convert_to_categorical(df, labels_df)

Unnamed: 0,year,id,wrkslf,wrkgovt,occ10,prestg10,indus10,marital,martype,divorce,...,relitennv,biblenv,postlifenv,kidssolnv,uscitznnv,fucitznnv,fepolnv,scibnftsv,abanyg,fileversion
0,2021,0,someone else,,receptionists and information clerks,38.0,offices of dentists,married,,no,...,not very strong,inspired word,no,somewhat worse,,,,about equal (phone mode only: volunteered),no,7221.3
1,2021,1,someone else,,advertising and promotions managers,57.0,advertising and related services,divorced,,,...,no religion,,,,,,,benefits greater,,7221.3
2,2021,2,someone else,,miscellaneous assemblers and fabricators,35.0,furniture and home furnishings stores,never married,,,...,,,,,,,disagree,,yes,7221.3
3,2021,3,self-employed,,childcare workers,35.0,child day care services,widowed,,no,...,,,,,,,,,,7221.3
4,2021,4,someone else,,insurance claims and policy processing clerks,38.0,insurance carriers and related activities,never married,,,...,not very strong,inspired word,yes,,a u.s. citizen,,disagree,,yes,7221.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,2021,4027,,,,,,never married,,,...,not very strong,ancient book,yes,,,,disagree,,yes,7221.3
4028,2021,4028,someone else,,customer service representatives,31.0,electronic shopping,married,,no,...,,,,,,,,,,7221.3
4029,2021,4029,someone else,,"managers, all other",39.0,pharmaceutical and medicine manufacturing,married,,no,...,,,,,,,,benefits greater,,7221.3
4030,2021,4030,someone else,,secondary school teachers,64.0,elementary and secondary schools,divorced,,,...,,inspired word,yes,,,,disagree,,yes,7221.3
