In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context for figure scaling.
sns.set_context('notebook')
# Metapack's Jupyter initialisation hook — presumably wires up notebook
# integration; TODO confirm against the metapack documentation.
mp.jupyter.init()


In [2]:
# Open the Metapack package for this notebook. No arguments are passed, so the
# package is presumably located from the notebook's own context — TODO confirm.
# The commented-out line is the alternative opener, kept for reference.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()

# presumably adds package directories to sys.path so local modules import — TODO confirm
pkg.set_sys_path()

# Last expression of the cell: displays the package object's repr.
pkg

In [16]:
from pandas.api.types import is_categorical_dtype

# Walk from the 'gss_2021_src' reference to its target's fspath
# (presumably a local filesystem path to the downloaded Stata file).
fn = (
    pkg.reference('gss_2021_src')
    .resolved_url
    .get_resource()
    .get_target()
    .fspath
)

# Read the Stata file, then store the survey year as plain integers.
df = pd.read_stata(fn)
df['year'] = df['year'].astype(int)

def cat_map(s):
    """Build a ``{code: label}`` dict for a categorical Series.

    The codes and the values of ``s`` are walked in parallel, so every code
    that actually occurs in the data is recorded, including the code that
    pandas assigns to missing values. The faster alternative,
    ``dict(enumerate(s.cat.categories))``, would miss the code associated
    with NaNs.

    :param s: a pandas Series with a categorical dtype.
    :return: dict mapping each observed category code to its label.
    """

    mapping = {}
    for code, label in zip(s.cat.codes, s):
        # Repeated codes simply overwrite themselves with the same label.
        mapping[code] = label
    return mapping

# Weights, id, and a few others should not have been categorical; they are
# cast to float instead of having their labels extracted.
non_categorical_cols = ('wtssps', 'wtssnrps', 'id', 'vpsu', 'vstrat', 'sampcode', 'year', 'prestg10')

# One record per (column, code, label) triple found in the categorical columns.
rows = []
for c in df.columns:
    s = df[c]

    # Only categorical columns carry value labels. The isinstance check
    # replaces pandas.api.types.is_categorical_dtype, which is deprecated.
    if not isinstance(s.dtype, pd.CategoricalDtype):
        continue

    if c in non_categorical_cols:
        # Convert identifier/weight columns from categories to numbers.
        df[c] = s.astype(float)
    else:
        for k, v in cat_map(s).items():
            rows.append({
                'column': c,
                'label': v,
                'code': k
            })

# Explicit columns keep the frame well-formed even when no labels were found.
labels_df = pd.DataFrame(rows, columns=['column', 'label', 'code'])
labels_df.head()


One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df = pd.read_stata(fn)


Unnamed: 0,column,label,code
0,wrkstat,working full time,0
1,wrkstat,working part time,1
2,wrkstat,retired,4
3,wrkstat,other,7
4,wrkstat,in school,5


In [5]:
# The value labels have already been captured in labels_df, so each column
# that is still categorical is replaced by its integer category codes
# (missing values show up as -1 in the codes).
cat_cols = df.select_dtypes(include=['category']).columns

for c in cat_cols:
    df[c] = df[c].cat.codes

df.head()

Unnamed: 0,year,id,wrkstat,hrs1,hrs2,evwork,wrkslf,wrkgovt,occ10,prestg10,...,biblenv,postlifenv,kidssolnv,uscitznnv,fucitznnv,fepolnv,scibnftsv,abanyg,fileversion,vietdraft
0,2021,0,0,35,-1,-1,1,-1,265,21,...,1,1,3,-1,-1,-1,1,1,0,-1
1,2021,1,0,44,-1,-1,1,-1,3,40,...,-1,-1,-1,-1,-1,-1,0,-1,0,-1
2,2021,2,1,16,-1,-1,1,-1,341,18,...,-1,-1,-1,-1,-1,1,-1,0,0,-1
3,2021,3,1,27,-1,-1,0,-1,223,18,...,-1,-1,-1,-1,-1,-1,-1,-1,0,-1
4,2021,4,0,41,-1,-1,1,-1,282,21,...,1,0,-1,0,-1,1,-1,0,0,-1


In [6]:
# NOTE(review): this lookup of the 'age' labels is neither assigned nor the
# cell's last expression, so its result is discarded and nothing is displayed.
labels_df[labels_df.column == 'age']

def label_type(labels):
    """Infer the narrowest Python type that can represent every label.

    Each label is tried against ``int`` first, then ``float``; if neither
    conversion works for all labels the result is ``str``.

    :param labels: iterable of label values (strings, numbers, NaN, ...).
        It is materialised once, so one-shot iterators are handled.
    :return: the type object ``int``, ``float`` or ``str``.

    Note: an empty ``labels`` yields ``int``, because ``all()`` of an empty
    sequence is true.
    """

    def is_num(c, v):
        """Return True when ``c(v)`` succeeds without losing information."""
        try:
            converted = c(v)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures count as "not a number"; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            return False

        # int() silently truncates non-text numbers (int(1.5) == 1), so such a
        # value only qualifies as an int when the round trip is exact.
        if c is int and not isinstance(v, (str, bytes, bytearray)):
            return bool(converted == v)
        return True

    # Materialise once: the labels are scanned up to twice below.
    labels = list(labels)

    if all(is_num(int, e) for e in labels):
        return int
    elif all(is_num(float, e) for e in labels):
        return float
    else:
        return str
    


In [7]:
def isnan(v):
    """Report whether ``v`` is a NaN, tolerating non-numeric input.

    :param v: any value; values that ``math.isnan`` rejects with a
        ``TypeError`` (strings, ``None``, ...) are treated as "not NaN".
    :return: ``True`` only when ``math.isnan(v)`` is true, else ``False``.
    """
    import math

    try:
        result = math.isnan(v)
    except TypeError:
        result = False
    return result
    
def make_label_dict(g):
    """Turn one column's label rows into a ``{code: label}`` dict.

    :param g: DataFrame with ``code`` and ``label`` columns.
    :return: dict keyed by code; a NaN label is replaced by the string 'NA'.
    """
    mapping = {}
    for _, row in g.iterrows():
        mapping[row.code] = 'NA' if isnan(row.label) else row.label
    return mapping

# Try the relabelling on a single column. Despite its name, age_df holds the
# label rows for the 'divorce' column.
age_df = labels_df.loc[labels_df['column'] == 'divorce']
d = make_label_dict(age_df)

# Work on a copy so df itself is left untouched.
t = df.copy()

# Last expression of the cell: the 'divorce' values with their categories
# renamed through d (displayed only, not stored back into t).
t['divorce'].astype('category').cat.rename_categories(d)


0       no
1       NA
2       NA
3       no
4       NA
        ..
4027    NA
4028    no
4029    no
4030    NA
4031    no
Name: divorce, Length: 4032, dtype: category
Categories (3, object): ['NA', 'yes', 'no']

In [8]:
def convert_to_categorical(df, labels_df):
    """Return a copy of ``df`` with labelled columns turned back into categories.

    For every group of rows in ``labels_df`` sharing a ``column`` value, the
    matching column of the copy is cast to a categorical and its categories
    are renamed through that group's code-to-label dict.

    :param df: DataFrame whose labelled columns hold category codes.
    :param labels_df: DataFrame with ``column``, ``code`` and ``label`` columns.
    :return: a new DataFrame; ``df`` itself is not modified.

    A column whose conversion raises is reported with ``print`` and left as
    it was in the copy.
    """
    labelled = df.copy()

    for col_name, col_labels in labels_df.groupby('column'):
        code_to_label = make_label_dict(col_labels)
        try:
            as_category = labelled[col_name].astype('category')
            labelled[col_name] = as_category.cat.rename_categories(code_to_label)
        except Exception as err:
            # Best effort: report the failing column and carry on with the rest.
            print(col_name, code_to_label, err)

    return labelled

# Last expression of the cell: the relabelled frame is displayed but not
# assigned to a name, so it is not kept for later cells.
convert_to_categorical(df, labels_df)

Unnamed: 0,year,id,wrkstat,hrs1,hrs2,evwork,wrkslf,wrkgovt,occ10,prestg10,...,biblenv,postlifenv,kidssolnv,uscitznnv,fucitznnv,fepolnv,scibnftsv,abanyg,fileversion,vietdraft
0,2021,0,working full time,36.0,,,someone else,,receptionists and information clerks,38.0,...,inspired word,no,somewhat worse,,,,about equal (phone mode only: volunteered),no,7221.32,
1,2021,1,working full time,45.0,,,someone else,,advertising and promotions managers,57.0,...,,,,,,,benefits greater,,7221.32,
2,2021,2,working part time,16.0,,,someone else,,miscellaneous assemblers and fabricators,35.0,...,,,,,,disagree,,yes,7221.32,
3,2021,3,working part time,27.0,,,self-employed,,childcare workers,35.0,...,,,,,,,,,7221.32,
4,2021,4,working full time,42.0,,,someone else,,insurance claims and policy processing clerks,38.0,...,inspired word,yes,,a u.s. citizen,,disagree,,yes,7221.32,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,2021,4027,in school,,,no,,,,,...,ancient book,yes,,,,disagree,,yes,7221.32,
4028,2021,4028,"with a job, but not at work because of tempora...",,50.0,,someone else,,customer service representatives,31.0,...,,,,,,,,,7221.32,
4029,2021,4029,working full time,45.0,,,someone else,,"managers, all other",39.0,...,,,,,,,benefits greater,,7221.32,
4030,2021,4030,retired,,,yes,someone else,,secondary school teachers,64.0,...,inspired word,yes,,,,disagree,,yes,7221.32,draft lottery number made draft unlikely
