In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white')

import utils
from utils import decorate
from thinkstats2 import Pmf, Cdf

## Data Management

In [2]:
# Load the Stata export into `df`, then preview its first three rows.
df = pd.read_stata(filepath_or_buffer="./nsduh_stata/nsduh.DTA")
df.head(n=3)

Unnamed: 0,QUESTID2,filedate,cigever,cigofrsm,cigwilyr,cigtry,cigyfu,cigmfu,cigrec,CIG30USE,...,POVERTY3,toolong,troubund,PDEN10,COUTYP4,MAIIN102,AIIND102,ANALWT_C,vestr,verep
0,55235143,10/09/2018,1 - Yes,99 - LEGITIMATE SKIP,99 - LEGITIMATE SKIP,13,9999 - LEGITIMATE SKIP,99 - LEGITIMATE SKIP,4 - More than 3 years ago,93 - DID NOT USE CIGARETTES IN THE PAST 30 DAYS,...,3 - Income More Than 2X Fed Pov Thresh,2 - No,2 - No,1 - Segment in a CBSA with 1 million or more p...,1 - Large Metro,2 - Segment not in an American Indian area,2 - Census block not in an American Indian area,11203.888954,40043,1
1,13435143,10/09/2018,1 - Yes,99 - LEGITIMATE SKIP,99 - LEGITIMATE SKIP,15,9999 - LEGITIMATE SKIP,99 - LEGITIMATE SKIP,1 - Within the past 30 days,18,...,3 - Income More Than 2X Fed Pov Thresh,1 - Yes,2 - No,1 - Segment in a CBSA with 1 million or more p...,1 - Large Metro,2 - Segment not in an American Indian area,2 - Census block not in an American Indian area,9496.462244,40006,2
2,81345143,10/09/2018,1 - Yes,99 - LEGITIMATE SKIP,99 - LEGITIMATE SKIP,14,9999 - LEGITIMATE SKIP,99 - LEGITIMATE SKIP,1 - Within the past 30 days,10,...,3 - Income More Than 2X Fed Pov Thresh,2 - No,2 - No,1 - Segment in a CBSA with 1 million or more p...,1 - Large Metro,2 - Segment not in an American Indian area,2 - Census block not in an American Indian area,2943.702802,40030,2


## Variables

#### Heroin
herever (Have you ever, even once, used heroin?)  
herage (How old were you the first time you used heroin?)  
herrec (How long has it been since you last used heroin?)  
HERYRTOT (TOTAL # OF DAYS USED HEROIN IN PAST 12 MONTHS)  

#### Demographics
AGE2 (RECODE - FINAL EDITED AGE WITH DIFFERENT CODES)  
IREDUHIGHST2 (EDUCATION - RECODED IMPUTATION REVISED)  
EDUHIGHCAT Len : 1 RC-EDUCATION CATEGORIES  
WRKSTATWK2 (WORK SITUATION IN PAST WEEK - RECODE)  
IRSEX (1 - male, 2 - female)  
IRPINC3  #income bracket  
IRFAMIN3 Len : 1 RECODE - IMP.REVISED - TOT FAM INCOME  

#### Health Insurance
IRMCDCHP Len : 1 IMPUTATION REVISED CAIDCHIP  
IRPRVHLT Len : 1 PRIVATE HEALTH INSURANCE - IMPUTATION REVISED  
IRINSUR4 Len : 1 RC-OVERALL HEALTH INSURANCE - IMPUTATION REVISED  
HLTINNOS (COVERED BY HEALTH INSUR (NOT OTHERWISE SPECIFIED))  
ANYHLTI2 (COVERED BY ANY HEALTH INSURANCE - RECODE)  

#### Pain killer usage
oxycnanyyr #oxycontin misuse in past year  
pnrmainrsn #pain reliever reason to use  
oxcnnmyr #oxy not how doctor prescribed in past year  
oxcnnmage #age first abused oxy  
pnrrshook #if used pain relievers bc "hooked"  
pnrrsmain #main reason last pain reliever used (abuse)  
pnranylif #if prescription pain reliever ever used (any reason)  
IRPNRANYREC Len : 1 ANY PAIN RELIEVER RECENCY - IMPUTATION REVISED  
pnrnmrec  

In [3]:
# Show every column name of the raw frame as one flat list.
print(df.columns.tolist())

['QUESTID2', 'filedate', 'cigever', 'cigofrsm', 'cigwilyr', 'cigtry', 'cigyfu', 'cigmfu', 'cigrec', 'CIG30USE', 'CG30EST', 'CIG30AV', 'CIG30BR2', 'CIG30TPE', 'CIG30MEN', 'CIG30MLN', 'CIG30RO2', 'cigdlymo', 'cigage', 'cigdlyfu', 'cigdlmfu', 'CIG100LF', 'smklssevr', 'smklsstry', 'smklssyfu', 'smklssmfu', 'smklssrec', 'SMKLSS30N', 'SMKLSS30E', 'cigarevr', 'cigartry', 'cigaryfu', 'cigarmfu', 'cigarrec', 'CGR30USE', 'CI30EST', 'CGR30BR2', 'pipever', 'PIPE30DY', 'alcever', 'alctry', 'alcyfu', 'alcmfu', 'alcrec', 'alcyrtot', 'altotfg', 'alfqflg', 'albstway', 'aldaypyr', 'aldaypmo', 'aldaypwk', 'alcdays', 'AL30EST', 'aldysfg', 'ALCUS30D', 'ALCBNG30D', 'mjever', 'mjage', 'mjyfu', 'mjmfu', 'mjrec', 'mjyrtot', 'mrtotfg', 'mjfqflg', 'mrbstway', 'mrdaypyr', 'mrdaypmo', 'mrdaypwk', 'MJDAY30A', 'MR30EST', 'cocever', 'cocage', 'cocyfu', 'cocmfu', 'cocrec', 'cocyrtot', 'cctotfg', 'ccfqflg', 'ccbstway', 'ccdaypyr', 'ccdaypmo', 'ccdaypwk', 'COCUS30A', 'CC30EST', 'crkever', 'crkage', 'crkyfu', 'crkmfu', '

In [4]:
# Raw columns kept for the analysis. Order matters: it becomes the column
# order of the frame built from this list.
relevant_columns = [
    # file metadata
    "filedate",
    # heroin
    "herever", "herage", "herrec", "heryrtot",
    # demographics
    "AGE2", "IREDUHIGHST2", "eduhighcat", "WRKSTATWK2", "irsex",
    # health insurance
    "irmcdchp", "irprvhlt", "IRINSUR4", "hltinnos", "ANYHLTI2",
    # income
    "IRPINC3", "IRFAMIN3",
    # pain reliever / OxyContin
    "oxycnanyyr", "pnrmainrsn", "oxcnnmyr", "oxcnnmage",
    "iroxcnanyyr", "iroxcnnmyr",
    "pnrrshook", "pnrrsmain", "pnranylif", "irpnranyrec",
]

In [5]:
# Build the working frame from `df`, limited to the names in relevant_columns.
# NOTE(review): a name that is not a column of `df` presumably comes through
# as an all-NaN column instead of raising — confirm every name exists.
df1 = pd.DataFrame(data=df, columns=relevant_columns)

In [6]:
# Derived, human-readable categorical columns.
#
# Each entry is (new column in df1, source column in df, labels). The source
# column is cast to "category" and its categories are renamed positionally,
# so every label tuple must have exactly one label per existing category.
#
# rename_categories() is used instead of assigning to `.cat.categories`:
# the setter was deprecated in pandas 1.5 and is rejected by pandas 2.x, and
# assigning through `df1[col].cat` relied on mutating a fetched column in place.
#
#PRES = PRESCRIPTION
#REC = RECENCY
category_specs = [
    ("SEX", "irsex", ("Male", "Female")),
    ("AGE", "AGE2", ("12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22-23", "24-25", "26-29", "30-34", "35-49", "50-64", "65+")),
    ("PRES_PAINKILLER_USE", "irpnranyrec", ("Used within year", "Used in 12+ months", "Never used")),
    # NOTE(review): this reads the same source column (irpnranyrec) and uses
    # the same labels as PRES_PAINKILLER_USE, so the two columns are identical.
    # Looks like a copy-paste slip — confirm which source column was intended.
    ("PRES_PAINKILLER_MISUSE", "irpnranyrec", ("Used within year", "Used in 12+ months", "Never used")),
    ("REC_PAINKILLER_MISUSE", "irpnrnmrec", ("Misused in last 30 days", "Misused 31 days-12 months", "Misused before 12+ months", "Never misused")),
    ("REC_PAINKILLER_USE", "pnrnmrec", ("Within 30 days", "31 days-12 months", "Within more than 12 months", "Within 12 months", "Within lifetime", "Within 30 days (log assn)", "Unknown", "Never", "Blank")),
    ("ANY_PAINKILLER_LIFETIME", "pnranylif", ("Used", "Not used", "Used (assumed)", "Don't Know", "Refused", "Blank")),
    ("OXYCONTIN_USE", "iroxcnanyyr", ("Used in past year", "Not used in past year")),
    ("OXYCONTIN_MISUSED", "iroxcnnmyr", ("Misused - past year", "Did not misuse - past year")),
    ("INCOME", "IRPINC3", ("10k or less", "10-20k", "20-30k", "30-40k", "40-50k", "50-75k", "75k+")),
    ("FAMILY_INCOME", "IRFAMIN3", ("10k or less", "10-20k", "20-30k", "30-40k", "40-50k", "50-75k", "75k+")),
    ("EDUCATION", "eduhighcat", ("Less High School", "High School Grad", "Some Coll/Assoc Degree", "College Graduate", "12-17 year olds")),
    ("CAIDCHIP_HI", "irmcdchp", ("Has Medicaid/CHIP", "Doesn't have Medicaid/CHIP")),
    ("PRIVATE_HI", "irprvhlt", ("Has Private HI", "Doesn't have Private HI")),
    ("ANY_HI", "IRINSUR4", ("With HI", "Without HI")),
]

for new_col, source_col, labels in category_specs:
    # Read from the raw frame `df` (some sources are not in df1) and store the
    # relabelled result as a new column of df1.
    df1[new_col] = (
        df[source_col]
        .astype("category")
        .cat.rename_categories(list(labels))
    )

In [7]:
# Derived (relabelled) columns to carry into the cleaned frame, in output order.
relevant_columns = [
    "SEX",
    "AGE",
    "PRES_PAINKILLER_USE",
    "REC_PAINKILLER_USE",
    "REC_PAINKILLER_MISUSE",
    "PRES_PAINKILLER_MISUSE",
    "ANY_PAINKILLER_LIFETIME",
    "OXYCONTIN_USE",
    "OXYCONTIN_MISUSED",
    "INCOME",
    "FAMILY_INCOME",
    "EDUCATION",
    "CAIDCHIP_HI",
    "PRIVATE_HI",
    "ANY_HI",
]

In [8]:
# Keep only the derived columns named in relevant_columns, in that order.
df2 = pd.DataFrame(data=df1, columns=relevant_columns)

In [9]:
def has_hi(row):
    """Collapse a row's three insurance columns into a single label.

    The checks run in a fixed priority order and the first match wins:
    "CAIDCHIP_HI", then "PRIVATE_HI", then the two values of "ANY_HI".

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys "CAIDCHIP_HI", "PRIVATE_HI" and
        "ANY_HI" (for example a DataFrame row).

    Returns
    -------
    str
        One of "Has Medicaid/CHIP", "Has Private HI", "Has any other HI",
        "Without any HI", or the sentinel "wtf" when nothing matched.
    """
    # Medicaid/CHIP takes precedence, even if private cover is also present.
    if row["CAIDCHIP_HI"] == "Has Medicaid/CHIP":
        return "Has Medicaid/CHIP"

    if row["PRIVATE_HI"] == "Has Private HI":
        return "Has Private HI"

    # Neither of the specific kinds matched: fall back to the overall flag.
    if row["ANY_HI"] == "With HI":
        return "Has any other HI"
    if row["ANY_HI"] == "Without HI":
        return "Without any HI"

    # Sentinel for rows matching none of the expected values.
    return "wtf"

# Call has_hi once per row and store the resulting label as a new column.
df2["HEALTH_INSURANCE"] = df2.apply(func=has_hi, axis="columns")

In [10]:
# Persist the cleaned frame to a pickle file in the working directory.
df2.to_pickle(path='cleaned_nsduh.pkl')

In [11]:
# Reload the pickle written by this notebook and preview the first ten rows.
df3 = pd.read_pickle(filepath_or_buffer='cleaned_nsduh.pkl')
df3.head(n=10)

Unnamed: 0,SEX,AGE,PRES_PAINKILLER_USE,REC_PAINKILLER_USE,REC_PAINKILLER_MISUSE,PRES_PAINKILLER_MISUSE,ANY_PAINKILLER_LIFETIME,OXYCONTIN_USE,OXYCONTIN_MISUSED,INCOME,FAMILY_INCOME,EDUCATION,CAIDCHIP_HI,PRIVATE_HI,ANY_HI,HEALTH_INSURANCE
0,Male,65+,Never used,Never,Never misused,Never used,Not used,Not used in past year,Did not misuse - past year,75k+,75k+,College Graduate,Doesn't have Medicaid/CHIP,Has Private HI,With HI,Has Private HI
1,Male,35-49,Used in 12+ months,Never,Never misused,Used in 12+ months,Used,Not used in past year,Did not misuse - past year,30-40k,30-40k,College Graduate,Has Medicaid/CHIP,Doesn't have Private HI,With HI,Has Medicaid/CHIP
2,Male,35-49,Used in 12+ months,Never,Never misused,Used in 12+ months,Used,Not used in past year,Did not misuse - past year,50-75k,75k+,Some Coll/Assoc Degree,Doesn't have Medicaid/CHIP,Has Private HI,With HI,Has Private HI
3,Male,35-49,Used in 12+ months,Never,Never misused,Used in 12+ months,Used,Not used in past year,Did not misuse - past year,75k+,75k+,College Graduate,Doesn't have Medicaid/CHIP,Doesn't have Private HI,With HI,Has any other HI
4,Female,65+,Never used,Never,Never misused,Never used,Not used,Not used in past year,Did not misuse - past year,30-40k,40-50k,High School Grad,Doesn't have Medicaid/CHIP,Doesn't have Private HI,With HI,Has any other HI
5,Female,65+,Never used,Never,Never misused,Never used,Not used,Not used in past year,Did not misuse - past year,10k or less,10-20k,Less High School,Has Medicaid/CHIP,Has Private HI,With HI,Has Medicaid/CHIP
6,Female,19,Never used,Never,Never misused,Never used,Not used,Not used in past year,Did not misuse - past year,10k or less,75k+,High School Grad,Doesn't have Medicaid/CHIP,Has Private HI,With HI,Has Private HI
7,Female,22-23,Never used,Unknown,Never misused,Never used,Don't Know,Not used in past year,Did not misuse - past year,10k or less,30-40k,College Graduate,Doesn't have Medicaid/CHIP,Doesn't have Private HI,Without HI,Without any HI
8,Male,19,Used in 12+ months,Never,Never misused,Used in 12+ months,Used,Not used in past year,Did not misuse - past year,10k or less,10k or less,Less High School,Doesn't have Medicaid/CHIP,Doesn't have Private HI,Without HI,Without any HI
9,Male,12,Never used,Never,Never misused,Never used,Not used,Not used in past year,Did not misuse - past year,10k or less,10k or less,12-17 year olds,Has Medicaid/CHIP,Doesn't have Private HI,With HI,Has Medicaid/CHIP
