## Goal

The goal of this notebook is to parse the PDF data dictionary (codebook) for the How Couples Meet and Stay Together (HCMST) survey. 

The survey's author was unable to provide a cleaner version of the data dictionary, which I need in order to analyze the dataset. 

I will try to map each question's short name (the variable name) to the text of the question that was asked. 

In [1]:
import PyPDF2
import os 
import sys
import re
import requests

## Download File and Import 

In [2]:
pdf_link = 'https://stacks.stanford.edu/file/druid:vt073cc9067/HCMST_2017_fresh_Codeboodk_v1.1a.pdf'

In [3]:
# Fetch the codebook PDF. The timeout keeps the cell from hanging forever on a
# stalled connection.
resp_file = requests.get(pdf_link, timeout=60)
# Turn 4xx/5xx answers into an exception so that an error page is never saved
# to disk and later handed to the PDF parser.
resp_file.raise_for_status()
pdf_data = resp_file.content

In [4]:
file_loc = '../docs/HCMST_2017_fresh_Codeboodk_v1.1a.pdf'

In [5]:
# Create the target folder if it is missing; a fresh checkout may not have it.
os.makedirs(os.path.dirname(file_loc), exist_ok=True)
# Store the downloaded bytes on disk.
with open(file_loc, 'wb') as pdf_out:
    pdf_out.write(pdf_data)

In [6]:
# Re-open the saved PDF in binary mode and wrap it in a PyPDF2 reader.
# NOTE(review): `hnd` is never closed anywhere in this notebook. Later cells
# read pages through `pdfReader`, so presumably the handle has to stay open
# until the text has been extracted -- confirm, then close it afterwards.
hnd = open(file_loc, 'rb')
pdfReader = PyPDF2.PdfFileReader(hnd)

## Working with the File and Extracting Text

In [7]:
pdfReader.numPages

121

In [8]:
pg1 = pdfReader.getPage(0)

In [9]:
pg1.extractText().split('\n')

['',
 'HCMST 2017 fresh sample, public data Version 1.1: February 2, 2019 Data Codebook.    . describe *, fullnames  ',
 '              storage   display    value ',
 'variable name   type    format     label      variable label ',
 '------------------------------------------------------------------------------- ',
 'CaseID          int     %8.0g                 Case ID ',
 'CASEID_NEW      long    %12.0g                Longitudnal CaseID ',
 'qflag           byte    %8.0g      QFLAG      DOV: Qualification Flag ',
 'weight1         double  %12.0g                Post-Stratification weight for ',
 '                                                Genpop (n=2994) ',
 'weight1_freqwt  float   %9.0g                 wgt to CPS adult pop, scaled down ',
 '                                                by                                                 2994/3110=round(weight1*2436295                                                 > 95/3110) ',
 'weight2         double  %12.0g                

In [10]:
# Collect the text of the first 11 pages as one flat list of lines
# (each page's text is split on newlines, pages kept in order).
all_pages = [
    text_line
    for page_index in range(11)
    for text_line in pdfReader.getPage(page_index).extractText().split('\n')
]

In [11]:
len(all_pages)

575

In [12]:
# Remove header/ intro: the first five entries are the empty page separator,
# the title line, the two column-heading lines and the dashed rule.
# NOTE: this rebinds `all_pages` to a slice of itself, so running the cell a
# second time drops five more lines; re-run the extraction cell first.
all_pages = all_pages[5:]

In [13]:
# Drop the entries that are completely empty strings (no characters at all)
all_pages = list(filter(lambda text_line: text_line != '', all_pages))

In [14]:
len(all_pages)

560

In [15]:
all_pages

['CaseID          int     %8.0g                 Case ID ',
 'CASEID_NEW      long    %12.0g                Longitudnal CaseID ',
 'qflag           byte    %8.0g      QFLAG      DOV: Qualification Flag ',
 'weight1         double  %12.0g                Post-Stratification weight for ',
 '                                                Genpop (n=2994) ',
 'weight1_freqwt  float   %9.0g                 wgt to CPS adult pop, scaled down ',
 '                                                by                                                 2994/3110=round(weight1*2436295                                                 > 95/3110) ',
 'weight2         double  %12.0g                Post-Stratification weight for ',
 '                                                LGB (n=551) ',
 'weight1a        double  %12.0g                Post-Stratification weight for ',
 '                                                total consented Genpop ',
 '                                                respondent

In [16]:
from collections import OrderedDict, Counter

## Begin Parsing Text

In [17]:
def define_new_item(line, definition_start=46):
    """Turn the first line of a codebook entry into a one-item dict.

    Parameters
    ----------
    line : str
        A codebook line that begins with the variable name.
    definition_start : int, optional
        Zero-based column at which the label text begins. Defaults to 46,
        the offset that used to be hard-coded here.

    Returns
    -------
    dict
        ``{variable_name: label_text}`` where ``variable_name`` is everything
        before the first space and ``label_text`` is ``line[definition_start:]``
        (an empty string when the line is shorter than that).
    """
    key = line.split(' ')[0]
    definition = line[definition_start:]
    return {key: definition}

In [18]:
def find_phrase(sentence, term = 'byte'):
    """Return, in order, every space-separated word of ``sentence`` that contains ``term``."""
    matches = []
    for word in sentence.split(' '):
        if term in word:
            matches.append(word)
    return matches

In [19]:
# Walk the codebook lines in order and build {variable name: variable label}.
# A line starting with a non-space character opens a new entry; a line starting
# with a space continues the label of the most recently inserted entry.
items = OrderedDict()
for line in all_pages:
    is_continuation = line[0] == ' '

    if not is_continuation:
        # New definition
        items.update(define_new_item(line))

    # The entry being worked on is whichever key was inserted last.
    last_key = next(reversed(items))

    if is_continuation:
        # Additional definition on next line, should be added to last item.
        # 46 is the same label column offset define_new_item uses.
        # Runs of two or more whitespace characters are deleted outright (not
        # collapsed to a single space), then any remaining tab is removed.
        continued_text = re.sub(r'\s\s+', '', line[46:])
        continued_text = re.sub(r'\t', '', continued_text)
        items[last_key] = items[last_key] + continued_text

    last_item = items[last_key]
    if 'byte' not in last_item:
        # Only labels containing 'byte' are repaired here; labels fused with a
        # float/int row are left untouched by this loop.
        continue

    # 'byte' inside a label is treated as the storage type of another variable
    # whose row was fused into this text. Cut the label at the first word that
    # contains 'byte' ...
    byte_term = find_phrase(last_item, term='byte')
    byte_term_location = last_item.find(byte_term[0])
    items[last_key] = last_item[:byte_term_location]

    # ... and register the remainder as its own entry: the text before
    # 'byte' becomes the key, the text after the last digit that follows
    # 'byte' becomes the label.
    new_term = last_item[byte_term_location:]
    new_term_parts = re.split(r'byte.*\d', new_term)
    items[new_term_parts[0]] = new_term_parts[1]
        

## Begin Identifying Missed Cases

In [20]:
# Labels that still contain a '%' (e.g. a leftover format code) are kept for inspection
pct_in_str = list(filter(lambda label: '%' in label, items.values()))

In [21]:
len(items)

260

In [22]:
len(pct_in_str)

7

In [23]:
items

OrderedDict([('CaseID', 'Case ID '),
             ('CASEID_NEW', 'Longitudnal CaseID '),
             ('qflag', 'DOV: Qualification Flag '),
             ('weight1', 'Post-Stratification weight for Genpop (n=2994) '),
             ('weight1_freqwt',
              'wgt to CPS adult pop, scaled down by2994/3110=round(weight1*2436295> 95/3110) '),
             ('weight2', 'Post-Stratification weight for LGB (n=551) '),
             ('weight1a',
              'Post-Stratification weight for total consented Genpop respondents (n=3110) '),
             ('weight1a_freqwt',
              'weighted up to CPS adult population, =round(weight1a*243629595/3110) '),
             ('weight_combo',
              'weight that combines all LGB subjects weighted down, with gen pop weight_combo_freqwtfloat%9.0gfrequency weight version of weight_combo '),
             ('duration', 'Interview duration in minutes '),
             ('speed_flag',
              'Respondents who completed survey in under 2 minute

In [24]:
pct_in_str

['weight that combines all LGB subjects weighted down, with gen pop weight_combo_freqwtfloat%9.0gfrequency weight version of weight_combo ',
 'whether month first met has been allocated w6_q21b_yearint%8.0gyear subject began romanticrelationship w partner ',
 '=1 if month of first cohab israndomly allocated w6_q21d_yearint%8.0gyear subject married partner ',
 'num of people met through phone apps last year, from w6_how_many_app_all, w6_oth weight_combo_v2 float%9.0ga different ver of theweight_combo var that weights LGB a little too highly ',
 ' %9.0g                 = w6_q21b_year+((                                                 w6_q21b_month-0.5)/12) ',
 "%8.0g      yesno      Respondent's residential                                                 neighbor: intermediary or Partner ",
 "RECODE of w6_q11 (partner's mother's Education) subject_mother_yrsedfloat%9.0gRECODE of w6_q14 (Subject's mother's educational attainment) "]

In [25]:
items['interracial_5cat']

'based on w6_subject_race and w6_q6b '

## Manual Updates  

The entries immediately before and after each faulty entry also have to be updated.  
The errors appear to come from two-line entries where the first line of the definition is on the following line.  
After another inspection, more manual updates were added.

In [26]:
# Hand-written labels for the entries the parser fused together or cut short.
# Keys are variable names, values are the corrected labels.
manual_update = {
    "weight_combo": "weight that combines all LGB subjects weighted down, with gen pop",
    "weight_combo_freqwt": "frequency weight version of weight_combo",
    "w6_q21a_month_flag": "whether month first met has been allocated",
    "w6_q21b_year": "year subject began romantic relationship w partner",
    "w6_q21c_month_flag": "=1 if month of first cohab is randomly allocated",
    "w6_q21d_year": "year subject married partner",
    "w6_otherdate_app_all": "not incl current partner (if partnered) did you use phone app last year to meet",
    "w6_how_many_app_all": "how many ppl did you meet (not including partner) last year through phone apps?",
    "year_fraction_relstart": "= w6_q21b_year+(( w6_q21b_month-0.5)/12)",
    "hcm2017q24_R_neighbor": "Respondent's residential neighbor: intermediary or Partner",
    "partner_mother_yrsed": "RECODE of w6_q11 (partner's mother's Education)",
    # The parsed codebook text for this variable ends in "educational
    # attainment)"; the label used to stop at "educational".
    "subject_mother_yrsed": "RECODE of w6_q14 (Subject's mother's educational attainment)",
}

In [27]:
# Second batch of hand-written labels, keyed by variable name.
updates2 = {
    "PPREG4": "Region 4 - Based on State of Residence",
    "Past_Partner_Q1": "Were you ever married to [Partner name]?",
    "Q27": "Did you and [Partner name] grow up in the same city or town?",
    "hcm2017q24_R_cowork": "Respondent's coworker: indermediary or partner",
    "hcm2017q24_R_family": "Respondent's family: intermediary",
    "hcm2017q24_btwn_I_neighbor": "intermediaries are neighbors",
    "hcm2017q24_internet_game": "met through online gaming",
    "hcm2017q24_met_through_as_nghbrs": "=1 if R_neighbor or P_neighbor=1",
    "hcm2017q24_public": "met in public place",
    "ppagecat": "Age - 7 Categories",
    "ppagect4": "Age - 4 Categories",
    "race15": "Q14: Some other race",
    "race3": "Q14: American Indian or Alaska Native",
    "w6_attraction": "gender of attraction, based on Q17C and Q17D, and ppgender",
    "w6_breakup_nonmar": "Between you and [Partner name], who wanted more to break up?",
    "w6_friend_connect_1_all": "subject knew partner's friends before meeting partner",
    "w6_friend_connect_3": "[One (or more) of my friends knew one (or more) of [Partner name]'s friends befo",
    "w6_how_many_app_2": "Of the [Selected response from w6_how_many] you met in the past year, how many d",
    "w6_q11": "partner's mother's Education",
    "w6_q12": "partner political affiliation",
    "w6_same_sex_couple_gender": "based on w6_same_sex_couple and ppgender",
    "w6_took_the_survey": "whether subject took the survey, or was excluded",
    "weight_combo_v2": "a different ver of the weight_combo var that weights LGB a little too highly",
}

In [28]:
# Lay the hand-written corrections over the parsed entries.
items.update(manual_update)
items.update(updates2)
# Remove the two keys that are not variable names: the empty name and the
# dashed rule. pop(..., None) keeps the cell re-runnable, whereas `del` raises
# KeyError once the keys are already gone.
for junk_key in ('', '-------------------------------------------------------------------------------'):
    items.pop(junk_key, None)

In [29]:
len(items)

285

## Compare missing from Dataset  

Will need to download data from website and unzip to open file  
Will need to consent to data usage terms  
https://stacks.stanford.edu/file/druid:hg921sg6829/HCMST_2017_public_data_v1.1_stata.zip

In [30]:
import pandas as pd

In [31]:
data = pd.read_stata('../data/HCMST 2017 fresh sample for public sharing draft v1.1.dta')

In [32]:
data.shape

(3510, 285)

In [33]:
data_dict = pd.Series(items)

In [34]:
data_dict.head()

CaseID                                                     Case ID 
CASEID_NEW                                      Longitudnal CaseID 
qflag                                      DOV: Qualification Flag 
weight1             Post-Stratification weight for Genpop (n=2994) 
weight1_freqwt    wgt to CPS adult pop, scaled down by2994/3110=...
dtype: object

In [35]:
# Names found in only one of the two: dataset columns vs. data dictionary keys
set(data.columns) ^ set(data_dict.index)

set()

In [36]:
# Save the variable-name -> label mapping as CSV in the same ../data folder the
# survey file is read from.
data_dict.to_csv('../data/final_data_dict.csv')