-
Notifications
You must be signed in to change notification settings - Fork 41
/
pyPNAD.py
90 lines (68 loc) · 2.65 KB
/
pyPNAD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
"""
pyPNAD
October 2017 release
The purpose of this code is to import PNAD and PNADC microdata,
released by the Brazilian Office of Statistics (IBGE), into a
pandas DataFrame in a simple and straightforward fashion.
This code was originally written by Lincoln de Sousa.
The original code can be found on https://github.com/clarete/pnad
It was then simplified and updated by Carlos Góes in October 2017.
The procedure is quite simple. The load() function requires two
parameters. You can call it by using the following steps:
pyPNAD.load(data_file, input_file)
* data_file is the raw text file that holds the microdata of a
PNAD or PNADC release.
* input_file is a SAS variable dictionary, which is a companion file to
the microdata and contains variable names, text positions, and lengths.
"""
import io
import pandas as pd
class pyPNAD:
    """Import PNAD / PNADC fixed-width microdata into a pandas DataFrame.

    Every helper is a static method, so the usual entry point is simply
    ``pyPNAD.load(data_file, input_file)``; calling the helpers through an
    instance works as well.
    """

    @staticmethod
    def get_var(line):
        """Parse one ``@``-prefixed line of the SAS variable dictionary.

        The expected layout is ``@<position> <name> <size> /* <label> */``,
        with the fields separated by any run of whitespace. The label is
        optional.

        Args:
            line: a single text line of the dictionary file.

        Returns:
            A dict with the keys ``'name'`` (str), ``'position'`` (int),
            ``'size'`` (int) and ``'comment'`` (str, the label without the
            ``/* */`` delimiters; empty when the line has no label).

        Raises:
            ValueError: if the line has fewer than three fields, or if the
                position or size field is not numeric.
        """
        # Split on any whitespace (spaces or tabs); whatever follows the
        # third field is kept in one piece as the label.
        fields = line.split(None, 3)
        if len(fields) < 3:
            raise ValueError(
                'malformed dictionary line, expected '
                '"@position name size [/* label */]": %r' % (line,)
            )
        position, variable, size = fields[:3]
        comment = fields[3] if len(fields) > 3 else ''

        return {
            'name': variable,
            'position': int(position.replace('@', '')),
            # The size field looks like "$2." or "8."; drop the "$" prefix
            # and let float() absorb the trailing dot before truncating.
            'size': int(float(size.replace('$', ''))),
            'comment': comment.replace('/*', '').replace('*/', '').strip(),
        }

    @staticmethod
    def get_vars(varsfile):
        """Parse every variable line of a SAS variable dictionary.

        Args:
            varsfile: an iterable of text lines (for instance an open file).

        Returns:
            A list of the dicts produced by ``get_var``, one per line that
            starts with ``'@'``, in file order. All other lines (including
            empty ones) are skipped.
        """
        # startswith() is a value comparison and is safe on empty strings,
        # unlike indexing the first character and comparing it with ``is``.
        return [
            pyPNAD.get_var(line)
            for line in varsfile
            if line.startswith('@')
        ]

    @staticmethod
    def col_widths(vars_file):
        """Read the column names and widths from a dictionary file.

        Args:
            vars_file: path of the SAS variable dictionary.

        Returns:
            A ``(columns, widths)`` tuple of two parallel lists: the variable
            names and their integer sizes, in file order.
        """
        # The context manager closes the dictionary file even if parsing
        # fails part-way through.
        with io.open(vars_file) as vars_fp:
            variables = pyPNAD.get_vars(vars_fp)
        columns = [var['name'] for var in variables]
        widths = [var['size'] for var in variables]
        return columns, widths

    @staticmethod
    def load(data_file, input_file):
        """Load the microdata into a pandas DataFrame.

        Args:
            data_file: the raw fixed-width microdata file.
            input_file: path of the companion SAS variable dictionary.

        Returns:
            A DataFrame with one column per dictionary variable.

        Note:
            Only the sizes are handed to ``read_fwf`` (as consecutive
            ``widths``); the parsed ``@position`` values are not used, so the
            dictionary is assumed to describe contiguous fields.
        """
        columns, widths = pyPNAD.col_widths(input_file)
        return pd.read_fwf(
            data_file, widths=widths, header=None, names=columns
        )

    def __init__(self):
        """Record release metadata on the instance."""
        self.release = 'October 2017'
        self.version = '2.0'
        self.author = 'Lincoln de Souza & Carlos Góes'