In [1]:
import pandas as pd

In [2]:
# Load csv with no additional arguments
data = pd.read_csv("../dataset/vt_tax_data_2016.csv")

# Print the data types
print(data.dtypes)

STATEFIPS     int64
STATE        object
zipcode       int64
agi_stub      int64
N1            int64
              ...  
A85300        int64
N11901        int64
A11901        int64
N11902        int64
A11902        int64
Length: 147, dtype: object


In [4]:
# Create dict specifying data types for agi_stub and zipcode
data_types = {"agi_stub": "category", "zipcode": "str"}

# Load csv using dtype to set correct data types
data = pd.read_csv("../dataset/vt_tax_data_2016.csv", dtype=data_types)

# Print data types of resulting frame
print(data.dtypes.head())

STATEFIPS       int64
STATE          object
zipcode        object
agi_stub     category
N1              int64
dtype: object


In [10]:
# Create dict specifying that 0s in zipcode are NA values
null_values = {"0", "N/A", "NA"}
print(type(null_values))

# Load csv using na_values keyword argument
data = pd.read_csv("../dataset/vt_tax_data_2016.csv", na_values={'zipcode': null_values})

# View rows with NA ZIP codes
# print(data[data.zipcode.isna()])
print(data)

<class 'set'>
      STATEFIPS STATE  zipcode  agi_stub      N1  mars1  MARS2  MARS4   PREP  \
0            50    VT      NaN         1  111580  85090  14170  10740  45360   
1            50    VT      NaN         2   82760  51960  18820  11310  35600   
2            50    VT      NaN         3   46270  19540  22650   3620  24140   
3            50    VT      NaN         4   30070   5830  22190    960  16060   
4            50    VT      NaN         5   39530   3900  33800    590  22500   
...         ...   ...      ...       ...     ...    ...    ...    ...    ...   
1471         50    VT  99999.0         2    2010   1300    410    240    970   
1472         50    VT  99999.0         3    1070    500    460     90    590   
1473         50    VT  99999.0         4     650    170    450     30    370   
1474         50    VT  99999.0         5     750    120    620     30    470   
1475         50    VT  99999.0         6     180     40    170      0    180   

          N2  ...  N10300

In [11]:
# Create dict specifying that 0s in zipcode are NA values
null_values = {"zipcode": 0}

# Load csv using na_values keyword argument
data = pd.read_csv("../dataset/vt_tax_data_2016.csv", 
                   na_values=null_values)

# View rows with NA ZIP codes
print(data[data.zipcode.isna()])

   STATEFIPS STATE  zipcode  agi_stub      N1  mars1  MARS2  MARS4   PREP  \
0         50    VT      NaN         1  111580  85090  14170  10740  45360   
1         50    VT      NaN         2   82760  51960  18820  11310  35600   
2         50    VT      NaN         3   46270  19540  22650   3620  24140   
3         50    VT      NaN         4   30070   5830  22190    960  16060   
4         50    VT      NaN         5   39530   3900  33800    590  22500   
5         50    VT      NaN         6    9620    600   8150      0   7040   

       N2  ...  N10300  A10300  N85530  A85530  N85300  A85300  N11901  \
0  130630  ...   53660   50699       0       0       0       0   10820   
1  132950  ...   74340  221146       0       0       0       0   12820   
2   91870  ...   44860  266097       0       0       0       0   10810   
3   71610  ...   29580  264678       0       0       0       0    7320   
4  103710  ...   39170  731963      40      24       0       0   12500   
5   26430  ...  

In [15]:
try:
  # Set warn_bad_lines to issue warnings about bad records
  data = pd.read_csv("../dataset/vt_tax_data_2016.csv", 
                     on_bad_lines='warn')
  
  # View first 5 records
  print(data.head())
  
except pd.errors.ParserError:
    print("Your data contained rows that could not be parsed.")

   STATEFIPS STATE  zipcode  agi_stub      N1  mars1  MARS2  MARS4   PREP  \
0         50    VT        0         1  111580  85090  14170  10740  45360   
1         50    VT        0         2   82760  51960  18820  11310  35600   
2         50    VT        0         3   46270  19540  22650   3620  24140   
3         50    VT        0         4   30070   5830  22190    960  16060   
4         50    VT        0         5   39530   3900  33800    590  22500   

       N2  ...  N10300  A10300  N85530  A85530  N85300  A85300  N11901  \
0  130630  ...   53660   50699       0       0       0       0   10820   
1  132950  ...   74340  221146       0       0       0       0   12820   
2   91870  ...   44860  266097       0       0       0       0   10810   
3   71610  ...   29580  264678       0       0       0       0    7320   
4  103710  ...   39170  731963      40      24       0       0   12500   

   A11901  N11902  A11902  
0    9734   88260  138337  
1   20029   68760  151729  
2   2449