In [9]:
import os
import pandas as pd
import numpy as np
from datetime import timedelta

# Function to convert blank values to -1
def space_to_int(value):
    """Parse a raw CSV field as an integer, mapping blank fields to -1.

    Parameters
    ----------
    value : str
        Raw field text handed over by the ``read_csv`` converter hook.

    Returns
    -------
    int
        ``-1`` when the field is empty or contains only whitespace,
        otherwise ``int(value)``.

    Raises
    ------
    ValueError
        If the field is non-blank but is not a valid integer literal.
    """
    # Treat any whitespace-only field (including the empty string) as missing,
    # not just a single space; int('') and int('  ') would otherwise raise.
    if isinstance(value, str) and not value.strip():
        return -1
    return int(value)

# Path to data file and information to process data file
data_file = 'C:/Users/Lucas/Documents/NDACAN SRI/Data Request/fc_all.csv'
# Positional indices of the only columns that are read from the file
columns_to_use = (0,3,6,7,24,45,46,47,91,95)
# Builtin `bool` is used instead of `np.bool`: that alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
data_types = {'RecNumbr':str, 'Entered':bool}
# These columns go through space_to_int so blank fields become -1
convert_dict = {i:space_to_int for i in ['TOTALREM','AgeAtLatRem','CTKFAMST','CTK1YR','CTK2YR'] }

# Load data with reasonable chunksize iterator
chunksize_param = 500000

data_load = pd.read_csv(
    data_file,
    usecols=columns_to_use,
    dtype=data_types,
    converters=convert_dict,
    chunksize=chunksize_param)

# Collect the chunks first and concatenate once at the end
data_list = []

for i, data in enumerate(data_load):
    # Progress indicator: number of rows read before this chunk
    print(str(i*chunksize_param))
    data_list.append(data)

print('Concatenating...')
data_df = pd.concat(data_list)
print('Complete!')

# Rename the first column because of the byte-order mark at the beginning of the file
# (a no-op when the column is already named 'DataYear')
data_df.rename(columns={'\ufeffDataYear':'DataYear'},inplace=True)

# Test that it all worked
data_df.head()

0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000
8500000
9000000
9500000
10000000
10500000
11000000
Concatenating...
Complete!


Unnamed: 0,DataYear,St,FIPSCODE,RecNumbr,TOTALREM,CTKFAMST,CTK1YR,CTK2YR,AgeAtLatRem,Entered
0,2000,AL,8,1907,1,5,1963,1963,0,False
1,2000,AL,8,1997,1,2,1959,1959,1,False
2,2000,AL,1073,2014,2,5,1959,1959,27,False
3,2000,AL,8,2074,3,3,1949,1959,5,False
4,2000,AL,8,3214,1,2,1949,1962,0,False


In [10]:
# Flag under-18 entries and split them into first entries, re-entries, and all entries

# Rows marked as entered whose age at latest removal is below 18.
# NOTE(review): blank AgeAtLatRem fields are loaded as -1 by space_to_int and
# therefore also satisfy `< 18` here -- confirm that is intended.
entered_LT18 = data_df['Entered'] & data_df['AgeAtLatRem'].lt(18)

total_removals = data_df['TOTALREM']
data_df['First'] = total_removals.eq(1) & entered_LT18    # exactly one removal
data_df['Reentry'] = total_removals.gt(1) & entered_LT18  # more than one removal
data_df['All'] = total_removals.ge(1) & entered_LT18      # at least one removal

In [16]:
# Approximate each caretaker's age as the data year minus the caretaker's birth year

data_year = data_df['DataYear']
data_df['CTK1_Age'] = data_year.sub(data_df['CTK1YR'])
data_df['CTK2_Age'] = data_year.sub(data_df['CTK2YR'])

def age_missing(value):
    """Return ``value`` unchanged, or -1 when it is below 0 or above 100.

    Ages outside the 0..100 range are treated as missing and replaced by
    the -1 marker.
    """
    out_of_range = value < 0 or value > 100
    return -1 if out_of_range else value
    
# Replace out-of-range ages (below 0 or above 100) with the -1 missing marker.
# Series.mask applies the same rule as age_missing in one vectorized pass
# instead of one Python function call per row.
for age_col in ('CTK1_Age', 'CTK2_Age'):
    data_df[age_col] = data_df[age_col].mask(
        (data_df[age_col] < 0) | (data_df[age_col] > 100), -1)

In [27]:
def age_ranges(value):
    """Placeholder that is not implemented yet: ignores ``value`` and returns None."""
    return None

# Count how often each CTK1_Age value occurs among rows flagged 'All',
# ordered by age (the -1 missing marker is counted as well)
dataset_name = 'entries_primaryCaretakerAges'

result_df = (
    data_df.loc[data_df['All'], 'CTK1_Age']
    .value_counts()
    .sort_index()
)

# Write the counts to a csv file named after the dataset
output_name = 'C:/Users/Lucas/Documents/NDACAN SRI/Analysis/' + dataset_name + '.csv'
result_df.to_csv(output_name)

In [None]:
dataset_name = 'entries&first&re_by_year_state'

# Count first entries, re-entries, and all entries per (DataYear, St) by summing
# the boolean flag columns. The columns are selected with a list: selecting them
# with a bare tuple (['First','Reentry','All'] without the inner brackets) was
# deprecated and is rejected by pandas >= 1.0.
result_df = data_df.groupby(['DataYear','St'])[['First','Reentry','All']].sum()
# Share of entries that are first entries (a fraction, not multiplied by 100)
result_df['Percent_First'] = result_df['First'] / result_df['All']
# Move DataYear from the row index into the columns -> (field, year) column pairs
result_df = result_df.unstack(0)

# Change in each field between the 2000 and 2014 data years
for field in ['First','Reentry','All','Percent_First']:
    result_df[field,'Diff'] = result_df[field,2014] - result_df[field,2000]

# Save to csv file
output_name = 'C:/Users/Lucas/Documents/NDACAN SRI/Analysis/' + dataset_name + '.csv'
result_df.to_csv(output_name)