In [6]:
import pandas as pd
import numpy as np

# Input files, given as paths relative to the notebook's working directory.
census_file = 'census_starter.csv'
mbd_file = 'mbd_train.csv'

# Load both tables into DataFrames; later cells refer to them as `census` and `mbd`.
census, mbd = (pd.read_csv(csv_path) for csv_path in (census_file, mbd_file))


In [8]:

# Parse the month column as datetimes so it sorts and ranks chronologically.
mbd['first_day_of_month'] = pd.to_datetime(mbd['first_day_of_month'])
# rename to make it easier to type
mbd = mbd.rename(columns={'microbusiness_density': 'mbd'})

# Back out the population from the count of active microbusinesses and the MBD
# (inverts mbd = active / pop * 100). Rows where mbd is 0 give NaN or inf here.
mbd['pop'] = mbd['active'] / mbd['mbd'] * 100

mbd = mbd.sort_values(by=['cfips', 'first_day_of_month'], ascending=[True, True])

# Rank only the date column within each county. Ranking the whole frame and
# then selecting one column does the work for every column, and can trip over
# the non-numeric ones.
month_by_county = mbd.groupby('cfips')['first_day_of_month']
# zero-based position of each month within its county, oldest first
mbd['date_order'] = month_by_county.rank() - 1
# the same position counted from the most recent month backwards
mbd['date_order_desc'] = month_by_county.rank(ascending=False) - 1

# Month-over-month change in MBD, computed per county so the first month of
# every county is NaN instead of a difference against the previous county.
mbd['delta'] = mbd.groupby('cfips')['mbd'].diff()



In [4]:
# Build the "20-month lag" table: for every county (cfips) and every possible
# starting month, one row of consecutive monthly MBD values - 20 months of
# history followed by the month to predict - and write it to CSV.

# counties, in the order they appear in mbd
cfips = pd.unique(mbd['cfips'])
# use 20 months of training to find the next month's data
train_number_months = 20
test_number_months = 1
total_number_months = train_number_months + test_number_months

# one row per county, one column per date_order, values are the MBD
mbd_pivotted = mbd.pivot(index='cfips', columns='date_order', values='mbd')

# date_order is zero-based, so the number of months is the largest label + 1
number_of_months = int(mbd['date_order'].max()) + 1
# Number of full windows that fit: starts 0 .. number_of_months - total_number_months.
# The previous formula (max(date_order) - total_number_months) stopped two
# windows short, so the two most recent months never reached the output.
iterations_per_cfip = max(number_of_months - total_number_months + 1, 0)

# get the counties to loop thru
cfips_to_loop = cfips

# every possible starting month
iters = list(range(iterations_per_cfip))

# county id for each output row (rows are grouped by county, then by start)
corresponding_cfips = [num for num in cfips_to_loop for _ in iters]
# starting date_order for each output row; 0 is the earliest month in mbd
# (the original author noted this as 2019-08-01)
corresponding_starting_month = [i for _ in cfips_to_loop for i in iters]

# Select the month columns by label so a gap in date_order raises a KeyError
# rather than silently shifting the windows.
monthly_values = mbd_pivotted.loc[cfips_to_loop, list(range(number_of_months))].to_numpy(dtype=float)
if iters:
    # shape (counties, windows, months per window); flattening the first two
    # axes gives the same county-major row order as a nested loop would
    windows = np.lib.stride_tricks.sliding_window_view(monthly_values, total_number_months, axis=1)
    mbd_exploded_np = windows.reshape(-1, total_number_months)
else:
    # fewer months available than one full window needs
    mbd_exploded_np = np.empty((0, total_number_months))

# prepend the county id and the starting month as the first two columns
to_csv_matrix = np.insert(mbd_exploded_np, 0, corresponding_cfips, axis=1)
to_csv_matrix = np.insert(to_csv_matrix, 1, corresponding_starting_month, axis=1)
csv_headers = ['CFIP','FirstDateOrder'] + ['MBD_' + str(i) for i in range(total_number_months)]
np.savetxt("MBD_CFIP_20monthLag.csv", to_csv_matrix, delimiter=",", header=",".join(csv_headers), comments="")


In [9]:
# Preparation for the yearly rows: work out which date_order values fall in
# January and which calendar year each date_order belongs to.

# every distinct county id
cfips = pd.unique(mbd['cfips'])
# calendar month and calendar year of each observation
mbd['month_num'] = mbd['first_day_of_month'].dt.month
mbd['year'] = mbd['first_day_of_month'].dt.year.astype(int)
# Group only to enumerate the (date_order, month_num, year) combinations that
# occur; the counts themselves are never read.
date_order_monthy_num = mbd.groupby(['date_order', 'month_num','year']).count()
# lookup tables keyed by integer date_order
month_num_to_date_index = {}
year_to_date_index = {}
# each index entry is a (date_order, month_num, year) tuple
for order_label, month_label, year_label in date_order_monthy_num.index:
    order_key = int(order_label)
    month_num_to_date_index[order_key] = month_label
    year_to_date_index[order_key] = int(year_label)
# keep only the date_order values whose month is January
starting_indexes_to_grab = [
    order_key
    for order_key, month_label in month_num_to_date_index.items()
    if month_label == 1
]

# Reshape the wide census table (one column per attribute-and-year) into one
# row per (cfips, year) with one column per attribute, so it can be matched to
# the mbd data by county and year.
census_long = census.melt(id_vars='cfips', var_name='year_column', value_name='value')
census_long = census_long.assign(
    # the last four characters of the original column name are the year
    year=census_long['year_column'].str[-4:].astype(int),
    # the second underscore-separated token of the column name is the attribute
    attribute=census_long['year_column'].str.split('_', expand=True)[1],
).drop(columns=['year_column'])
# one column per attribute, then a clearer name for the 'hh' column
census_pivoted = (
    census_long
    .pivot(index=['cfips', 'year'], columns='attribute', values='value')
    .reset_index()
    .rename(columns={'hh':'medianIncome'})
)
# census attributes appended to every output row, in this order
census_columns_to_add = ['bb','college','foreign','medianIncome','it']

# Build the "year lag plus census" table: for each county and each January
# starting point, one row of consecutive monthly MBD values (12 months of
# history plus the month to predict) followed by the census attributes of the
# starting year, and write it to CSV.
train_number_months = 12
test_number_months = 1
total_number_months = train_number_months + test_number_months

# one row per county, one column per date_order, values are the MBD
mbd_pivotted = mbd.pivot(index='cfips',columns='date_order',values='mbd')
# get the counties to loop thru
cfips_to_loop = cfips

# Keep only the January starts whose full window fits inside the available
# months. Dropping the last January unconditionally is only correct when
# exactly one trailing window is too short.
last_date_order = int(mbd_pivotted.columns.max())
iters = [
    start for start in starting_indexes_to_grab
    if start + total_number_months - 1 <= last_date_order
]

# county id for each output row (rows are grouped by county, then by start)
corresponding_cfips = [num for num in cfips_to_loop for _ in iters]
# starting date_order for each output row; 0 is the earliest month in mbd
# (the original author noted this as 2019-08-01)
corresponding_starting_month = [i for _ in cfips_to_loop for i in iters]

# Start from NaN so anything that cannot be filled in is visibly missing.
n_rows = len(corresponding_cfips)
mbd_census_exploded = np.full((n_rows, total_number_months + len(census_columns_to_add)), np.nan)

if n_rows:
    # MBD part: fill all counties at once for each starting month.
    monthly_values = mbd_pivotted.loc[cfips_to_loop]
    for offset, start in enumerate(iters):
        window_columns = list(range(start, start + total_number_months))
        # rows offset, offset + len(iters), ... are the rows of this starting
        # month, one per county, because rows are laid out county by county
        mbd_census_exploded[offset::len(iters), :total_number_months] = (
            monthly_values.loc[:, window_columns].to_numpy()
        )

    # Census part: look up (cfips, year) for every output row in one go.
    # A county/year pair with no census row becomes NaN instead of raising a
    # shape error part-way through the export.
    census_lookup = census_pivoted.set_index(['cfips', 'year'])[census_columns_to_add]
    row_keys = pd.MultiIndex.from_arrays(
        [corresponding_cfips, [year_to_date_index[i] for i in corresponding_starting_month]],
        names=['cfips', 'year'],
    )
    mbd_census_exploded[:, total_number_months:] = census_lookup.reindex(row_keys).to_numpy(dtype=float)

# prepend the county id and the starting month as the first two columns
to_csv_matrix = np.insert(mbd_census_exploded, 0, corresponding_cfips, axis=1)
to_csv_matrix = np.insert(to_csv_matrix, 1, corresponding_starting_month, axis=1)
csv_headers = ['CFIP','FirstDateOrder'] + ['MBD_' + str(i) for i in range(total_number_months)] + census_columns_to_add
np.savetxt("MBD_YearLag_plusCensus.csv", to_csv_matrix, delimiter=",", header=",".join(csv_headers), comments="")
