In [1]:
import os
import pandas as pd
import numpy as np
import random

n_drive = 'N:\Transfer\KSkvoretz\AHRQ\data\\02_SDoH\\USDA'
os.listdir(n_drive)

['DataDownload.xls', 'README.txt']

In [2]:
df = pd.read_excel(os.path.join(n_drive, 'DataDownload.xls'), sheet_name='Supplemental Data - County')
df = df[df.State == " Colorado"]
df = df[['FIPS ','State','County','Population Estimate, 2016']]
df.columns.values[0] = "FIPS"
df.columns.values[3] = 'population'
df['population'] = pd.to_numeric(df.population.str.replace(',',''))
print(df.shape)
print(df.head())

(64, 4)
     FIPS      State      County  population
244  8001   Colorado      Adams       498187
245  8003   Colorado    Alamosa        16654
246  8005   Colorado   Arapahoe       637068
247  8007   Colorado  Archuleta        12854
248  8009   Colorado       Baca         3568


# Create columns with random data

In [3]:
def create_rand_columns(prefix, number):
    raw_names = range(1,number+1)
    raw_names = [prefix+"_"+str(num) for num in raw_names]
    count = 0
    for col in raw_names:
        # make half of them a number, half of them a percent
        if count <= number/2 and prefix != 'sdoh_score':
            df[col] = np.random.randint(0, np.random.randint(1,1000), df.shape[0])
        else:
            df[col] = np.random.randint(0, 100, df.shape[0])/100
        count += 1

np.random.seed(44133)
create_rand_columns('demographics', 10)
create_rand_columns('sdoh_raw', 10)
create_rand_columns('outcome', 5)
create_rand_columns('sdoh_score', 6)
print(df.head())

     FIPS      State      County  population  demographics_1  demographics_2  \
244  8001   Colorado      Adams       498187              57             382   
245  8003   Colorado    Alamosa        16654             134              39   
246  8005   Colorado   Arapahoe       637068              89             351   
247  8007   Colorado  Archuleta        12854              64             329   
248  8009   Colorado       Baca         3568               6               9   

     demographics_3  demographics_4  demographics_5  demographics_6  ...  \
244              79             780              18              57  ...   
245             427             836             385             112  ...   
246             155             554              48              13  ...   
247             415             746             156              36  ...   
248             330             260             367              32  ...   

     outcome_2  outcome_3  outcome_4  outcome_5  sdoh_score_1 

In [4]:
df.to_csv(os.path.join(n_drive, '..', '..', 'sample_data.csv'), index = False)

# Create fake data dictionary

In [5]:
dictionary = pd.DataFrame({'column_name': df.columns.values})
dictionary['description'] = dictionary.column_name.str.replace('_',' ')
dictionary.loc[dictionary.column_name.str.contains('sdoh_score'),'description'] = ['Economic Stability',
                                                                                  'Neighborhood & Physical Environment',
                                                                                  'Education',
                                                                                  'Food',
                                                                                  'Community & Social Context',
                                                                                  'Health Care System']

In [6]:
def dict_flag(string):
    dictionary[string] = np.where(dictionary['column_name'].str.contains(string), 1, 0)
    
dict_flag('demographic')
dictionary.loc[dictionary.column_name == 'population','demographic'] = 1

dict_flag('sdoh_raw')
dict_flag('outcome')
dict_flag('sdoh_score')

print(dictionary.head(10))
print(dictionary.tail(10))

      column_name     description  demographic  sdoh_raw  outcome  sdoh_score
0            FIPS            FIPS            0         0        0           0
1           State           State            0         0        0           0
2          County          County            0         0        0           0
3      population      population            1         0        0           0
4  demographics_1  demographics 1            1         0        0           0
5  demographics_2  demographics 2            1         0        0           0
6  demographics_3  demographics 3            1         0        0           0
7  demographics_4  demographics 4            1         0        0           0
8  demographics_5  demographics 5            1         0        0           0
9  demographics_6  demographics 6            1         0        0           0
     column_name                          description  demographic  sdoh_raw  \
25     outcome_2                            outcome 2         

In [7]:
dictionary['data_type'] = ''

for ix, row in dictionary.iterrows():
    if ix > 3:
        if df[row['column_name']].between(0,1).all():
            if 'sdoh_score' in row['column_name']:
                data_type = 'aggregate'
            else:
                data_type = 'percentage'
        else:
            data_type = 'continuous'
    else:
        data_type = "ID"
    dictionary.loc[dictionary.column_name == row['column_name'], 'data_type'] = data_type

In [8]:
# Raw data incorporated into SDoH scores

flag_list = [1, 0, 1] + [0] * 7

for i in range(1,7):
    used_colname = 'used_sdoh_score_' + str(i)
    dictionary[used_colname] = 0
    random.shuffle(flag_list)
    dictionary.loc[dictionary.column_name.str.contains('sdoh_raw'), used_colname] = flag_list
    
print(dictionary.loc[dictionary.column_name.str.contains('sdoh_raw')])

    column_name  description  demographic  sdoh_raw  outcome  sdoh_score  \
14   sdoh_raw_1   sdoh raw 1            0         1        0           0   
15   sdoh_raw_2   sdoh raw 2            0         1        0           0   
16   sdoh_raw_3   sdoh raw 3            0         1        0           0   
17   sdoh_raw_4   sdoh raw 4            0         1        0           0   
18   sdoh_raw_5   sdoh raw 5            0         1        0           0   
19   sdoh_raw_6   sdoh raw 6            0         1        0           0   
20   sdoh_raw_7   sdoh raw 7            0         1        0           0   
21   sdoh_raw_8   sdoh raw 8            0         1        0           0   
22   sdoh_raw_9   sdoh raw 9            0         1        0           0   
23  sdoh_raw_10  sdoh raw 10            0         1        0           0   

     data_type  used_sdoh_score_1  used_sdoh_score_2  used_sdoh_score_3  \
14  continuous                  0                  0                  1   
15  continuou

In [9]:
print(dictionary)
dictionary.to_csv(os.path.join(n_drive, '..', '..', 'sample_dictionary.csv'), index = False)

        column_name                          description  demographic  \
0              FIPS                                 FIPS            0   
1             State                                State            0   
2            County                               County            0   
3        population                           population            1   
4    demographics_1                       demographics 1            1   
5    demographics_2                       demographics 2            1   
6    demographics_3                       demographics 3            1   
7    demographics_4                       demographics 4            1   
8    demographics_5                       demographics 5            1   
9    demographics_6                       demographics 6            1   
10   demographics_7                       demographics 7            1   
11   demographics_8                       demographics 8            1   
12   demographics_9                       demograph