In [1]:
# This imports the Metadata
# I titled the Metadata file as "Official-Metadata"
import pandas
df = pandas.read_csv('Official-Metadata.csv',dtype=object)
import numpy as np  # Import numpy for NaN representation
df = df.replace(np.nan, '')

df

Unnamed: 0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock
0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139,1100,1001
1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025,1603,2000
2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073,900,2013
3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105,2500,2002
4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105,2500,2002
...,...,...,...,...,...
22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105,1700,3016
22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135,100700,3123
22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087,10900,2004
22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139,2300,2023


In [2]:
# This creates a new column of assigned or unassigned USDA urban/rural classifications
# This also provides clarification for the blank values
def assigned_or_unassigned(row):
    
    # This is testing the following: if CountyFIPS is Nan; if not, there is a confirmed rural/urban classification
    if pandas.notna(row['CountyFIPS']) and any(char.isdigit() for char in str(row['CountyFIPS'])):
        print('Assigned')
        return 'Assigned'
    else:
        print('Unassigned')
        return 'Unassigned'

df['Assigned-or-Unassigned-USDA-Classification'] = df.apply(assigned_or_unassigned, axis=1)

df

Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Unassigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Unassigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Unassigned
Unassigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
Assigned
As

Unnamed: 0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned-or-Unassigned-USDA-Classification
0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139,1100,1001,Assigned
1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025,1603,2000,Assigned
2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073,900,2013,Assigned
3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105,2500,2002,Assigned
4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105,2500,2002,Assigned
...,...,...,...,...,...,...
22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105,1700,3016,Assigned
22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135,100700,3123,Assigned
22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087,10900,2004,Assigned
22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139,2300,2023,Assigned


In [3]:
# This adds zeros to CensusTract values to make them all six digits long
# It also prevents NaN values from being shown
def add_zeros(value):
    if pandas.notna(value):
        return str(value).zfill(6)
    else:
        return ''

# This applies the function to all columns in the DataFrame
df['CensusTract'] = df['CensusTract'].apply(add_zeros)

df

Unnamed: 0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned-or-Unassigned-USDA-Classification
0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139,001100,1001,Assigned
1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025,001603,2000,Assigned
2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073,000900,2013,Assigned
3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105,002500,2002,Assigned
4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105,002500,2002,Assigned
...,...,...,...,...,...,...
22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105,001700,3016,Assigned
22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135,100700,3123,Assigned
22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087,010900,2004,Assigned
22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139,002300,2023,Assigned


In [4]:
# This defines a custom function to combine columns
def combine_columns(row):
    return str(row['CountyFIPS']) + row['CensusTract']

# Apply the custom function to create a new column
df['State-County-TractFIPSCode'] = df.apply(combine_columns, axis=1)

df

Unnamed: 0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned-or-Unassigned-USDA-Classification,State-County-TractFIPSCode
0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139,001100,1001,Assigned,55139001100
1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025,001603,2000,Assigned,55025001603
2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073,000900,2013,Assigned,55073000900
3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105,002500,2002,Assigned,55105002500
4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105,002500,2002,Assigned,55105002500
...,...,...,...,...,...,...,...
22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105,001700,3016,Assigned,55105001700
22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135,100700,3123,Assigned,55135100700
22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087,010900,2004,Assigned,55087010900
22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139,002300,2023,Assigned,55139002300


In [5]:
# Here we read in the file from the USDA classification
# I titled the RUCA file as "RUCA-Definitions"
ruca = pandas.read_csv('WI-RUCA-Definitions.csv',dtype=object)

ruca

Unnamed: 0,State-County FIPS Code,Select State,Select County,Total-FIPS-Code,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Tract Population, 2010","Land Area (square miles), 2010","Population Density (per square mile), 2010",Urban-or-Rural-USDA-Classification
0,55001,WI,Adams County,55001950201,10,10.0,1379,42.5,32.4,Rural
1,55001,WI,Adams County,55001950202,10,10.0,2674,180.6,14.8,Rural
2,55001,WI,Adams County,55001950400,10,10.0,4767,91.0,52.4,Rural
3,55001,WI,Adams County,55001950501,10,10.0,1469,44.1,33.3,Rural
4,55001,WI,Adams County,55001950502,10,10.0,4112,96.3,42.7,Rural
...,...,...,...,...,...,...,...,...,...,...
1389,55139,WI,Winnebago County,55139003400,1,1.0,3911,1.5,2589.30,Urban
1390,55139,WI,Winnebago County,55139003500,1,1.0,3064,1.0,2939.50,Urban
1391,55139,WI,Winnebago County,55139003600,1,1.0,4138,1.3,3199.50,Urban
1392,55139,WI,Winnebago County,55139003701,1,1.0,3690,0.8,4607.60,Urban


In [8]:
# Merge based on different column names
# See the specified column names from the example in Git
usda_merged_df = pandas.merge(df, ruca, left_on='State-County-TractFIPSCode', right_on='Total-FIPS-Code', how='left')

# This prevents Unassigned values from being dropped
usda_merged_df.fillna(value={'column_name_in_ruca': 'Unassigned'}, inplace=True)

# This removes NaN values
usda_merged_df = usda_merged_df.replace(np.nan, '')

usda_merged_df

Unnamed: 0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned-or-Unassigned-USDA-Classification,State-County-TractFIPSCode,State-County FIPS Code,Select State,Select County,Total-FIPS-Code,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Tract Population, 2010","Land Area (square miles), 2010","Population Density (per square mile), 2010",Urban-or-Rural-USDA-Classification
0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139,001100,1001,Assigned,55139001100,55139,WI,Winnebago County,55139001100,1,1.0,4215,1.0,4044.80,Urban
1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025,001603,2000,Assigned,55025001603,55025,WI,Dane County,55025001603,1,1.0,3414,0.1,50310.20,Urban
2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073,000900,2013,Assigned,55073000900,55073,WI,Marathon County,55073000900,1,1.0,2240,1.5,1486.20,Urban
3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105,002500,2002,Assigned,55105002500,55105,WI,Rock County,55105002500,1,1.0,2595,0.6,4182.70,Urban
4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105,002500,2002,Assigned,55105002500,55105,WI,Rock County,55105002500,1,1.0,2595,0.6,4182.70,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105,001700,3016,Assigned,55105001700,55105,WI,Rock County,55105001700,1,1.0,5385,1.2,4676.30,Urban
22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135,100700,3123,Assigned,55135100700,55135,WI,Waupaca County,55135100700,10,10.0,3359,47.2,71.1,Rural
22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087,010900,2004,Assigned,55087010900,55087,WI,Outagamie County,55087010900,1,1.0,2134,0.6,3292.50,Urban
22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139,002300,2023,Assigned,55139002300,55139,WI,Winnebago County,55139002300,2,2.0,5288,62.0,85.3,Urban


In [9]:
# This creates a new file that I titled "WI-Midwest-Sars-Cov-2-ASSIGNED-Metadata"
usda_merged_df.to_csv('WI-Midwest-Sars-Cov-2-ASSIGNED-Metadata.csv')

In [10]:
# This creates a new column that categorizes urban or rural based on USDA data
# The function also determines Urban or Rural classification
file_path = 'WI-Midwest-Sars-Cov-2-ASSIGNED-Metadata.csv'
df = pandas.read_csv(file_path)
def urban_or_rural_classification(row):
    primary_ruca_code = row['Primary RUCA Code 2010']
    
    if primary_ruca_code in [1, 2, 3, 4, 5, 6, 7,8]:
        return 'Urban'
    elif primary_ruca_code in [9, 10]:
        return 'Rural'
    else:
        return 'Unknown'

# Create the new column
df['Urban-or-Rural-USDA-Classification'] = df.apply(urban_or_rural_classification, axis=1)

#This removes NaN values
df = df.replace(np.nan, '')

# Save the modified DataFrame back to the CSV file
df.to_csv(file_path, index=False)

df

Unnamed: 0.1,Unnamed: 0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned-or-Unassigned-USDA-Classification,State-County-TractFIPSCode,State-County FIPS Code,Select State,Select County,Total-FIPS-Code,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Tract Population, 2010","Land Area (square miles), 2010","Population Density (per square mile), 2010",Urban-or-Rural-USDA-Classification
0,0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139.0,1100,1001.0,Assigned,55139001100,55139.0,WI,Winnebago County,55139001100.0,1.0,1.0,4215,1.0,4044.80,Urban
1,1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025.0,1603,2000.0,Assigned,55025001603,55025.0,WI,Dane County,55025001603.0,1.0,1.0,3414,0.1,50310.20,Urban
2,2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073.0,900,2013.0,Assigned,55073000900,55073.0,WI,Marathon County,55073000900.0,1.0,1.0,2240,1.5,1486.20,Urban
3,3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,WI,Rock County,55105002500.0,1.0,1.0,2595,0.6,4182.70,Urban
4,4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,WI,Rock County,55105002500.0,1.0,1.0,2595,0.6,4182.70,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22521,22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105.0,1700,3016.0,Assigned,55105001700,55105.0,WI,Rock County,55105001700.0,1.0,1.0,5385,1.2,4676.30,Urban
22522,22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135.0,100700,3123.0,Assigned,55135100700,55135.0,WI,Waupaca County,55135100700.0,10.0,10.0,3359,47.2,71.1,Rural
22523,22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087.0,10900,2004.0,Assigned,55087010900,55087.0,WI,Outagamie County,55087010900.0,1.0,1.0,2134,0.6,3292.50,Urban
22524,22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139.0,2300,2023.0,Assigned,55139002300,55139.0,WI,Winnebago County,55139002300.0,2.0,2.0,5288,62.0,85.3,Urban


In [11]:
# Here we read in the file from the CENSUS classification
import pandas
census = pandas.read_csv('NHGIS-CensusTract-Data-WI.csv',dtype=object)

census

Unnamed: 0,GISJOIN,YEAR,GEOID,GEOCODE,STATE,COUNTY,COUNTYA,TRACTA,U7I001,U7I002,U7I003,UrbanThreshold,RuralThreshold
0,G5500010950100,2020,1400000US55001950100,55001950100,Wisconsin,Adams County,1,950100,3242,0,3242,0,100
1,G5500010950201,2020,1400000US55001950201,55001950201,Wisconsin,Adams County,1,950201,1386,0,1386,0,100
2,G5500010950203,2020,1400000US55001950203,55001950203,Wisconsin,Adams County,1,950203,935,0,935,0,100
3,G5500010950204,2020,1400000US55001950204,55001950204,Wisconsin,Adams County,1,950204,1646,0,1646,0,100
4,G5500010950400,2020,1400000US55001950400,55001950400,Wisconsin,Adams County,1,950400,4491,0,4491,0,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1537,G5501410011300,2020,1400000US55141011300,55141011300,Wisconsin,Wood County,141,11300,4477,4441,36,99.1958901,0.804109895
1538,G5501410011400,2020,1400000US55141011400,55141011400,Wisconsin,Wood County,141,11400,5754,5557,197,96.57629475,3.423705249
1539,G5501410011500,2020,1400000US55141011500,55141011500,Wisconsin,Wood County,141,11500,5975,4754,1221,79.56485356,20.43514644
1540,G5501410011600,2020,1400000US55141011600,55141011600,Wisconsin,Wood County,141,11600,5037,502,4535,9.966249752,90.03375025


In [12]:
usda_merged_df = df

usda_merged_df

Unnamed: 0.1,Unnamed: 0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned-or-Unassigned-USDA-Classification,State-County-TractFIPSCode,State-County FIPS Code,Select State,Select County,Total-FIPS-Code,Primary RUCA Code 2010,"Secondary RUCA Code, 2010 (see errata)","Tract Population, 2010","Land Area (square miles), 2010","Population Density (per square mile), 2010",Urban-or-Rural-USDA-Classification
0,0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139.0,1100,1001.0,Assigned,55139001100,55139.0,WI,Winnebago County,55139001100.0,1.0,1.0,4215,1.0,4044.80,Urban
1,1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025.0,1603,2000.0,Assigned,55025001603,55025.0,WI,Dane County,55025001603.0,1.0,1.0,3414,0.1,50310.20,Urban
2,2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073.0,900,2013.0,Assigned,55073000900,55073.0,WI,Marathon County,55073000900.0,1.0,1.0,2240,1.5,1486.20,Urban
3,3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,WI,Rock County,55105002500.0,1.0,1.0,2595,0.6,4182.70,Urban
4,4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,WI,Rock County,55105002500.0,1.0,1.0,2595,0.6,4182.70,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22521,22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105.0,1700,3016.0,Assigned,55105001700,55105.0,WI,Rock County,55105001700.0,1.0,1.0,5385,1.2,4676.30,Urban
22522,22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135.0,100700,3123.0,Assigned,55135100700,55135.0,WI,Waupaca County,55135100700.0,10.0,10.0,3359,47.2,71.1,Rural
22523,22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087.0,10900,2004.0,Assigned,55087010900,55087.0,WI,Outagamie County,55087010900.0,1.0,1.0,2134,0.6,3292.50,Urban
22524,22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139.0,2300,2023.0,Assigned,55139002300,55139.0,WI,Winnebago County,55139002300.0,2.0,2.0,5288,62.0,85.3,Urban


In [13]:
df['State-County-TractFIPSCode'] = df['State-County-TractFIPSCode'].astype(str)

# Merge based on different column names
merged_df = pandas.merge(df, census, left_on='State-County-TractFIPSCode', right_on='GEOCODE', how='left')

#This prevents Unassigned values from being dropped
merged_df['GEOCODE'].fillna(value='Unassigned', inplace=True)

#This removes NaN values
merged_df.replace(np.nan, '', inplace=True)

merged_df

Unnamed: 0.1,Unnamed: 0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned-or-Unassigned-USDA-Classification,State-County-TractFIPSCode,State-County FIPS Code,Select State,...,GEOCODE,STATE,COUNTY,COUNTYA,TRACTA,U7I001,U7I002,U7I003,UrbanThreshold,RuralThreshold
0,0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139.0,1100,1001.0,Assigned,55139001100,55139.0,WI,...,55139001100,Wisconsin,Winnebago County,139,1100,4255,4255,0,100,0
1,1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025.0,1603,2000.0,Assigned,55025001603,55025.0,WI,...,55025001603,Wisconsin,Dane County,25,1603,6733,6733,0,100,0
2,2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073.0,900,2013.0,Assigned,55073000900,55073.0,WI,...,55073000900,Wisconsin,Marathon County,73,900,2189,2189,0,100,0
3,3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,WI,...,55105002500,Wisconsin,Rock County,105,2500,2646,2646,0,100,0
4,4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,WI,...,55105002500,Wisconsin,Rock County,105,2500,2646,2646,0,100,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22521,22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105.0,1700,3016.0,Assigned,55105001700,55105.0,WI,...,55105001700,Wisconsin,Rock County,105,1700,5096,5096,0,100,0
22522,22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135.0,100700,3123.0,Assigned,55135100700,55135.0,WI,...,55135100700,Wisconsin,Waupaca County,135,100700,3375,144,3231,4.266666667,95.73333333
22523,22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087.0,10900,2004.0,Assigned,55087010900,55087.0,WI,...,55087010900,Wisconsin,Outagamie County,87,10900,2240,2240,0,100,0
22524,22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139.0,2300,2023.0,Assigned,55139002300,55139.0,WI,...,55139002300,Wisconsin,Winnebago County,139,2300,5691,241,5450,4.234756633,95.76524337


In [20]:
merged_df.to_csv('WI-Combined-Assigned-Metadata.csv')

In [27]:
# This creates a new column that vaugly categorizes urban or rural based on USDA data
# Function to determine Urban or Rural classification

file_path = 'WI-Combined-Assigned-Metadata.csv'
df = pandas.read_csv(file_path)

df['NHGIS_Urban_or_Rural'] = ''

# Define conditions and update the 'Urban_or_Rural' column
df.loc[(df['UrbanThreshold'] > 50) & (df['RuralThreshold'] < 50), 'NHGIS_Urban_or_Rural'] = 'Urban'
df.loc[(df['RuralThreshold'] > 50) & (df['UrbanThreshold'] < 50), 'NHGIS_Urban_or_Rural'] = 'Rural'

#This removes NaN values
df = df.replace(np.nan, '')

# Save the modified DataFrame back to the CSV file
df.to_csv(file_path, index=False)

df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned-or-Unassigned-USDA-Classification,State-County-TractFIPSCode,State-County FIPS Code,...,STATE,COUNTY,COUNTYA,TRACTA,U7I001,U7I002,U7I003,UrbanThreshold,RuralThreshold,NHGIS_Urban_or_Rural
0,0,0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139.0,1100,1001.0,Assigned,55139001100,55139.0,...,Wisconsin,Winnebago County,139.0,1100.0,4255.0,4255.0,0.0,100.0,0.0,Urban
1,1,1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025.0,1603,2000.0,Assigned,55025001603,55025.0,...,Wisconsin,Dane County,25.0,1603.0,6733.0,6733.0,0.0,100.0,0.0,Urban
2,2,2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073.0,900,2013.0,Assigned,55073000900,55073.0,...,Wisconsin,Marathon County,73.0,900.0,2189.0,2189.0,0.0,100.0,0.0,Urban
3,3,3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,...,Wisconsin,Rock County,105.0,2500.0,2646.0,2646.0,0.0,100.0,0.0,Urban
4,4,4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,...,Wisconsin,Rock County,105.0,2500.0,2646.0,2646.0,0.0,100.0,0.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22521,22521,22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105.0,1700,3016.0,Assigned,55105001700,55105.0,...,Wisconsin,Rock County,105.0,1700.0,5096.0,5096.0,0.0,100.0,0.0,Urban
22522,22522,22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135.0,100700,3123.0,Assigned,55135100700,55135.0,...,Wisconsin,Waupaca County,135.0,100700.0,3375.0,144.0,3231.0,4.266667,95.733333,Rural
22523,22523,22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087.0,10900,2004.0,Assigned,55087010900,55087.0,...,Wisconsin,Outagamie County,87.0,10900.0,2240.0,2240.0,0.0,100.0,0.0,Urban
22524,22524,22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139.0,2300,2023.0,Assigned,55139002300,55139.0,...,Wisconsin,Winnebago County,139.0,2300.0,5691.0,241.0,5450.0,4.234757,95.765243,Rural


In [37]:
# Finally, read in the Latitude and Longitude data for Wisconsin for further centroid analysis
wisconsin = pandas.read_csv('Wisconsin-Lat-Long.csv',dtype=object)

wisconsin

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,GEOIDFQ,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON
0,55,001,950204,55001950204,1400000US55001950204,9502.04,Census Tract 9502.04,G5020,S,353539615,49159328,+44.0462804,-089.6886609
1,55,001,950203,55001950203,1400000US55001950203,9502.03,Census Tract 9502.03,G5020,S,114351980,1368834,+44.1209611,-089.7278541
2,55,009,940002,55009940002,1400000US55009940002,9400.02,Census Tract 9400.02,G5020,S,11531694,0,+44.5157530,-088.1259731
3,55,117,010504,55117010504,1400000US55117010504,105.04,Census Tract 105.04,G5020,S,38350718,219284,+43.7519968,-088.0088993
4,55,117,011302,55117011302,1400000US55117011302,113.02,Census Tract 113.02,G5020,S,77506338,591219,+43.5799760,-087.8668431
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1537,55,043,960700,55043960700,1400000US55043960700,9607,Census Tract 9607,G5020,S,22255352,0,+42.8525133,-090.7126170
1538,55,043,960800,55043960800,1400000US55043960800,9608,Census Tract 9608,G5020,S,253185855,15987542,+42.7117441,-090.6232729
1539,55,043,960100,55043960100,1400000US55043960100,9601,Census Tract 9601,G5020,S,364751533,3687082,+43.0851304,-090.5338688
1540,55,043,960500,55043960500,1400000US55043960500,9605,Census Tract 9605,G5020,S,569067967,31187513,+42.8066244,-090.8677346


In [38]:
# Convert the 'Total-FIPS-Code' column to integer type
df['State_County_TractFIPSCode'] = df['State_County_TractFIPSCode'].astype('int64')

# Convert the 'GEOID' column to integer type in the 'wisconsin' DataFrame
wisconsin['GEOID'] = wisconsin['GEOID'].astype('int64')

# Merge based on the corrected column names and data types
merged_lat_long_df = pandas.merge(df, wisconsin, left_on='State_County_TractFIPSCode', right_on='GEOID', how='left')

#This removes NaN values
merged_lat_long_df.replace(np.nan, '', inplace=True)

merged_lat_long_df

Unnamed: 0,Unnamed__0_1,Unnamed__0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned_or_Unassigned_USDA_Classification,State_County_TractFIPSCode,State_County_FIPS_Code,...,GEOID_y,GEOIDFQ,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON
0,0,0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139.0,1100,1001.0,Assigned,55139001100,55139.0,...,55139001100.0,1400000US55139001100,11,Census Tract 11,G5020,S,2698989,48926,+44.0197668,-088.5710558
1,1,1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025.0,1603,2000.0,Assigned,55025001603,55025.0,...,55025001603.0,1400000US55025001603,16.03,Census Tract 16.03,G5020,S,175758,0,+43.0734025,-089.3954294
2,2,2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073.0,900,2013.0,Assigned,55073000900,55073.0,...,55073000900.0,1400000US55073000900,9,Census Tract 9,G5020,S,4526091,2594147,+44.9133476,-089.6214783
3,3,3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,...,55105002500.0,1400000US55105002500,25,Census Tract 25,G5020,S,1609946,0,+42.5370831,-089.0159276
4,4,4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,...,55105002500.0,1400000US55105002500,25,Census Tract 25,G5020,S,1609946,0,+42.5370831,-089.0159276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22521,22521,22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105.0,1700,3016.0,Assigned,55105001700,55105.0,...,55105001700.0,1400000US55105001700,17,Census Tract 17,G5020,S,2982645,270209,+42.5152763,-089.0450969
22522,22522,22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135.0,100700,3123.0,Assigned,55135100700,55135.0,...,55135100700.0,1400000US55135100700,1007,Census Tract 1007,G5020,S,122289940,8429487,+44.3340999,-088.9769857
22523,22523,22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087.0,10900,2004.0,Assigned,55087010900,55087.0,...,55087010900.0,1400000US55087010900,109,Census Tract 109,G5020,S,1737526,193285,+44.2499628,-088.4330973
22524,22524,22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139.0,2300,2023.0,Assigned,55139002300,55139.0,...,55139002300.0,1400000US55139002300,23,Census Tract 23,G5020,S,160560110,6858028,+44.1664512,-088.5737102


In [40]:
# Further clean up column headers, removing spaces and other punctuation 
merged_lat_long_df.columns = merged_lat_long_df.columns.str.replace(r'[\(\)]', '').str.replace(r'[\s\W]', '_', regex=True)
# Further clean up column headers, removing spaces and other punctuation 
merged_lat_long_df = merged_lat_long_df.rename(columns={'Secondary RUCA Code, 2010 (see errata)': 'Secondary RUCA Code 2010', 
                        'Land Area (square miles), 2010' : 'Land Area sq miles 2010', 
                        'Population Density (per square mile), 2010' : 'Pop Density per sq mile 2010'
                       })
merged_lat_long_df.columns = merged_lat_long_df.columns.str.replace(r'[\(\)]', '').str.replace(r'[\s\W]', '_', regex=True)

merged_lat_long_df

Unnamed: 0,Unnamed__0_1,Unnamed__0,GISAID_Name,ZipCode,CountyFIPS,CensusTract,CensusBlock,Assigned_or_Unassigned_USDA_Classification,State_County_TractFIPSCode,State_County_FIPS_Code,...,GEOID_y,GEOIDFQ,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON
0,0,0,hCoV-19/USA/WI-CDC-MMB09489547/2021,54902,55139.0,1100,1001.0,Assigned,55139001100,55139.0,...,55139001100.0,1400000US55139001100,11,Census Tract 11,G5020,S,2698989,48926,+44.0197668,-088.5710558
1,1,1,hCoV-19/USA/WI-CDC-MMB09621961/2021,53703,55025.0,1603,2000.0,Assigned,55025001603,55025.0,...,55025001603.0,1400000US55025001603,16.03,Census Tract 16.03,G5020,S,175758,0,+43.0734025,-089.3954294
2,2,2,hCoV-19/USA/WI-CDC-MMB11606910/2021,54476,55073.0,900,2013.0,Assigned,55073000900,55073.0,...,55073000900.0,1400000US55073000900,9,Census Tract 9,G5020,S,4526091,2594147,+44.9133476,-089.6214783
3,3,3,hCoV-19/USA/WI-CDC-MMB11840790/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,...,55105002500.0,1400000US55105002500,25,Census Tract 25,G5020,S,1609946,0,+42.5370831,-089.0159276
4,4,4,hCoV-19/USA/WI-CDC-MMB11841081/2021,53511,55105.0,2500,2002.0,Assigned,55105002500,55105.0,...,55105002500.0,1400000US55105002500,25,Census Tract 25,G5020,S,1609946,0,+42.5370831,-089.0159276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22521,22521,22521,hCoV-19/USA/WI-WSLH-222727/2021,53511,55105.0,1700,3016.0,Assigned,55105001700,55105.0,...,55105001700.0,1400000US55105001700,17,Census Tract 17,G5020,S,2982645,270209,+42.5152763,-089.0450969
22522,22522,22522,hCoV-19/USA/WI-WSLH-222728/2021,54983,55135.0,100700,3123.0,Assigned,55135100700,55135.0,...,55135100700.0,1400000US55135100700,1007,Census Tract 1007,G5020,S,122289940,8429487,+44.3340999,-088.9769857
22523,22523,22523,hCoV-19/USA/WI-WSLH-222729/2021,54914,55087.0,10900,2004.0,Assigned,55087010900,55087.0,...,55087010900.0,1400000US55087010900,109,Census Tract 109,G5020,S,1737526,193285,+44.2499628,-088.4330973
22524,22524,22524,hCoV-19/USA/WI-WSLH-222730/2021,54956,55139.0,2300,2023.0,Assigned,55139002300,55139.0,...,55139002300.0,1400000US55139002300,23,Census Tract 23,G5020,S,160560110,6858028,+44.1664512,-088.5737102


In [41]:
merged_lat_long_df.to_csv('WI-Assigned-Metadata-Combined-Lat-Long.csv')

In [44]:
# This cleans out unnecessary columns that are no longer needed in the file
csv_file_path = 'WI-Assigned-Metadata-Combined-Lat-Long.csv'

df = pandas.read_csv(csv_file_path)

columns_to_drop = [
    "Unnamed__0_1", "Unnamed__0", "State_County_TractFIPSCode", 
    "State_County_FIPS_Code", "Tract_Population__2010", 
    "Land_Area__square_miles___2010", 
    "Population_Density__per_square_mile___2010", "GEOID_x", 
    "COUNTYA", "TRACTA", "U7I001", "U7I002", "STATEFP", 
    "COUNTYFP", "TRACTCE", "GEOID_y", "NAME", "NAMELSAD", 
    "MTFCC", "FUNCSTAT", "ALAND", "AWATER"
]

df_cleaned = df.drop(columns=columns_to_drop)

output_file_path = 'WI-Assigned-Metadata-Cleaned.csv'

df_cleaned.to_csv(output_file_path, index=False)

print(f"Cleaned CSV saved as: {output_file_path}")

Cleaned CSV saved as: WI-Assigned-Metadata-Cleaned.csv
