# Counting Sequences From Metadata and Grouping By Urban and Rural, High and Low Vax Counties

### July 22, 2023

This script uses the structure from the "m1m2_urb_rur_metadata_seqcounts.ipynb" in the mw-phylo-analysis-scripts directory to group counties into urban and rural, high and low vaccination categories and count sequence availability for these groups from the input file. The input file can be a tsv or csv from GISAID or Genbank and ideally has been processed using the "clean_wicounties_metadata_tsv_2023_05_03.ipynb" script in the MWPhylo_Cleaning_for_NextstrainPipeline directory. The script works by creating lists of counties defined by the different methods and vaccination groups and using a function to extract county, date, and sequence counts from the file to a dictionary. Within the dictionary, the counties and date (3 letter month abbreviation and 4 digit year) are the keys with the sequences counts for the county and date as the values. 


In [59]:
import pandas as pd
import csv
import calendar
import datetime 

In [60]:
# Define the "method 1" or "m1" urban and rural WI counties as 2013 NCHSUR Codes 1-4 being Metro or "urban"
# and 5-6 being Nonmetro or "rural"
m1_urban = ['Brown', 'Calumet', 'Chippewa', 'Columbia', 'Dane', 'Douglas', 'Eau Claire', 'Fond du Lac', 
            'Green', 'Iowa', 'Kenosha', 'Kewaunee', 'La Crosse', 'Marathon', 'Milwaukee', 'Oconto', 
            'Outagamie', 'Ozaukee', 'Pierce', 'Racine', 'Rock', 'St. Croix', 'Sheboygan', 'Washington', 
            'Waukesha', 'Winnebago']

m1_rural = ['Adams', 'Ashland', 'Barron', 'Bayfield', 'Buffalo', 'Burnett', 'Clark', 'Crawford', 
            'Dodge', 'Door', 'Dunn', 'Florence', 'Forest', 'Grant', 'Green Lake', 'Iron', 
            'Jackson', 'Jefferson', 'Juneau', 'Lafayette', 'Langlade', 'Lincoln', 'Manitowoc', 
            'Marinette', 'Marquette', 'Menominee', 'Monroe', 'Oneida', 'Pepin', 'Polk', 'Portage', 
            'Price', 'Richland', 'Rusk', 'Sauk', 'Sawyer', 'Shawano', 'Taylor', 'Trempealeau', 
            'Vernon', 'Vilas', 'Walworth', 'Washburn', 'Waupaca', 'Waushara', 'Wood']

# Define the "method 2" or "m2" urban and rural WI counties as 2013 NCHSUR Codes 1-3 being Metro or "urban"
# and 4-6 being Nonmetro or "rural"


m2_urban= ['Brown',
'Columbia',
'Dane',
'Green',
'Iowa',
'Kenosha',
'Milwaukee',
'Ozaukee',
'Pierce',
'St. Croix', 
'Washington',
'Waukesha',
'Douglas',
'Kewaunee',
'Oconto']

m2_rural = ["Calumet", "Chippewa", "Eau Claire", "Fond du Lac", "La Crosse", 
            "Marathon", "Outagamie", "Racine", "Rock", "Sheboygan", "Winnebago", 
            "Dodge", "Dunn", "Florence", "Grant", "Jefferson", "Lincoln", 
            "Manitowoc", "Marinette", "Menominee", "Portage", "Sauk", "Shawano", 
            "Walworth", "Wood", "Adams", "Ashland", "Barron", "Bayfield", "Buffalo", 
            "Burnett", "Clark", "Crawford", "Door", "Forest", "Green Lake", "Iron", 
            "Jackson", "Juneau", "Lafayette", "Langlade", "Marquette", "Monroe", 
            "Oneida", "Pepin", "Polk", "Price", 
            "Richland", "Rusk", "Sawyer", "Taylor", "Trempealeau", "Vernon", 
            "Vilas", "Washburn", "Waupaca", "Waushara"]

######
# Assigning counties to method 1 urban and rural, high and low vaccination categories
######

# METHOD 1
m1rural_high = ["Ashland",
"Bayfield",
"Crawford",
"Door",
"Forest",
"Iron",
"Lafayette", 
"Manitowoc",
"Menominee", 
"Oneida",
"Portage",
"Price",
"Richland",
"Sauk",
"Trempealeau", 
"Vilas",
"Washburn",
"Wood"]

m1rural_low = ["Adams", "Barron", 
"Buffalo",
"Burnett",
"Clark",
"Dodge", 
"Dunn",
"Florence",
"Grant",
"Green Lake",
"Jackson",
"Juneau",
"Langlade",
"Lincoln",
"Marinette",
"Marquette", 
"Monroe",
"Pepin",
"Polk",
"Rusk",
"Sawyer",
"Shawano",
"Taylor",
"Vernon",
"Walworth",
"Waupaca",
"Waushara"]

# Assigning counties to Urban High-Low Cateogries 
m1urban_high = ['Brown',
'Columbia',
'Dane',
'Eau Claire',
'Green',
'Iowa',
'Kenosha',
'La Crosse',
'Marathon',
'Milwaukee',
'Outagamie',
'Ozaukee',
'Racine',
'Rock',
'Sheboygan',
'Waukesha',
'Winnebago']

m1urban_low = ['Calumet',
'Chippewa',
'Douglas',
'Fond du Lac',
'Kewaunee',
'Oconto',
'Pierce',
'St. Croix']

mid_vax = ['Washington', "Jefferson"]


######
# Assigning counties to method 2 urban and rural, high and low vaccination categories
######

# METHOD 2

# Assigning counties to Rural High-Low categories
m2rural_high = ["Ashland",
"Bayfield",
"Crawford",
"Door",
'Eau Claire',
"Forest",
"Iron",
'La Crosse',
"Lafayette", 
'Marathon',
"Manitowoc",
"Menominee", 
"Oneida",
'Outagamie',
"Portage",
"Price",
"Richland", 'Racine',
'Rock',
"Sauk",
'Sheboygan',
"Trempealeau", 
"Vilas",
"Washburn",
'Winnebago',
"Wood", 'ashland',
'bayfield',
'crawford',
'door',
'eau claire',
'forest',
'iron',
'la crosse',
'lafayette',
'marathon',
'manitowoc',
'menominee',
'oneida',
'outagamie',
'portage',
'price', 'racine',
'richland',
'rock',
'sauk',
'sheboygan',
'trempealeau',
'vilas',
'washburn',
'winnebago',
'wood']

m2rural_low = ["Adams", "Barron", 
"Buffalo",
"Burnett",
'Calumet',
'Chippewa',
"Clark",
"Dodge", 
"Dunn",
"Florence",
'Fond du Lac',
"Grant",
"Green Lake",
"Jackson", 
"Juneau",
"Langlade",
"Lincoln",
"Marinette",
"Marquette", 
"Monroe",
"Pepin",
"Polk",
"Rusk",
"Sawyer",
"Shawano",
"Taylor",
"Vernon",
"Walworth",
"Waupaca",
"Waushara", 'adams',
'barron',
'buffalo',
'burnett',
'calumet',
'chippewa',
'clark',
'dodge',
'dunn',
'florence',
'fond du lac',
'grant',
'green lake',
'jackson',
'juneau',
'langlade',
'lincoln',
'marinette',
'marquette',
'monroe',
'pepin',
'polk',
'rusk',
'sawyer',
'shawano',
'taylor',
'vernon',
'walworth',
'waupaca',
'waushara']

# Assigning counties to Urban High-Low Cateogries 
m2urban_high = ['Brown',
'Columbia',
'Dane',
'Green',
'Iowa',
'Kenosha',
'Milwaukee',
'Ozaukee',
'Waukesha']

m2urban_low = ['Douglas',
'Kewaunee',
'Oconto',
'Pierce',
'St. Croix', 
'douglas',
'kewaunee',
'oconto',
'pierce',
'st.croix']


In [61]:
# print the lists and get the number of counties:
#print(', '.join(m2_urban))
#print(len(m2_urban))
    

In [62]:
def seq_input_file(input_file):
    output_dict = {}
    with open(input_file, "r") as infile:
        delimiter = ',' if input_file.endswith('.csv') else '\t'#give the option to use csv or tsv

        tsv_reader = csv.reader(infile, delimiter=delimiter)
        
        for line in tsv_reader:
        # update the column indices based on the input file type
            if input_file.endswith('.csv'):
                collection_date = line[8]
                location = line[6]
            else:
                collection_date = line[4]
                location = line[8]
                

            if len(location.split("/")) == 0:
                county = "unknown"
            else:
                counties = location.split(" County")
                county = counties[0].strip()  # Extract the first element from the list
               # county = county.replace(" ", "_") #+ "_County"  # Format county name with spaces

                date_parts = collection_date.split("-")

                if len(date_parts) >= 2:
                    year = int(date_parts[0])
                    month = int(date_parts[1])
                    month_abbr = calendar.month_abbr[month]

                    year_month = month_abbr + '-' + str(year)

                    if '0 County' in location:
                        pass
                    if county in m1_urban or county in m1_rural or county in m2_urban or county in m2_rural:
                        if county not in output_dict:
                            output_dict[county] = {}
                        if year_month not in output_dict[county]:
                            output_dict[county][year_month] = 1
                        else:
                            output_dict[county][year_month] += 1

    return output_dict

# Example usage:
# input_file = '/Users/mavoeg/computational_folder/gh_folder/ncov/data/clean_ready_for_nextstrain/nextstrain_may_2023/06_01_2023urbrur_hilomid.tsv' # GISAID dataset
# output_data = process_input_file(input_file)
# print(output_data)


In [63]:
#input_file = '/Users/mavoeg/computational_folder/gh_folder/ncov/data/clean_ready_for_nextstrain/nextstrain_may_2023/06_01_2023urbrur_hilomid.tsv' # Gisaid dataset
#input_file = '/Users/mavoeg/Desktop/SARS/Wisconsin/WI_Data_Counties/genbank/2023-06-30-CDC_Contract_Seq_NoNAs.csv' # CDC dataset

output_data = seq_input_file(input_file)
print(output_data)

{'Adams': {'Jun-2020': 2, 'Mar-2020': 1, 'Aug-2020': 2, 'Sep-2020': 1, 'Jan-2021': 3, 'Dec-2020': 3, 'Feb-2021': 3, 'Oct-2020': 4, 'Nov-2020': 2, 'Mar-2021': 8, 'Apr-2021': 10, 'May-2021': 5, 'Jul-2021': 4, 'Aug-2021': 10, 'Sep-2021': 12, 'Oct-2021': 4, 'Nov-2021': 3, 'Dec-2021': 8, 'Mar-2022': 2, 'May-2022': 3, 'Jun-2022': 1, 'Jul-2022': 1, 'Aug-2022': 1}, 'Ashland': {'Oct-2020': 6, 'Sep-2020': 6, 'Jun-2020': 1, 'Dec-2020': 4, 'Feb-2021': 3, 'Nov-2020': 1, 'Apr-2021': 3, 'Jul-2021': 1, 'Aug-2021': 2, 'Sep-2021': 4, 'Oct-2021': 6, 'Nov-2021': 1, 'Dec-2021': 1, 'Jan-2022': 1, 'Mar-2022': 1, 'May-2022': 1, 'Jun-2022': 9, 'Jul-2022': 5, 'Aug-2022': 3, 'Oct-2022': 6, 'Sep-2022': 1, 'Jan-2023': 1, 'Dec-2022': 2, 'Nov-2022': 3}, 'Barron': {'Sep-2020': 8, 'Oct-2020': 5, 'Jul-2020': 6, 'May-2020': 1, 'Jun-2020': 2, 'Dec-2020': 10, 'Jan-2021': 3, 'Mar-2021': 22, 'Apr-2021': 16, 'May-2021': 2, 'Jun-2021': 2, 'Jul-2021': 12, 'Aug-2021': 46, 'Sep-2021': 37, 'Oct-2021': 16, 'Nov-2021': 7, 'Dec-2021

In [65]:
seq_df = pd.DataFrame.from_dict(output_data, orient="index") # make a df from the dictionary
seq_df = seq_df.sort_index().sort_index(axis=1)  # Sort the index (county) and column names 


# Fill NaN values with 0
seq_df = seq_df.fillna(0)
seq_df = seq_df.rename(index={'Saint_Croix': 'St.Croix', 'Green_Lake': 'Green Lake', 'Fond_du_Lac': 'Fond du Lac', 'Eau_Claire': 'Eau Claire', 'La_Crosse ': 'La Crosse'})



# Add columns m1_urb_rur and m2_urb_rur "urban", "rural" by the new urban rural definition using the lists above
seq_df['m1_urbrur'] = seq_df.index.map(lambda county: 'urban' if county in m1_urban else 'rural')

seq_df['m2_urbrur'] = seq_df.index.map(lambda county: 'urban' if county in m2_urban else 'rural')

print(seq_df)



           Apr-2020  Apr-2021  Apr-2022  Aug-2020  Aug-2021  Aug-2022  \
Adams           0.0      10.0       0.0       2.0      10.0       1.0   
Ashland         0.0       3.0       0.0       0.0       2.0       3.0   
Barron          0.0      16.0      14.0       0.0      46.0       3.0   
Bayfield        0.0       3.0       2.0       0.0       3.0       0.0   
Brown          93.0     330.0       1.0       3.0      64.0       5.0   
...             ...       ...       ...       ...       ...       ...   
Waukesha        9.0      46.0      68.0       1.0     105.0     104.0   
Waupaca         0.0       1.0       1.0       0.0       5.0       3.0   
Waushara        0.0       0.0       1.0       0.0       3.0       0.0   
Winnebago       4.0      42.0      12.0       0.0      16.0      23.0   
Wood            0.0       9.0      63.0       0.0      47.0       8.0   

           Dec-2020  Dec-2021  Dec-2022  Feb-2021  ...  Nov-2021  Nov-2022  \
Adams           3.0       8.0       0.0      

In [66]:
seq_df = pd.DataFrame.from_dict(output_dict, orient="index") # make a df from the dictionary
seq_df = seq_df.sort_index().sort_index(axis=1)  # Sort the index (county) and column names 


# Fill NaN values with 0
seq_df = seq_df.fillna(0)
seq_df = seq_df.rename(index={'Saint Croix': 'St.Croix', 'Green_Lake': 'Green Lake', 'Fond_du_Lac': 'Fond du Lac', 'Eau_Claire': 'Eau Claire', 'La_Crosse ': 'La Crosse'})



# Add columns m1_urb_rur and m2_urb_rur "urban", "rural" by the new urban rural definition using the lists above
seq_df['m1_urbrur'] = seq_df.index.map(lambda county: 'urban' if county in m1_urban else 'rural')

seq_df['m2_urbrur'] = seq_df.index.map(lambda county: 'urban' if county in m2_urban else 'rural')

print(seq_df)


NameError: name 'output_dict' is not defined

In [67]:
# make the m1 and m2 county dictionaries for urban and rural, high and low vaccination groups

m1county_category_dict = {
    **{county: "rural high" for county in m1rural_high}, #** is dictionary unpacking, merging multiple dictionaries into 1
    **{county: "rural low" for county in m1rural_low},
    **{county: "urban high" for county in m1urban_high},
    **{county: "urban low" for county in m1urban_low}
}


m2county_category_dict = {
    **{county: "rural high" for county in m2rural_high},
    **{county: "rural low" for county in m2rural_low},
    **{county: "urban high" for county in m2urban_high},
    **{county: "urban low" for county in m2urban_low}
}

print(m2county_category_dict)



{'Ashland': 'rural high', 'Bayfield': 'rural high', 'Crawford': 'rural high', 'Door': 'rural high', 'Eau Claire': 'rural high', 'Forest': 'rural high', 'Iron': 'rural high', 'La Crosse': 'rural high', 'Lafayette': 'rural high', 'Marathon': 'rural high', 'Manitowoc': 'rural high', 'Menominee': 'rural high', 'Oneida': 'rural high', 'Outagamie': 'rural high', 'Portage': 'rural high', 'Price': 'rural high', 'Richland': 'rural high', 'Racine': 'rural high', 'Rock': 'rural high', 'Sauk': 'rural high', 'Sheboygan': 'rural high', 'Trempealeau': 'rural high', 'Vilas': 'rural high', 'Washburn': 'rural high', 'Winnebago': 'rural high', 'Wood': 'rural high', 'ashland': 'rural high', 'bayfield': 'rural high', 'crawford': 'rural high', 'door': 'rural high', 'eau claire': 'rural high', 'forest': 'rural high', 'iron': 'rural high', 'la crosse': 'rural high', 'lafayette': 'rural high', 'marathon': 'rural high', 'manitowoc': 'rural high', 'menominee': 'rural high', 'oneida': 'rural high', 'outagamie': '

In [68]:
# make a column of the m1 urb and rur, high and low vax categories
seq_df['m1_urbrur_hilo'] = seq_df.index.map(m1county_category_dict)
seq_df['m2_urbrur_hilo'] = seq_df.index.map(m2county_category_dict)

# Uncomment to save the dataframe to an output file
#seq_output = seq_df.to_csv("/Users/mavoeg/Desktop/counts-gis.csv")


In [69]:
# add sequences grouped by the columns "m1_urbrur", "m2_urbrur", and the corresponding date
m1sum = seq_df.groupby(['m1_urbrur_hilo']).sum()
m2sum = seq_df.groupby(['m2_urbrur_hilo']).sum()

#transpose the columns and rows so that the date is the index column
m1sum_by_group = m1sum.transpose()
m2sum_by_group = m2sum.transpose()

# Rename the index name to "date"
m1sum_by_group = m1sum_by_group.rename_axis("date")
m2sum_by_group = m2sum_by_group.rename_axis("date")

m1sum_by_group = m1sum_by_group.rename(columns={"urban high": "m1_urb_hi", "rural high": "m1_rur_hi", "urban low": "m1_urb_low", "rural low": "m1_rur_low"})
m2sum_by_group = m2sum_by_group.rename(columns={"urban high": "m2_urb_hi", "rural high": "m2_rur_hi", "urban low": "m2_urb_low", "rural low": "m2_rur_low"})

# Combine m1sum_by_group and m2sum_by_group DataFrames
m1m2_vaxcatgroups = pd.concat([m1sum_by_group, m2sum_by_group], axis=1)

  m1sum = seq_df.groupby(['m1_urbrur_hilo']).sum()
  m2sum = seq_df.groupby(['m2_urbrur_hilo']).sum()


In [None]:

# Uncomment to export the combined DataFrame to a CSV file
#output_csv = "/Users/mavoeg/Desktop/SARS/Wisconsin/WI_Data_Counties/VaxData/Vax_frompy/2023_07_23_m1m2-CDCseqs-urbrur-hilo.csv"
#m1m2_vaxcatgroups.to_csv(output_csv)