In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
import plotly as py
from plotly import graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter

In [2]:
# Read the merged DHS/GISAID CSV and print a per-column summary
# (non-null counts and dtypes).
merged_csv_path = '2024-01-29_dhs_gisaid_merge_LD.csv'
data = pd.read_csv(merged_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22643 entries, 0 to 22642
Data columns (total 68 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       22643 non-null  int64  
 1   GISAID_Name                      22643 non-null  object 
 2   SampleID                         22643 non-null  object 
 3   WEDSS_DOC                        22643 non-null  object 
 4   WSLH_DOC                         7757 non-null   object 
 5   MHDL_DOC                         5532 non-null   object 
 6   CDC_DOC                          455 non-null    object 
 7   AVRL_DOC                         8899 non-null   object 
 8   ClientID                         22643 non-null  int64  
 9   Age                              22643 non-null  int64  
 10  Gender.x                         22627 non-null  object 
 11  Race                             22611 non-null  object 
 12  Ethnicity         

  data = pd.read_csv('2024-01-29_dhs_gisaid_merge_LD.csv')


In [None]:
# A small number of rows (roughly 200) are duplicates: keep only the first
# row seen for each (SampleID, ClientID) pair.
dedupe_keys = ['SampleID', 'ClientID']
data = data.drop_duplicates(subset=dedupe_keys)
data.info()

In [None]:

# Parse WEDSS_DOC (used below as the sample date) and the six vaccination
# date columns. Values that cannot be parsed become NaT instead of raising.
date_cols = ['WEDSS_DOC', 'VacDate1', 'VacDate2', 'VacDate3','VacDate4','VacDate5','VacDate6']
data[date_cols] = data[date_cols].apply(pd.to_datetime, errors='coerce')

# Write the dates back into `data` as 'YYYY-MM-DD' strings
# (missing dates stay missing).
data[date_cols] = data[date_cols].apply(lambda x: x.dt.strftime('%Y-%m-%d'))

# Take an explicit copy of the date columns: the following cells overwrite
# and add columns on sub_data, and doing that on a bare column selection of
# `data` makes pandas emit SettingWithCopyWarning.
sub_data = data[date_cols].copy()
sub_data

In [None]:
# Blank out vaccination dates that fall strictly after the sample date
# (WEDSS_DOC); a dose given on the sample day itself is kept.
# WEDSS_DOC is never written inside the loop, so it is read once up front.
sample_date = sub_data['WEDSS_DOC']
for vax_col in date_cols[1:]:
    given_after_sample = sample_date < sub_data[vax_col]
    sub_data.loc[given_after_sample, vax_col] = pd.NaT

sub_data

In [None]:
# Latest remaining vaccination date of each row, stored in a new
# 'most_recent' column (NaT when the row has no vaccination date).
#
# The VacDate columns are selected by name rather than "everything after
# the first column", so re-running this cell after more columns have been
# added to sub_data still compares dates only. The values are parsed to
# datetimes first so the row-wise max is a true chronological max.
vax_dates = sub_data[date_cols[1:]].apply(pd.to_datetime, errors='coerce')
sub_data['most_recent'] = vax_dates.max(axis=1)

sub_data.head(50)

In [None]:
# Next step: number of months between the latest vaccination and the sample,
# stored in a new 'diff' column. Missing dates are handled first, then the
# year and month gaps are combined into a single month count.
# The arithmetic needs real datetimes, so parse both columns here.
for date_col in ('WEDSS_DOC', 'most_recent'):
    sub_data[date_col] = pd.to_datetime(sub_data[date_col])

def calculate_month_difference(row):
    """Return the calendar-month gap from row['most_recent'] to row['WEDSS_DOC'].

    The gap is (year difference) * 12 + (month difference); the day of the
    month is ignored, so two dates in consecutive months are always 1 apart
    no matter how many days separate them.

    Parameters
    ----------
    row : mapping supporting row['WEDSS_DOC'] and row['most_recent']
        Both values must expose ``.year`` and ``.month`` unless they are null.

    Returns
    -------
    int, or np.nan when either date is null.
    """
    sampled = row['WEDSS_DOC']
    if pd.isnull(sampled):
        return np.nan

    vaccinated = row['most_recent']
    if pd.isnull(vaccinated):
        return np.nan

    year_gap = sampled.year - vaccinated.year
    month_gap = sampled.month - vaccinated.month
    return year_gap * 12 + month_gap

# One value per row: calendar months from the latest vaccination to the sample.
months_since_last_dose = sub_data.apply(calculate_month_difference, axis=1)
sub_data['diff'] = months_since_last_dose

sub_data.head(25)

In [None]:
# Flag samples taken more than 6 months after the most recent vaccination.
# Rows where 'diff' is missing stay missing (<NA>) instead of being forced
# to False.
#
# The nullable 'boolean' dtype is used on purpose: building this column with
# np.where and an np.nan fallback coerces every value to float, so the flag
# would be stored as 1.0 / 0.0 / nan rather than True / False / missing.
months_since_dose = sub_data['diff']
sub_data['more_than_6mos'] = (
    months_since_dose.gt(6)
    .astype('boolean')
    .mask(months_since_dose.isna())
)
sub_data.head(25)

In [None]:
# Number of vaccination dates present on each row, stored in 'vax_count'.
# count() skips missing values, so dates that were blanked out earlier are
# not counted.
#
# The positions of the VacDate columns are looked up by name instead of being
# hard-coded, so the count stays correct if sub_data's column order changes.
vax_cols = [sub_data.columns.get_loc(name) for name in date_cols[1:]]
sub_data['vax_count'] = sub_data.iloc[:, vax_cols].count(axis=1)

sub_data.head(25)

In [None]:
# How many rows have each number of vaccinations
doses_per_row = sub_data['vax_count']
counts = doses_per_row.value_counts()
print(counts)

In [None]:
# True when the row has at least one counted vaccination date
sub_data['1dose_or_more'] = sub_data['vax_count'].gt(0)
sub_data.head(25)


In [None]:
# Attach the derived columns to the full dataframe. Rows are matched on the
# index labels, which sub_data is expected to still share with `data`.
derived_cols = ['most_recent', 'diff', 'more_than_6mos', 'vax_count', '1dose_or_more']
data1 = data.merge(sub_data[derived_cols], left_index=True, right_index=True)

data1.info()

In [None]:
# Drop the spec_collected_date columns (and one more) that are not needed and
# make the dataframe unwieldy.
# NOTE: the columns are picked by position (30 through 48, plus 64) and dropped
# in place, so running this cell a second time removes a different set of
# columns.
cols = [*range(30, 49), 64]
data1.drop(columns=data1.columns[cols], inplace=True)
data1.info()
data1.head(25)

In [None]:
# Read the county FIPS lookup table, left-join it onto the dataframe by
# CountyFIPS (rows without a match are kept), and export the result.
county = pd.read_csv('county_FIPS.tsv', sep='\t')
data2 = data1.merge(county, how='left', on='CountyFIPS')
data2.info()

# The index is written as the first, unnamed column of the CSV.
data2.to_csv('2024-01-30_meta_AJ-columns-added.csv')

In [None]:
# Subset for Dane County only (CountyFIPS 55025), used for the vaccination
# rate visualizations.
# .copy() makes the subset an independent dataframe, so assigning columns on
# it later does not raise SettingWithCopyWarning.
data_dane = data2.loc[data2['CountyFIPS'] == 55025].copy()
print(len(data_dane))

data_dane.info()
data_dane.head(25)

In [None]:
# Parse the sample date to datetimes and order the rows chronologically so
# vaccination rates can be followed over time.
# assign() builds a new dataframe instead of writing into a filtered slice
# of data2, which avoids SettingWithCopyWarning.
data_dane = (
    data_dane
    .assign(WEDSS_DOC=pd.to_datetime(data_dane['WEDSS_DOC']))
    .sort_values(by='WEDSS_DOC')
)


In [None]:
# Per sample date: 'sum' = rows with at least one dose, 'count' = all rows.
grouped_df = data_dane.groupby('WEDSS_DOC')['1dose_or_more'].agg(['sum', 'count'])

# Running totals up to and including each date give the cumulative
# percentage of rows with 1+ dose.
running_totals = grouped_df[['sum', 'count']].cumsum()
grouped_df['Vax_Rate_1dose'] = (running_totals['sum'] / running_totals['count'])*100

# Turn the date index back into an ordinary column
grouped_df = grouped_df.reset_index()

print(grouped_df)



In [None]:
# Line chart of the cumulative 1+ dose percentage over time (Dane County)
grouped_df['WEDSS_DOC'] = pd.to_datetime(grouped_df['WEDSS_DOC'])

fig = px.line(
    grouped_df,
    x='WEDSS_DOC',
    y='Vax_Rate_1dose',
    title='Vaccination Rate in Dane County, 1+ dose',
)

# One tick per month, labelled as abbreviated month over year
monthly_axis = dict(dtick="M1", tickformat="%b\n%Y")
fig.update_xaxes(**monthly_axis)
fig.show()

In [None]:
# Repeat for Milwaukee (MKE) County, CountyFIPS 55079.
# .copy() makes the subset an independent dataframe, so assigning columns on
# it later does not raise SettingWithCopyWarning.
data_mke = data2.loc[data2['CountyFIPS'] == 55079].copy()


In [None]:
# Parse the sample date to datetimes and order the rows chronologically.
# assign() builds a new dataframe instead of writing into a filtered slice
# of data2, which avoids SettingWithCopyWarning.
data_mke = (
    data_mke
    .assign(WEDSS_DOC=pd.to_datetime(data_mke['WEDSS_DOC']))
    .sort_values(by='WEDSS_DOC')
)


In [None]:
# Per sample date: 'sum' = rows with at least one dose, 'count' = all rows.
grouped_df2 = data_mke.groupby('WEDSS_DOC')['1dose_or_more'].agg(['sum', 'count'])

# Running totals up to and including each date give the cumulative
# percentage of rows with 1+ dose.
running_totals2 = grouped_df2[['sum', 'count']].cumsum()
grouped_df2['Vax_Rate_1dose'] = (running_totals2['sum'] / running_totals2['count'])*100

# Turn the date index back into an ordinary column
grouped_df2 = grouped_df2.reset_index()

print(grouped_df2)


In [None]:
# Line chart of the cumulative 1+ dose percentage over time (Milwaukee County)
grouped_df2['WEDSS_DOC'] = pd.to_datetime(grouped_df2['WEDSS_DOC'])

fig2 = px.line(
    grouped_df2,
    x='WEDSS_DOC',
    y='Vax_Rate_1dose',
    title='Vaccination Rate in Milwaukee County, 1+ dose',
)

# One tick per month, labelled as abbreviated month over year
monthly_axis = dict(dtick="M1", tickformat="%b\n%Y")
fig2.update_xaxes(**monthly_axis)
fig2.show()

In [None]:
# Load the CDC vaccination-rate table so it can be compared with our data.
# FIPS is forced to text because the county filters in the following cells
# compare it against the strings '55025' and '55079'; if pandas inferred an
# integer column those comparisons would silently match nothing.
wi_vax = pd.read_csv('WI_covid_vax_stats.tsv', sep='\t', dtype={'FIPS': str})

# Normalise the dates to 'YYYY-MM-DD' strings
wi_vax['Date'] = pd.to_datetime(wi_vax['Date']).dt.strftime('%Y-%m-%d')
wi_vax

In [None]:
# CDC rows for FIPS '55025' (Dane), with the 1+ dose percentage renamed to
# match the column name used in grouped_df.
dane_vax = (
    wi_vax[wi_vax['FIPS'] == '55025']
    .rename(columns={'Administered_Dose1_Pop_Pct': 'Vax_Rate_1dose'})
)
dane_vax

In [None]:
# CDC rows for FIPS '55079' (Milwaukee), with the 1+ dose percentage renamed
# to match the column name used in grouped_df2.
mke_vax = (
    wi_vax[wi_vax['FIPS'] == '55079']
    .rename(columns={'Administered_Dose1_Pop_Pct': 'Vax_Rate_1dose'})
)
mke_vax

In [None]:

# CDC 1+ dose percentage for Dane County over time
fig3 = px.line(
    dane_vax,
    x="Date",
    y='Vax_Rate_1dose',
    title='Vaccination Rate in Dane County, 1+ dose',
    hover_data={"Date": "|%B %d, %Y"},
)

# One tick per month, labelled as abbreviated month over year
monthly_axis = dict(dtick="M1", tickformat="%b\n%Y")
fig3.update_xaxes(**monthly_axis)
fig3.show()

In [None]:
# Dane County: CDC percentage (trace1) against this dataset's cumulative
# percentage (trace2), drawn as two lines on one chart.
trace1 = go.Scatter(
    x=dane_vax['Date'], y=dane_vax['Vax_Rate_1dose'],
    mode='lines', name='vax_stats, Dane',
)
trace2 = go.Scatter(
    x=grouped_df['WEDSS_DOC'], y=grouped_df['Vax_Rate_1dose'],
    mode='lines', name='dataset, Dane',
)

# Title and axis labels
layout = go.Layout(
    title='Vaccination Rate in Dane County, 1+ dose',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Percent Vaccinated (%)'),
)

# Assemble and display the figure
fig_data = [trace1, trace2]
fig_dane = go.Figure(data=fig_data, layout=layout)
fig_dane.show()

# Save a standalone HTML copy; the data_summary_figs directory must already exist
fig_dane.write_html("data_summary_figs/2024-01-16_dane_vax_stats.html")


In [None]:
# Dane and Milwaukee on one chart: CDC percentages (trace1, trace2) and this
# dataset's cumulative percentages (trace3, trace4).
line_only = dict(mode='lines')
trace1 = go.Scatter(x=dane_vax['Date'], y=dane_vax['Vax_Rate_1dose'],
                    name='vax_stats, Dane', **line_only)
trace2 = go.Scatter(x=mke_vax['Date'], y=mke_vax['Vax_Rate_1dose'],
                    name='vax_stats, MKE', **line_only)
trace3 = go.Scatter(x=grouped_df['WEDSS_DOC'], y=grouped_df['Vax_Rate_1dose'],
                    name='dataset, Dane', **line_only)
trace4 = go.Scatter(x=grouped_df2['WEDSS_DOC'], y=grouped_df2['Vax_Rate_1dose'],
                    name='dataset, MKE', **line_only)

# Title and axis labels
layout = go.Layout(
    title='Vaccination Rate in Dane and Milwaukee County, 1+ dose',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Percent Vaccinated (%)'),
)

# Assemble and display the figure
fig_data2 = [trace1, trace2, trace3, trace4]
fig_WI = go.Figure(data=fig_data2, layout=layout)
fig_WI.show()

In [None]:
# Sequence counts over time, coloured by county
fig4 = px.histogram(
    data2,
    x='WEDSS_DOC',
    color='County',
    nbins=43,
    title='Sequence data over time',
)

# Quarterly ticks, labelled as abbreviated month over year
quarterly_axis = dict(showgrid=True, ticklabelmode="period", dtick="M3", tickformat="%b\n%Y")
fig4.update_xaxes(**quarterly_axis)

fig4.show()

# Save a standalone HTML copy; the data_summary_figs directory must already exist
fig4.write_html("data_summary_figs/2024-01-17_seqs_by_county.html")


In [None]:
# Samples over time split by number of vaccinations; double-clicking a legend
# entry isolates that count's data.
fig5 = px.histogram(
    data2,
    x='WEDSS_DOC',
    color='vax_count',
    nbins=43,
    title='Sequence data over time by vax status',
)

# Quarterly ticks, labelled as abbreviated month over year
quarterly_axis = dict(showgrid=True, ticklabelmode="period", dtick="M3", tickformat="%b\n%Y")
fig5.update_xaxes(**quarterly_axis)

fig5.show()

In [None]:
# Samples over time, coloured by the Gender.x column
fig6 = px.histogram(
    data2,
    x='WEDSS_DOC',
    color='Gender.x',
    nbins=43,
    title='Sequence data over time',
)

# Quarterly ticks, labelled as abbreviated month over year
quarterly_axis = dict(showgrid=True, ticklabelmode="period", dtick="M3", tickformat="%b\n%Y")
fig6.update_xaxes(**quarterly_axis)

fig6.show()

In [None]:
# Samples over time split by clade; double-clicking a legend entry isolates
# that clade's data.
# NOTE: this reuses the name fig6, replacing the figure from the previous cell.
fig6 = px.histogram(
    data2,
    x='WEDSS_DOC',
    color='Clade',
    nbins=50,
    title='Sequence data by clade',
)

# Quarterly ticks, labelled as abbreviated month over year
quarterly_axis = dict(showgrid=True, ticklabelmode="period", dtick="M3", tickformat="%b\n%Y")
fig6.update_xaxes(**quarterly_axis)

fig6.show()