In [2]:
# Nationwide
from datetime import datetime, timedelta
from math import floor, isfinite, log10

import pandas as pd
import requests

# Multiplier applied to the interpolated concentration to obtain the
# 'estimated_infections' column.
# NOTE(review): the derivation of this factor is not shown here -- TODO document its source.
INFECTIONS_PER_CONCENTRATION_UNIT = 915.6749186924305

# Read only the columns at positions 0, 1 and 3, skip the first two rows of the
# file, and supply our own column names.
df = pd.read_csv(
    "US_Biobot_nationwide_data_2024-04-29.csv",
    skiprows=2,
    usecols=[0, 1, 3],
    names=['Date', 'Location', 'Concentration'],
)

# Parse the dates explicitly after loading. read_csv's `date_parser` argument is
# deprecated in pandas 2.x; pd.to_datetime with an explicit format behaves the
# same on every pandas version.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# Clean non-numeric characters from the concentration column if necessary
# (for example thousands separators or strings like '<1'). Anything that still
# cannot be parsed becomes NaN.
# NOTE(review): the pattern also strips '-' and 'e', so negative or
# scientific-notation values would be altered rather than preserved.
df['Concentration'] = pd.to_numeric(
    df['Concentration'].replace('[^0-9.]', '', regex=True),
    errors='coerce',
)

# Filter the DataFrame for the 'Nationwide' location
nationwide_df = df[df['Location'] == 'Nationwide'].sort_values('Date')

# Resample to one row per calendar day and interpolate the missing dates.
# Only the numeric column is resampled: interpolating the string 'Location'
# column is deprecated in pandas and it was discarded afterwards anyway.
biob = (
    nationwide_df.set_index('Date')[['Concentration']]
    .resample('D')
    .interpolate()
    .reset_index()
)

biob['estimated_infections'] = biob['Concentration'] * INFECTIONS_PER_CONCENTRATION_UNIT

# Define a function to round a number to two significant digits
def round_to_two_significant_digits(num):
    """Round ``num`` to two significant digits.

    Parameters
    ----------
    num : int or float
        Value to round.

    Returns
    -------
    int or float
        ``0`` when ``num`` equals zero; ``num`` unchanged when it is NaN or
        infinite (these have no order of magnitude to round to); otherwise
        ``num`` rounded so that two significant digits remain.
    """
    if num == 0:
        return 0
    # NaN/inf would make floor(log10(...)) raise (ValueError / OverflowError),
    # which would abort a whole Series.apply over a column containing gaps.
    if not isfinite(num):
        return num
    # Position of the leading digit relative to the decimal point
    magnitude = floor(log10(abs(num)))
    # Keep the leading digit plus one more
    return round(num, 1 - magnitude)

# Apply the rounding function to the relevant columns
biob['Concentration'] = biob['Concentration'].apply(round_to_two_significant_digits)

# Reshape to long format: one 'inf' row and one 'wastewater' row per date.
# melt() stacks all 'inf' rows ahead of all 'wastewater' rows; the stable sort
# on Date then interleaves them as inf, wastewater for each day.
combined_df = (
    biob.rename(columns={'estimated_infections': 'inf', 'Concentration': 'wastewater'})
    .melt(id_vars='Date', value_vars=['inf', 'wastewater'],
          var_name='Measure', value_name='Value')
    .sort_values('Date', kind='stable')
    .assign(Country='United_States', Region='Nationwide')
    .loc[:, ['Country', 'Region', 'Date', 'Measure', 'Value']]
    .reset_index(drop=True)
)

states_df = pd.read_csv('US_states_min.csv')

# The state rows are read without date parsing, so their dates are plain strings,
# whereas the nationwide dates are Timestamps. Concatenating them as-is yields a
# mixed column that is exported in two different formats, so the nationwide
# dates are formatted as YYYY-MM-DD for the export. combined_df itself keeps
# its datetime dates.
# NOTE(review): assumes US_states_min.csv stores dates as YYYY-MM-DD -- confirm.
nationwide_export = combined_df.assign(Date=combined_df['Date'].dt.strftime('%Y-%m-%d'))
final_combined_df = pd.concat([nationwide_export, states_df], ignore_index=True)

final_combined_df.to_json('United_States_states_min.json', orient='records', date_format='iso')
final_combined_df.to_csv('United_States_states_min.csv', index=False)
print(final_combined_df)

              Country      Region                 Date     Measure  \
0       United_States  Nationwide  2020-03-07 00:00:00         inf   
1       United_States  Nationwide  2020-03-07 00:00:00  wastewater   
2       United_States  Nationwide  2020-03-08 00:00:00         inf   
3       United_States  Nationwide  2020-03-08 00:00:00  wastewater   
4       United_States  Nationwide  2020-03-09 00:00:00         inf   
...               ...         ...                  ...         ...   
128789  United_States          AK           2024-04-18  wastewater   
128790  United_States          AK           2024-04-19         inf   
128791  United_States          AK           2024-04-19  wastewater   
128792  United_States          AK           2024-04-20         inf   
128793  United_States          AK           2024-04-20  wastewater   

              Value  
0       2497.251521  
1          2.700000  
2       5295.849118  
3          5.800000  
4       8094.446716  
...             ...  
12878

           Date  Concentration
0    2020-03-07       2.727225
1    2020-03-08       5.783547
2    2020-03-09       8.839869
3    2020-03-10      11.896192
4    2020-03-11      14.952514
...         ...            ...
1473 2024-03-19     435.058070
1474 2024-03-20     420.137354
1475 2024-03-21     405.216637
1476 2024-03-22     390.295921
1477 2024-03-23     375.375205

[1478 rows x 2 columns]


            Country      Region       Date     Measure          Value
0     United_States  Nationwide 2020-03-07         inf    2497.251521
1     United_States  Nationwide 2020-03-07  wastewater       2.700000
2     United_States  Nationwide 2020-03-08         inf    5295.849118
3     United_States  Nationwide 2020-03-08  wastewater       5.800000
4     United_States  Nationwide 2020-03-09         inf    8094.446716
...             ...         ...        ...         ...            ...
2951  United_States  Nationwide 2024-03-21  wastewater     410.000000
2952  United_States  Nationwide 2024-03-22         inf  357384.185789
2953  United_States  Nationwide 2024-03-22  wastewater     390.000000
2954  United_States  Nationwide 2024-03-23         inf  343721.660173
2955  United_States  Nationwide 2024-03-23  wastewater     380.000000

[2956 rows x 5 columns]
