In [1]:
#!/usr/bin/env python
import pandas as pd
from sodapy import Socrata

# Socrata dataset identifier used by every request in this cell.
DATASET_ID = "jr58-6ysp"

# Client for the CDC open-data portal; no app token is passed (second argument is None).
client = Socrata("data.cdc.gov", None)

# Fetch variant names, the share column, and the week_ending column from the API.
# NOTE(review): no `limit` is passed, so whatever default row cap the portal applies
# is in effect here, while the COUNT queries below run server-side over every
# matching row -- confirm both cover the same data.
results = client.get(DATASET_ID, content_type="json", select="variant, share, week_ending")

# Convert to pandas DataFrame and derive the prefix (text before the first '.').
results_df = pd.DataFrame.from_records(results)
results_df['share'] = results_df['share'].astype(float) * 100  # fraction -> percentage
results_df['variant_prefix'] = results_df['variant'].str.split('.').str[0]

# Compute total share for each unique prefix and week.
total_shares_per_week = results_df.groupby(['variant_prefix', 'week_ending'])['share'].sum().reset_index()


def fetch_variant_count(prefix, week):
    """Return COUNT(variant) for rows matching ``prefix.`` and one ``week_ending``.

    The count is computed server-side. The pattern ``'{prefix}.%'`` only matches
    variants that have a dot right after the prefix.
    NOTE(review): a variant equal to the bare prefix (no dot) contributes to the
    share total above but not to this count -- confirm that is intended.

    Parameters:
        prefix: variant prefix, interpolated verbatim into the SoQL LIKE pattern.
        week: week_ending value exactly as returned by the API.

    Returns:
        The count as an int (the JSON payload value is cast with ``int``).
    """
    count_results = client.get(
        DATASET_ID,
        content_type="json",
        where=f"variant LIKE '{prefix}.%' AND week_ending='{week}'",
        select="COUNT(variant) AS variant_count",
    )
    return int(count_results[0]["variant_count"])


# 'Other' is excluded from the final table, so drop it *before* querying:
# this avoids one wasted HTTP request per 'Other' week.
prefix_weeks = total_shares_per_week[total_shares_per_week['variant_prefix'] != 'Other']

# Fetch the count for each remaining prefix/week combination.
counts = [
    {
        'variant_prefix': pair.variant_prefix,
        'week_ending': pair.week_ending,
        'count': fetch_variant_count(pair.variant_prefix, pair.week_ending),
    }
    for pair in prefix_weeks.itertuples(index=False)
]

# Explicit columns keep the frame well-formed (and mergeable) even when `counts` is empty.
count_df = pd.DataFrame(counts, columns=['variant_prefix', 'week_ending', 'count'])

# Merge count_df with total_shares_per_week on variant_prefix and week_ending.
final_df = pd.merge(count_df, total_shares_per_week, on=['variant_prefix', 'week_ending'])

# Rename columns for presentation.
final_df = final_df.rename(columns={'variant_prefix': 'Variant Type', 'count': 'Count'})

# Format the share as a string with two decimals and a % sign.
final_df['Total Share Percentage'] = final_df['share'].map('{:.2f}%'.format)

# Drop the unformatted 'share' column (reassign instead of mutating in place).
final_df = final_df.drop(columns='share')

print(final_df)





In [2]:
# Convert the date columns to pandas datetimes.
# Only columns actually present are converted: the fetch that builds results_df
# selects `variant, share, week_ending`, so `creation_date` may be absent and an
# unconditional lookup would raise KeyError on a fresh kernel.
for date_col in ('week_ending', 'creation_date'):
    if date_col in results_df.columns:
        results_df[date_col] = pd.to_datetime(results_df[date_col], format='%Y-%m-%dT%H:%M:%S.%f')


In [3]:
# Rename the share column so its unit is explicit, and make sure it is float.
# The values are already percentages: `share` was multiplied by 100 when
# results_df was built, so scaling again here would inflate it to x10,000
# (and once more on every re-run of this cell).
results_df = results_df.rename(columns={'share': 'Share (%)'})
results_df['Share (%)'] = results_df['Share (%)'].astype(float)
results_df.head(5)

Unnamed: 0,usa_or_hhsregion,week_ending,variant,Share (%),share_hi,share_lo,count_lt10,modeltype,time_interval,creation_date
0,1,2023-02-25,CH.1.1,0.261089,0.0046103517524898,0.0014449068112298,,smoothed,weekly,2023-03-10
1,1,2023-02-25,BN.1,0.046195,0.0006514294072985,0.000325947854435,,smoothed,weekly,2023-03-10
2,1,2023-02-25,XBB.1.5.1,1.951178,0.0329358167946338,0.0112925367429852,,smoothed,weekly,2023-03-10
3,1,2023-02-25,BA.5.2.6,0.006352,9.09059381228e-05,4.413523492985405e-05,,smoothed,weekly,2023-03-10
4,1,2023-02-25,BA.2.75,0.003724,7.89429541327e-05,1.6694790247129276e-05,,smoothed,weekly,2023-03-10


In [4]:
# Preview the last five rows of results_df.
results_df.tail(5)

Unnamed: 0,usa_or_hhsregion,week_ending,variant,Share (%),share_hi,share_lo,count_lt10,modeltype,time_interval,creation_date
1995,7,2023-04-15,BN.1,0.020911,0.0003379673580639,0.0001276578259421,,smoothed,weekly,2023-04-28
1996,7,2023-04-15,BA.2,0.002046,9.0587302111e-05,2.951019951069611e-06,,smoothed,weekly,2023-04-28
1997,7,2023-04-15,BF.7,0.003138,6.4325700805e-05,1.4647738680650946e-05,,smoothed,weekly,2023-04-28
1998,7,2023-04-15,BA.1.1,0.006108,0.0014895992353558,0.0,,smoothed,weekly,2023-04-28
1999,7,2023-04-15,BA.2.75,0.003924,7.29736566427e-05,2.050479088211432e-05,,smoothed,weekly,2023-04-28
