# COVID-19 Variant Analysis

## Clean the Data

In [1]:
#!/usr/bin/env python
import pandas as pd
from sodapy import Socrata
from matplotlib import pyplot as plt

# Unauthenticated client for the CDC open-data portal (no app token is passed)
client = Socrata("data.cdc.gov", None)

# Accumulates the rows from every page fetched below
all_results = []

# Parameters for pagination
offset = 0
limit = 10000

while True:
    # Fetch one page of the dataset.
    # An explicit order is required for reliable paging: without it the
    # service does not guarantee that rows come back in the same order on
    # every request, so offset-based pages could overlap or skip rows.
    results_chunk = client.get(
        "jr58-6ysp",
        content_type="json",
        select="variant, creation_date, share, week_ending, usa_or_hhsregion",
        order=":id",
        limit=limit,
        offset=offset,
    )

    # Keep whatever came back (extending with an empty page is a no-op)
    all_results.extend(results_chunk)

    # A page shorter than the limit (including an empty one) is the last page
    if len(results_chunk) < limit:
        break

    # Print running statement to check if we are looping through the API
    print('running...')
    offset += limit

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(all_results)

# Display tail to make sure we get max rows
results_df.tail(5)



running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...
running...

Unnamed: 0,variant,creation_date,share,week_ending,usa_or_hhsregion
1078797,BN.1,2022-12-09T00:00:00.000,0.0235941037535667,2022-11-19T00:00:00.000,10
1078798,BQ.1,2022-12-09T00:00:00.000,0.2020025104284286,2022-11-19T00:00:00.000,10
1078799,BQ.1.1,2022-12-09T00:00:00.000,0.1741567105054855,2022-11-19T00:00:00.000,10
1078800,Other,2022-12-09T00:00:00.000,0.0,2022-11-19T00:00:00.000,10
1078801,XBB,2022-12-09T00:00:00.000,0.028597155585885,2022-11-19T00:00:00.000,10


In [2]:
# Keep only the part of each variant name that precedes the first '.'
results_df['variant_prefix'] = results_df['variant'].str.split('.').str.get(0)

# Parse the timestamp strings (e.g. 2022-12-09T00:00:00.000) into pandas datetimes
timestamp_format = '%Y-%m-%dT%H:%M:%S.%f'
for date_column in ('week_ending', 'creation_date'):
    results_df[date_column] = pd.to_datetime(results_df[date_column], format=timestamp_format)


In [3]:
# For each (week_ending, usa_or_hhsregion) pair, find the newest creation_date
group_keys = ['week_ending', 'usa_or_hhsregion']
latest_dates_df = results_df.groupby(group_keys, as_index=False)['creation_date'].max()

# Join back onto the full data so that only rows carrying that newest
# creation_date remain for each pair, then renumber the rows from zero
filtered_df = results_df.merge(
    latest_dates_df,
    on=group_keys + ['creation_date'],
).reset_index(drop=True)



In [4]:
# Convert the share column to a percentage and rename columns for presentation.
# The scaling is applied only while the raw 'share' column still exists:
# rename() silently ignores labels that are no longer present, so without this
# guard re-running the cell would multiply 'Share (%)' by 100 a second time.
if 'share' in filtered_df.columns:
    filtered_df['share'] = filtered_df['share'].astype(float) * 100
filtered_df.rename(columns={'share' : 'Share (%)', 'creation_date' : 'published_date'}, inplace=True)
filtered_df.tail(5)

Unnamed: 0,variant,published_date,Share (%),week_ending,usa_or_hhsregion,variant_prefix
49476,XBB.1.5.72,2023-09-29,1.092002,2023-09-02,USA,XBB
49477,XBB.1.9.1,2023-09-29,3.395495,2023-09-02,USA,XBB
49478,XBB.1.9.2,2023-09-29,1.567945,2023-09-02,USA,XBB
49479,XBB.2.3,2023-09-29,9.131885,2023-09-02,USA,XBB
49480,XBB.2.3.8,2023-09-29,0.294053,2023-09-02,USA,XBB


In [5]:
# Lookup from the region code strings "1".."10" to a readable
# "Region N - City" label, in case you want to use this later.
# Cities are listed in region-number order; position N (1-based) is region N.
_region_cities = (
    "Boston",
    "New York",
    "Philadelphia",
    "Atlanta",
    "Chicago",
    "Dallas",
    "Kansas City",
    "Denver",
    "San Francisco",
    "Seattle",
)
region_mapping = {
    str(number): f"Region {number} - {city}"
    for number, city in enumerate(_region_cities, start=1)
}



False
