In [10]:
import json
import pandas as pd
from io import StringIO

# --- Source: remote JSON on GitHub (requires network access) ---
URL = "https://raw.githubusercontent.com/gauravfs-14/measles-dashboard/refs/heads/master/src/data/json/cases_over_time.json"
use_url = True  # NOTE(review): nothing in this cell reads this flag; the URL is always fetched

import urllib.request, ssl
ctx = ssl.create_default_context()
with urllib.request.urlopen(URL, context=ctx) as resp:
    raw_text = resp.read().decode("utf-8")
data = json.loads(raw_text)

# ---- Flatten the nested payload into one record per (date, county) ----
# Each element of `data` carries a "county" label and a list of "cases" records.
rows = []
for entry in data:
    county_name = entry["county"]
    rows.extend(
        {"date": obs["date"], "county": county_name, "case": obs["case"]}
        for obs in entry["cases"]
    )

# Long table with real datetimes, ordered by date and then county
long_df = (
    pd.DataFrame(rows)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(["date", "county"])
)

# ---- Wide table: one row per date, one column per county (A-Z) ----
# aggfunc="first" keeps the first value when a (date, county) pair repeats.
wide_df = pd.pivot_table(
    long_df,
    index="date",
    columns="county",
    values="case",
    aggfunc="first",
).sort_index(axis=1)

# Export copy whose index holds YYYY-MM-DD strings (Excel-friendly)
out = wide_df.set_axis(wide_df.index.strftime("%Y-%m-%d"), axis=0)

# Write the CSV, then echo the path and the first ten rows
csv_path = "time_series_county_cases.csv"
out.to_csv(csv_path, index_label="date")

print("Saved:", csv_path)
print(out.head(10).to_string())  # preview first few rows


Saved: time_series_county_cases.csv
county      Andrews, TX  Atascosa, TX  Bailey, TX  Bell, TX  Bexar, TX  Borden, TX  Brazoria, TX  Brewster, TX  Brown, TX  Carson, TX  Cochran, TX  Collin, TX  Dallam, TX  Dallas, TX  Dawson, TX  Denton, TX  Eastland, TX  Ector, TX  El Paso, TX  Erath, TX  Fannin, TX  Fort Bend, TX  Gaines, TX  Garza, TX  Hale, TX  Hardeman, TX  Harris, TX  Harrison, TX  Hays, TX  Hockley, TX  Lamar, TX  Lamb, TX  Lubbock, TX  Lynn, TX  Martin, TX  McLennan, TX  Midland, TX  Parmer, TX  Potter, TX  Randall, TX  Reeves, TX  Rockwall, TX  Scurry, TX  Shackelford, TX  Tarrant, TX  Terry, TX  Travis, TX  Upshur, TX  Williamson, TX  Yoakum, TX
date                                                                                                                                                                                                                                                                                                                                          

In [11]:
import json
import urllib.request, ssl

URL = "https://raw.githubusercontent.com/gauravfs-14/measles-dashboard/refs/heads/master/src/data/json/cases_over_time.json"

# Download and decode the JSON payload
ctx = ssl.create_default_context()
with urllib.request.urlopen(URL, context=ctx) as resp:
    payload = resp.read()
data = json.loads(payload.decode("utf-8"))

# De-duplicate the county labels and order them alphabetically
counties = sorted(set(entry["county"] for entry in data))

# One county per line
print(*counties, sep="\n")

# Same list again on a single tab-separated line (pastes across Excel columns)
print(*counties, sep="\t")


Andrews, TX
Atascosa, TX
Bailey, TX
Bell, TX
Bexar, TX
Borden, TX
Brazoria, TX
Brewster, TX
Brown, TX
Carson, TX
Cochran, TX
Collin, TX
Dallam, TX
Dallas, TX
Dawson, TX
Denton, TX
Eastland, TX
Ector, TX
El Paso, TX
Erath, TX
Fannin, TX
Fort Bend, TX
Gaines, TX
Garza, TX
Hale, TX
Hardeman, TX
Harris, TX
Harrison, TX
Hays, TX
Hockley, TX
Lamar, TX
Lamb, TX
Lubbock, TX
Lynn, TX
Martin, TX
McLennan, TX
Midland, TX
Parmer, TX
Potter, TX
Randall, TX
Reeves, TX
Rockwall, TX
Scurry, TX
Shackelford, TX
Tarrant, TX
Terry, TX
Travis, TX
Upshur, TX
Williamson, TX
Yoakum, TX
Andrews, TX	Atascosa, TX	Bailey, TX	Bell, TX	Bexar, TX	Borden, TX	Brazoria, TX	Brewster, TX	Brown, TX	Carson, TX	Cochran, TX	Collin, TX	Dallam, TX	Dallas, TX	Dawson, TX	Denton, TX	Eastland, TX	Ector, TX	El Paso, TX	Erath, TX	Fannin, TX	Fort Bend, TX	Gaines, TX	Garza, TX	Hale, TX	Hardeman, TX	Harris, TX	Harrison, TX	Hays, TX	Hockley, TX	Lamar, TX	Lamb, TX	Lubbock, TX	Lynn, TX	Martin, TX	McLennan, TX	Midland, TX	Parmer, TX	Potter

In [12]:
import pandas as pd
from pathlib import Path

path = Path(r"C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\Datasets Used\INFECTIONS BACKUP.csv")

# Load, parse the date column (unparseable values become NaT and are dropped),
# and order the rows chronologically.
df = (
    pd.read_csv(path)
    .assign(date=lambda raw: pd.to_datetime(raw['date'], errors='coerce'))
    .dropna(subset=['date'])
    .sort_values('date')
)

# Calendar of every day between the first and last observed date
full_dates = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
full_df = pd.DataFrame({'date': full_dates})

# Left-join the observations onto the calendar so days without a row still appear
merged = pd.merge(full_df, df, on='date', how='left')

# Every non-date column: missing values become 0 and the column is cast to int
county_cols = [name for name in merged.columns if name != 'date']
merged[county_cols] = merged[county_cols].fillna(0).astype(int)

# Persist the expanded daily table
out = Path(r"C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\Datasets Used\infection_county_cases.csv")
merged.to_csv(out, index=False)
print(f"Saved expanded daily file: {out}")


Saved expanded daily file: C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\Datasets Used\infection_county_cases.csv


In [13]:
import pandas as pd
from datetime import timedelta
from pathlib import Path

# Notebook-level location of the infections CSV on the local machine
# (the same file that the call in the next cell passes as `infections_csv`).
path = Path(r"C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\time_series_county_cases.csv")

def make_recoveries_from_infections(
    infections_csv: str,
    out_csv: str,
    shift_days: int = 10,
    exceptions: list | None = None,
    right_censor_to_last: bool = True,
):
    """
    Create a recoveries dataset by shifting infections forward by `shift_days`.

    Parameters
    ----------
    infections_csv : str
        Path to the infections CSV (wide: 'date' + county columns). Must include every day.
    out_csv : str
        Path to write the recoveries CSV (wide).
    shift_days : int
        Number of days to shift infections forward to represent recovery.
    exceptions : list[(date, county, subtract_n)]
        e.g., [("2025-03-04","Gaines",1), ("2025-04-04","Gaines",1)]
    right_censor_to_last : bool
        If True, if (infection date + shift_days) exceeds the last date, assign to last date.
        If False, drop those (not closed).
    """
    # path = Path(infections_csv)
    df = pd.read_csv(path)
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date']).sort_values('date').reset_index(drop=True)
    county_cols = [c for c in df.columns if c != 'date']
    for c in county_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0).astype(int)

    min_d, max_d = df['date'].min(), df['date'].max()

    # Long infections
    long_inf = df.melt(id_vars='date', var_name='county', value_name='infected')
    long_inf = long_inf[long_inf['infected'] > 0].copy()

    def map_recovery_date(d):
        rec = d + timedelta(days=shift_days)
        if rec <= max_d:
            return rec
        return max_d if right_censor_to_last else pd.NaT

    long_inf['recovery_date'] = long_inf['date'].apply(map_recovery_date)
    if not right_censor_to_last:
        long_inf = long_inf.dropna(subset=['recovery_date'])

    recovered_long = (long_inf
                      .groupby(['recovery_date','county'], as_index=False)['infected']
                      .sum()
                      .rename(columns={'infected':'recovered'}))

    # Apply exceptions (subtract from recovery on (inf_date + shift_days))
    if exceptions:
        for exc_date, county_name, subtract_n in exceptions:
            exc_date = pd.to_datetime(exc_date)
            rec_date = exc_date + timedelta(days=shift_days)
            if rec_date > max_d and right_censor_to_last:
                rec_date = max_d
            if rec_date < min_d or rec_date > max_d:
                continue
            mask = (recovered_long['recovery_date']==rec_date) & (recovered_long['county']==county_name)
            if not mask.any():
                recovered_long = pd.concat([
                    recovered_long,
                    pd.DataFrame([{'recovery_date': rec_date, 'county': county_name, 'recovered': -subtract_n}])
                ], ignore_index=True)
            else:
                recovered_long.loc[mask, 'recovered'] = recovered_long.loc[mask, 'recovered'] - subtract_n

    # Back to wide aligned to original date index
    recovered_wide = (recovered_long
                      .pivot(index='recovery_date', columns='county', values='recovered')
                      .fillna(0).astype(int)
                      .reindex(df['date'].unique(), fill_value=0)
                      .rename_axis('date')
                      .reset_index())

    # Keep original columns/order
    for c in county_cols:
        if c not in recovered_wide.columns:
            recovered_wide[c] = 0
    recovered_wide = recovered_wide[['date'] + county_cols]
    recovered_wide[county_cols] = recovered_wide[county_cols].astype(int)

    Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
    recovered_wide.to_csv(out_csv, index=False)


In [14]:
# Build the recoveries table using a 10-day infection-to-recovery lag.
# NOTE(review): the table exported earlier in this notebook has headers such as
# "Gaines, TX". If the CSV read here uses the same headers, "Gaines" matches no
# column and these two exceptions do not change the output - confirm the label.
make_recoveries_from_infections(
    r"C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\time_series_county_cases.csv",
    r"C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\recov_og_county_cases.csv",
    shift_days=10,
    right_censor_to_last=True,
    exceptions=[
        ("2025-03-04", "Gaines", 1),
        ("2025-04-04", "Gaines", 1),
    ],
)

Create the raw weekly infection data per county by summing the daily counts within each week.

In [33]:
import pandas as pd

# Input (daily) and output (weekly) locations
input_path = r"C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\Datasets Used\infection_county_cases.csv"
output_path = r"C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\Datasets Used\weekly_infection_county_cases.csv"

# Load the daily table and index it by parsed date so it can be resampled
df = (
    pd.read_csv(input_path)
    .assign(date=lambda raw: pd.to_datetime(raw['date']))
    .set_index('date')
)

# Sum every column within each 'W' (weekly) bin, then restore 'date' as a column
weekly_df = df.resample('W').sum().reset_index()

# Write the weekly totals
weekly_df.to_csv(output_path, index=False)

print(f"Weekly aggregated infections saved to {output_path}")


Weekly aggregated infections saved to C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\Datasets Used\weekly_infection_county_cases.csv


Aggregate the daily death counts per county into 4-day bins.

In [36]:
import pandas as pd

# File paths
input_path = r"C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\Datasets Used\death_by_county.csv"
output_path = r"C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\Datasets Used\twice_weekly_death_county_cases.csv"

# Read the raw table and index it by parsed date so it can be resampled.
# set_index returns a new frame; no in-place mutation is needed.
df = pd.read_csv(input_path)
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

# Sum every column over consecutive 4-day bins. This is the "approximately
# twice weekly" cadence: the bins are 4 days long, not 3 (a true twice-weekly
# cadence would be 3.5 days).
twice_weekly_df = df.resample('4D').sum().reset_index()

# Save the aggregated death counts to CSV
twice_weekly_df.to_csv(output_path, index=False)

# The input is the deaths table, so the message says "deaths" rather than "infections".
print(f"Twice weekly aggregated deaths saved to {output_path}")


Twice weekly aggregated infections saved to C:\Users\msnin\Downloads\Oden\Oden-github\Measles Model\Datasets Used\twice_weekly_death_county_cases.csv
