In [136]:
import pandas as pd
import numpy as np

In [137]:
cases = pd.read_csv("dataset/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")

In [138]:
cases["submission_date"] = pd.to_datetime(cases["submission_date"])

In [139]:
georgia_df = cases[cases["state"].isin(['GA'])].sort_values(by = "submission_date").set_index("submission_date")
georgia_case_death = georgia_df[["new_case", "new_death"]]

In [140]:
indiana_df = cases[cases["state"].isin(['IN'])].sort_values(by = "submission_date").set_index("submission_date")
indiana_case_death = indiana_df[["new_case", "new_death"]]

In [141]:
def tukey_range(values):
    
    alpha = 1.5
    sorted_values = sorted(values)
    
    # Q1
    q1_index = int(np.ceil(0.25 * len(values)))
    q1 = sorted_values[q1_index]
    
    # Q2
    q3_index = int(np.ceil(0.75 * len(values)))
    q3 = sorted_values[q3_index]
    
    # IQR
    iqr = q3 - q1
    
    return q1 - (alpha * iqr), q3 + (alpha * iqr)

In [142]:
def clean_data(series):
    
    #Check if any NA values
    print("Missing Values:{}\n".format(len(df[col][df[col].isna()])))
    
    invalid_entries = series[series < 0].values
    print("Invalid entries: {}\n".format(invalid_entries))
    
    series = series[series >=0]
    lower_limit, upper_limit = tukey_range(series.values)
    print("Lower Range: {}, Upper Range: {}\n".format(lower_limit, upper_limit))
    
    outliers = list(series[series < lower_limit].values) + list(series[series > upper_limit].values)
    print("Total Outliers: {}\n".format(len(outliers)))
    print("Outliers: \n{}\n".format(outliers))
    
    outliers_index = list(series[series < lower_limit].index) + list(series[series > upper_limit].index)
    return series.drop(index = outliers_index)

In [143]:
dfs = {"Georgia":georgia_case_death, "Indiana":indiana_case_death}
res = []
for state, df in dfs.items():
    print("State:{}".format(state))
    print("-------------------------------------------------")
    print("-------------------------------------------------")
    s = []
    for col in df.columns:
        print("Feature: {}".format(col))
        print("----------------------------------")
        clean_series = clean_data(df[col])
        # clean_series.index = df.index
        s.append(clean_series)
    res.append(pd.concat(s, axis = 1))
georgia, indiana = [r.dropna(subset = ["new_case", "new_death"]) for r in res]

State:Georgia
-------------------------------------------------
-------------------------------------------------
Feature: new_case
----------------------------------
Missing Values:0

Invalid entries: [-5]

Lower Range: -3912.5, Upper Range: 8019.5

Total Outliers: 68

Outliers: 
[8141, 9079, 10286, 9450, 9053, 11709, 11137, 10091, 9790, 13296, 11926, 8193, 9193, 8596, 9036, 9806, 8533, 8205, 8150, 8374, 8985, 8393, 9836, 9581, 9186, 10677, 10823, 11084, 10521, 9702, 8412, 12018, 8792, 8278, 9089, 10012, 9495, 26279, 13018, 19124, 24420, 23438, 47436, 17603, 24024, 23813, 26033, 46474, 24865, 18939, 24310, 19943, 35149, 12071, 27781, 22684, 18671, 42786, 16232, 17332, 21708, 18785, 21534, 10226, 8020, 8029, 8798, 11804]

Feature: new_death
----------------------------------
Missing Values:0

Invalid entries: [ -2  -2 -14 -91  -1  -1]

Lower Range: -81.0, Upper Range: 151.0

Total Outliers: 39

Outliers: 
[157, 187, 172, 163, 153, 222, 196, 171, 186, 179, 161, 184, 159, 210, 208, 169, 

In [147]:
georgia.reset_index().to_csv("georgia_cases_deaths.csv", index = False)
indiana.reset_index().to_csv("indiana_cases_deaths.csv", index = False)