In [4]:
# imports 
import numpy as np
import os 
import pandas as pd 
import matplotlib.pyplot as plt
import warnings

# suppress warnings (quite prevalent with pandas and numpy)
warnings.simplefilter("ignore")

pd.options.display.max_rows = 1000

# maintain directories well defined
PROJECT_ROOT_DIR = "."
ALL_DATA_DIR = "dat"
DATA_DIR = "novel-covid-data"
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, ALL_DATA_DIR, DATA_DIR)

# global variables - desired columns from dataset
COLS = ["SNo", "ObservationDate", "Province/State", "Country/Region", "Confirmed", "Deaths"]

# function for initialization
def initialize_data(dataset, data_path=DATA_PATH, cols=COLS):
    csv_path = os.path.join(data_path, dataset)
    data = pd.read_csv(csv_path, usecols=cols)
    return data

# link to data - https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset#covid_19_data.csv
# initialize
data = initialize_data("covid_|19_data.csv")

FileNotFoundError: [Errno 2] File b'./dat/novel-covid-data/covid_|19_data.csv' does not exist: b'./dat/novel-covid-data/covid_|19_data.csv'

In [None]:
indexes = data[data["Province/State"]=="Recovered"].index
data.drop(indexes, inplace = True)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# unstratified split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
# train_set

NameError: name 'data' is not defined

In [16]:
country_counts = data["Country/Region"].value_counts()
labels = data["Country/Region"].astype('category').cat.categories.tolist()
singles = [i for i in labels if country_counts[i] == 1]
for i in singles:
    indexes = data[data["Country/Region"] == i].index
    data.drop(indexes, inplace = True)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Country/Region"]):
   strat_train_set = data.loc[train_index]
   strat_test_set = data.loc[test_index]

In [None]:
# check sampling ratios for US as comparison
# stratified
print("Stratified Test: ", strat_test_set["Country/Region"].value_counts()["US"]/len(strat_test_set))
print("Stratified Train: ", strat_train_set["Country/Region"].value_counts()["US"]/len(strat_train_set))

# unstratified
print("UnStratified Test: ", test_set["Country/Region"].value_counts()["US"]/len(test_set))
print("UnStratified Train: ", train_set["Country/Region"].value_counts()["US"]/len(train_set))

# original data
print("Original Data: ", data["Country/Region"].value_counts()["US"]/len(data))

In [2]:
covid_data = strat_train_set.copy()
# covid_data

NameError: name 'strat_train_set' is not defined

In [6]:
country_groups = covid_data.groupby("Country/Region")
missing_all = []
missing_none = []
missing_some = []
for k, df in country_groups: 
    if df["Province/State"].isnull().all():
        missing_all.append(k)
    elif df["Province/State"].isnull().any():
        if (~df["Province/State"].isnull()).all():
            missing_none.append(k)
        else: 
            missing_some.append(k)

# print(missing_all, missing_none, missing_some)

In [7]:
def country_to_province(df, col_name="Region"):
    df.loc[df["Province/State"].isnull(), col_name] = df["Country/Region"] 
    df.loc[(~df["Province/State"].isnull()), col_name] = df["Province/State"] 
    df.drop("Country/Region", axis = 1, inplace = True)
    df.drop("Province/State", axis = 1, inplace = True)
    
# country_to_province(covid_data)


In [None]:
covid_data["ObsDate"] = pd.to_datetime(covid_data["ObservationDate"])

In [None]:
def days_since_first_obs(df, col_name="Day_Delta"):
    region_groups = df.groupby("Region")
    df[col_name] = np.nan
    for k, group in region_groups:
        group.sort_values(by="ObsDate", inplace = True)
        first_day = group.iloc[0]["ObsDate"]
        for i in range(len(group)):
            df.ix[(group.iloc[i]["SNo"]-1), col_name] = (group.iloc[i]["ObsDate"] - first_day).days

days_since_first_obs(covid_data)

In [None]:
covid_data

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

# we're going to rewrite our preprocessing functions as custom Sci-Kit classes 
# this will work better with other off the shelf Sci-Kit processes

class CombineLocations(BaseEstimator, TransformerMixin):
    def __init__(self, drop_original_regions = True): 
        self.drop_original_regions = drop_original_regions
    def fit(self, df, col_name = "Region"):
        return self
    def transform(self, df, col_name = "Region"):
        df.loc[df["Province/State"].isnull(), col_name] = df["Country/Region"] 
        df.loc[(~df["Province/State"].isnull()), col_name] = df["Province/State"]
        if self.drop_original_regions:
            df.drop("Country/Region", axis = 1, inplace = True)
            df.drop("Province/State", axis = 1, inplace = True) 

combine_locs = CombineLocations()
combine_locs.fit(covid_data)
combine_locs.transform(covid_data)
#covid_data

In [None]:
class DaysObserved(BaseEstimator, TransformerMixin):
    def __init__(self, drop_original_dates = True): 
        self.drop_original_dates = drop_original_dates
    def fit(self, X, col_name = "Day_Delta"):
        return self
    def transform(self, X, col_name = "Day_Delta"):
        X["ObsDate"] = pd.to_datetime(X["ObservationDate"])
        region_groups = X.groupby("Region")
        X[col_name] = np.nan
        for k, group in region_groups:
            group.sort_values(by = "ObsDate", inplace=True)
            first_day = group.iloc[0]["ObsDate"]
            for i in range(len(group)):
                X.ix[(group.iloc[i]["SNo"]-1), col_name] = (group.iloc[i]["ObsDate"] - first_day).days
        X.drop("ObsDate", axis = 1, inplace = True)
        if self.drop_original_dates:
            X.drop("ObservationDate", axis = 1, inplace = True)
                
days_observed = DaysObserved(drop_original_dates = False)
days_observed.fit(covid_data)
days_observed.transform(covid_data)
#covid_data

In [None]:
from pandas.plotting import scatter_matrix

# define a list of numerical attributes and categorical attributes
num_attributes = ["Confirmed", "Day_Delta", "Deaths", "SNo"]
cat_attributes = ["ObservationDate", "Region"]

scatter_matrix(covid_data[num_attributes], figsize=(12, 8))

In [1]:
from sklearn.impute import SimpleImputer

# imputer -> fillnas based on a policy and save statistics for later use (numerical features)
imputer = SimpleImputer(strategy="median")

# imputer is only for numerical features so lets try it out
covid_data_num = covid_data.copy()
covid_data_num.drop(cat_attributes, axis = 1, inplace = True)

# in this case, the .fit method is simply calculating the medians of each feature
imputer.fit(covid_data_num)

# imputer also saves statistics (median) as this will be of use if we need to transform new incoming data
# we won't have to re-fit the imputer instance
# some of our features may not even have had missing values, but the imputer applied the computation anyways
# print(imputer.statistics_)

# the transform method (which actually fills nas) returns a numpy array
covid_data_num_arr = imputer.transform(covid_data_num)
covid_data_num = pd.DataFrame(covid_data_num_arr, columns=covid_data_num.columns, index=covid_data_num.index)

NameError: name 'covid_data' is not defined

In [None]:
from sklearn.preprocessing import OneHotEncoder

# the onehotencoder turns categorical variables in to binary values for each possible value of a feature 
# this avoids "confusion" on the model's part, regarding distance related relationships 

covid_data.dropna(inplace = True)
onehot_encoder = OneHotEncoder()
covid_data_cat = covid_data.copy()
covid_data_cat.drop(num_attributes, axis = 1, inplace = True)

covid_data_cat1H = onehot_encoder.fit_transform(covid_data_cat)

In [None]:
from sklearn.preprocessing import StandardScaler
# standard scaler -> scale numerical features with scaling/standardization/normalization methods

std_scaler = StandardScaler()
covid_data_num_array = std_scaler.fit_transform(covid_data_num)
covid_data_num = pd.DataFrame(covid_data_num_arr, columns=covid_data_num.columns, index=covid_data_num.index)

