# The Covid Data Analysis Object

## Introduction
The Covid_Data class takes data from the Covid-19 API and makes it quick and easy to extract information. It has a number of methods that pull summary data (and, once the goals listed below are finished, graphs) from the information provided by the API. Run help(Covid_Data) to see the methods and their documentation.

Once you have created an instance for the first time, it is fastest to call .save() and then, in later sessions, .load(), rather than building a completely new instance each time you start up the program.

### Extending the Class
A note on the data's structure:

The data are stored in two objects: a pandas dataframe with information on each unique reporting location (i.e. "Baltimore, MD", "Paris, FR") called self._loc_info and a rank 4 numpy array containing all covid records called self._covid_data. The numpy array's axes are Location, Date, Data type ("Confirmed", "Deaths", etc.), and Data (Total, percent, etc.)


### Note on the status of this notebook

This is an ongoing personal project. As it stands, a user of this notebook can use the class to pull down fresh data and build summary dataframes of global Covid Data. 

Outstanding goals of the project are:

- Finish the .inspect() function so that the user can specify tighter constraints on the summary dataframe
- Develop a .disp() function that visualizes the data from an .inspect() dataframe result
- Integrate population data so that per capita rates can be added to the data



In [None]:
import pandas as pd
from multiprocessing import Pool
import multiprocessing as mp
import numpy as np
import json, requests, datetime, itertools
from IPython.display import clear_output, display
%matplotlib inline
from matplotlib import pyplot
class Covid_Data:
    
    def __init__(self,file1='', file2=''):
        """Sets up new object from saved files. Or creates
        an empty object waiting for .load().
        
        Attributes
        ----------
        File1 : string, optional
            Can be .json, .pkl, or empty.
            OR if 'New', class will download fresh data from the API
        File2 : string, optional
            Can be .npy or empty."""
        self._cores = mp.cpu_count()
        #load file data
        if file2:
            self.load(file1,file2)
        elif file1 == 'New':
            print('updating')
            self.update_data()
        elif file1:
            self._initial_setup(file1)
        else:
            print('Created empty data object.\nRun .load(file1,file2) or .update_data() '+
                  'to get started!') 
        #set initial filter attributes
        self.start_date = datetime.date(2020,1,22)
        days = datetime.timedelta(self._covid_data.shape[1])
        self.end_date = self.start_date+days
        self.locations = [[],[],[]]
        self._week_data
        self.current_period = 'Daily'
        
    #Housekeeping/instrumental Methods
    def _parallelize_dataframe(self,df, func, n_cores=4):
        #Takes dataframe operations and parallelizes them across all cpu cores.
        df_split = np.array_split(df, n_cores)
        pool = Pool(n_cores)
        df = pd.concat(pool.map(func, df_split))
        pool.close()
        pool.join()
        return df
    def _parallelize_array(self, df, func, n_cores=4):
        #Takes numpy array operations and parallelizes them across all cpu cores.
        df_split1 = np.array_split(df, n_cores)
        df_split = []
        count = 0 
        for arr in range(0,df_split1):
            df_split.append([df_split1[arr],count])
            count += df_split[arr].shape[0]
        pool = Pool(n_cores)
        df = np.concatenate(pool.map(func, df_split),axis=0)
        pool.close()
        pool.join()
        return df
    def _update_progress(self,progress):
        """Redraw a 20-character text progress bar.

        ``progress`` is a fraction: ints are converted to float, any other
        non-float value counts as 0, and the value is clamped to [0, 1].
        The previous notebook output is cleared before printing.
        """
        bar_length = 20
        if isinstance(progress, int):
            progress = float(progress)
        elif not isinstance(progress, float):
            progress = 0
        if progress < 0:
            progress = 0
        elif progress >= 1:
            progress = 1
        filled = int(round(bar_length * progress))
        bar = "#" * filled + "-" * (bar_length - filled)
        clear_output(wait = True)
        print("Progress: [{0}] {1:.1f}%".format(bar, progress * 100))
        
        
    #Internal Methods
    def _initial_setup(self, jsonfile): #takes a saved JSON file and sets up necessary data structures.
        #Open and load JSON file
        f = open(jsonfile)
        data = json.load(f)
        f.close()
        self._update_progress(.04) #create loading bar
        
        #dataframe to hold JSON data
        self._df = pd.DataFrame.from_dict(data, orient='columns')
        
        #drop duplicates for each location and save location data into _loc_info
        loc_info = self._df.drop_duplicates(subset = ['Country','Province','City'])
        self._loc_info = loc_info[['Country','Province','City','Lat','Lon']]
        self._update_progress(.1)
        
        #break df into location groups
        loc_groups = self._df.groupby(['Country','Province','City'])
        
        #build array of covid data, axes: [location, dates, data_type, analysis type]
                                           #loc_count, days since 1/22, 4, 3
        self._covid_data = np.array([self._build_data_init(group) for group in loc_groups])
        #WORK -- parralelize this list comprehension 
        self._update_progress(.5)
        self._covid_data = np.moveaxis(self._covid_data,1,-1)
        self._update_progress(1)
        
        
    def _build_data_init(self,arr):
        totals = arr[1][['Confirmed','Deaths','Recovered','Active']].to_numpy(dtype=float,copy=True)
        return self._build_data(totals)
    
    def _build_data(self,totals):
        yesterdays = np.vstack((totals[1:],np.array([0,0,0,0]))) #make a new totals shifted one day 
        deltas = np.subtract(totals,yesterdays) #find the difference between totals and yesterdays
        with np.errstate(divide='ignore', invalid='ignore'): # find the quotient of deltas and totals, ignore divide by zero error
            percents = np.true_divide(deltas,totals)
            percents[percents == np.inf] = 0
            percents = np.nan_to_num(percents)
        num_data = np.array([totals, deltas, percents]) #create an array with rank data type, date/loc, data cat
        return(num_data)

    def _get_new_data(self):
        """Download the full dataset from covid19api.com into a JSON file.

        Returns
        -------
        str
            The name of the file written, ``'covid_data.json'`` (relative to
            the current working directory).

        Raises
        ------
        requests.exceptions.RequestException
            On connection problems, timeouts, or a non-success HTTP status.
        """
        import requests

        url = "https://api.covid19api.com/all"

        #(connect, read) timeouts so a stalled connection cannot hang forever
        response = requests.get(url, timeout=(10, 300))
        #fail loudly instead of saving an error page as data
        response.raise_for_status()
        json_data = response.json()

        with open('covid_data.json', 'w') as writeFile:
            json.dump(json_data, writeFile)
        return('covid_data.json')
    
    
    #Properties
    @property
    def current_period(self):
        """The current period decides on the time granularity of the data. 
        It can be set to 'Daily' or 'Weekly' (default, 'Daily')."""
        return self._current_period
    @current_period.setter
    def current_period(self, period='Daily'):
        if period.lower() == 'daily':
            self._current_period = 'daily'
        elif period.lower() == 'weekly':
            self._current_period = 'weekly'
        else:
            self._current_period = 'daily'
            print('You must use "Daily" or "Weekly" to set period.')
    @property
    def _week_data(self):
        #pull every loc, every 7th day, all measures and measure types
        week_totals = self._covid_data[:,::7,:,0]
        #array from [build_data(loc) for loc in totals]
        self.__week_data = np.array([self._build_data(loc) for loc in week_totals], dtype=float, copy = True)
        return self.__week_data
    @property
    def start_date(self):
        """Start date sets the earliest date of data to consider in .inspect().
        It defaults to 1 Jan 2020. It can be set directly with a datetime.date object
        or with the .set_start_date() method."""
        return self._start_date
    @start_date.setter
    def start_date(self,start_date):
        beg = datetime.date(2020,1,22)
        end = datetime.date.today()
        if (start_date - beg).days < 0 or (end - start_date).days < 1:
            print("Start date must be on or after Jan 22, 2020 and before today.")
        else:
            self._start_date = start_date
    @property
    def end_date(self):
        """End date sets the latest date of data to consider in .inspect().
        It defaults to today. It can be set directly with a datetime.date object
        or with the .set_end_date() method."""
        return self._end_date
    @end_date.setter
    def end_date(self, end_date):
        day_num = datetime.timedelta(self._covid_data.shape[1])
        first = datetime.date(2020,1,22)
        last = first + day_num
        if (end_date - last).days >0 or (end_date - self.start_date).days< 1:
            print('End date must come after start date and be on or before the last day in the downloaded data.'+
                  '\nRun .update_data to refresh data through today.')
            self._end_date = last
        else:
            self._end_date = end_date
    @property
    def locations(self):
        """Filters data by country, province, and city. Use .set_locations() to
        select filter."""
        return self._locations
    @locations.setter
    def locations(self, locations=[[],[],[]]):
        #break locations into country, province, and city.
        countries= locations[0]
        provinces= locations[1]
        cities= locations[2]

        locs = self._loc_info.copy()
        ind = pd.Series(range(0,len(locs)))
        locs = locs.set_index(ind)
        
        #at each step, filter for rows that match the request
        if len(countries):
            locs = locs[locs['Country'].isin(countries)]
            if len(provinces):
                locs = locs[locs['Province'].isin(provinces)]
                if len(cities):
                    locs = locs[locs['City'].isin(cities)]
        self._locations = locs
    
    #Methods
    
    def set_start_date(self, year=2020,month=1,day=22):
        """Easily enter start day.
        
        Parameters
        ----------
        year : int, optional
            Start year (default is 2020).
        month : int, optional
            Start month (default is 1).
        day : int, optional
            Start day (default is 22)."""
        self.start_date = datetime.date(year,month,day)
    def set_end_date(self, year, month, day):
        """Easily enter end day.
        
        Parameters
        ----------
        year : int, required
            End year.
        month : int, required
            End month.
        day : int, required
            End day."""
        if month and day:
            self.end_time = datetime.date(year,month,day)
        else:
            self.end_time = datetime.date.today()

    def inspect(self, groupby = 'All', data_measure=['Confirmed','Deaths','Recovered','Active'], 
                measure_type=['Total','Change In', 'Percent'], period=None, 
                start_date = None, end_date = None, 
                locations = None):
        
        """Inspect uses the class filter attributes, some 
        other filters, and some other variables to return a 
        dataframe of specific Covid-19. 
        
        Parameters
        ----------
        groupby : str, optional
            Returns summary data by given group (options 
            are 'None', 'City', 'Province', 'Country', or '
            Dates', default is 'None').
        data_measure: list, optional
           List of data measures to include. By default, it 
           includes all options: ['Confirmed', 'Deaths', 
           'Recovered', 'Active']. 
        measure_type
            List of types of each measure to include. By 
            default, it includes all options: ['Total', 
            'Change In', 'Percent'], 
        period : str, optional
            Choose 'weekly' or 'daily' data (default is 
            self.current_period).
        start_date : datetime.date object, optional
            Set start date of data (default is self.start_data).
        end_date : datetime.date object, optional
            Set end date of data (default is self.end_date).
        locations: list, optional
            Enter list of length three containing lists of 
            countries, provinces, and cities to filter by. 
            It is recommend to use .set_locations() instead 
            of entering locations manually (default is self.locations).
            """ 
        if period is None:
            period=self.current_period
        if start_date is None:
            start_date = self.start_date
        if end_date is None:
            end_date = self.end_date
        if locations is None:
            locations = self.locations
        if period.lower() == 'weekly':
            inspect_arr = self._week_data
        else:
            inspect_arr = self._covid_data
        #get indices of locations, filter the inspect_arr by location
        loc_indices = self.locations.index.values.tolist()
        #get the count for start and end date, filter axis 2 (dates)
        start = (self.start_date - datetime.date(2020,1,22)).days
        end = (self.end_date - datetime.date(2020,1,22)).days
        #get the type of data measure, and match to indices
        measures = []
        measure_names = []
        for meas in data_measure:
            if meas == 'Confirmed':
                measures.append(0)
                measure_names.append(meas)
            elif meas == 'Deaths':
                measures.append(1)
                measure_names.append(meas)
            elif meas == 'Recovered':
                measures.append(2)
                measure_names.append(meas)
            elif meas == 'Active':
                measures.append(3)
                measure_names.append(meas)
        
        #get type of measure and match to indices
        types = []
        type_names = []
        for typ in measure_type:
            if typ == 'Total':
                types.append(0)
                type_names.append(typ)
            elif typ == 'Change In':
                types.append(1)
                type_names.append(typ)
            elif typ == 'Percent':
                types.append(2)
                type_names.append(typ)
        inspect_arr = inspect_arr[loc_indices]
        inspect_arr = inspect_arr[:,start:end,:,:]
        inspect_arr = inspect_arr[:,:,measures,:]
        inspect_arr = inspect_arr[:,:,:,types]
        self._current_arr = inspect_arr
        ##BUILD Dataframe
        if groupby == 'All':
            #get number of time entries (days or weeks)
            loc_ents, time_ents, measures, types = self._current_arr.shape
            #vstack first axis of array
            concat_locs = self._current_arr.reshape(loc_ents*time_ents, measures, types)
            loc_count = concat_locs.shape[0]
            #hstack last axis of array
            concat_locs = np.moveaxis(concat_locs, 0,-1)
            formatted = concat_locs.reshape(measures*types,loc_count)
            formatted = np.moveaxis(formatted,0,-1)
            #make it a dataframe
            columns = [x[0]+' '+x[1] for x in list(itertools.product(measure_names, type_names))]
            df = pd.DataFrame(formatted, columns=columns)
            sep = np.array_split(df, len(self.locations))

            #function that adds loc data to sep DFs, vectorize, run
            #add_locs = np.vectorize(self.add_locs)
            locationsnow = self.locations
            locationsnow = locationsnow.set_index(np.arange(0,len(locationsnow)))
            all_dfs = self.add_locs(sep, locationsnow)
            self._current_df = pd.concat(all_dfs)
            return self._current_df
    
    def add_locs(self,dfs_arr, loc_info):
        for index, df in enumerate(dfs_arr):
            df['Country'] = loc_info['Country'].iloc[index]
            df['Province'] = loc_info['Province'].iloc[index]
            df['City'] = loc_info['City'].iloc[index]
            df['Lat'] = loc_info['Lat'].iloc[index]
            df['Lon'] = loc_info['Lon'].iloc[index]
            df['Date'] = self.start_date + datetime.timedelta(index)
        return dfs_arr
    
    def set_locations(self):
        print("""This process will filter Covid Data by entering 
countries, provinces, and cities of interest.
We'll start by entering countries. 
        
        """)
        #setup categories for string formatting
        cats = [('country', 'countries'), ('province', 'provinces'), ('city','cities')]
        loc_collect = [[],[],[]]
        for index, cat in enumerate(cats):
            sing = cat[0]
            plur = cat[1]
            
            #start with all locations, then filter based on user's previous entries.
            if index == 0:
                cur_locs = self._loc_info
            else:
                cur_locs = cur_locs.loc[cur_locs[cats[index-1][0].capitalize()].isin(loc_collect[index-1])]
             
            #make a list of the current location optics, variable for user response, and response collector
            all_list = cur_locs[sing.capitalize()].unique().tolist()
            response = ''
            loc_filter = []
            #format massages to user
            error = """That did not match a {} in the location list.
            Please enter a country, 'All', 'list [search string]', 'help', or 'done'.""".format(sing)
            
            instructions = """
Let's enter {} of interest.

You can enter 'list' to see all {}. 
If you add a space and a string after 'list', I'll return any 
{} whose name contains that string.

Type in a {} name and hit enter, and I'll add it to our filter.

If you enter 'All', I'll add all {} to the filter.

Type in 'Done' and I'll move forward.""".format(plur,plur,sing,sing,plur)
            
            print(instructions)
            
            #loop on location request 
            while not response.lower() == 'done':
                response = input("Enter a {} (enter 'help' for instructions) ".format(sing))
                if response[0:4].lower() == 'list':
                    if len(response) > 5:
                        search = response[5:].lower()
                        matches = [loc for loc in all_list if search in loc.lower()]
                        print(matches)
                    else:
                        print(all_list)
                elif response.lower() == 'help':
                    print(instructions)
                elif response.lower() == 'exit':
                    return
                elif response.lower() == 'done':
                    pass
                elif response.lower() == 'all':
                    loc_collect[index] = all_list
                    response = 'done'
                elif response in all_list:
                    loc_collect[index].append(response)
                else:
                    print(error)
        #if the user entered no locations, add all locations
        if not loc_collect[index]:
            loc_collect[index] = all_list
            
        self.locations = loc_collect
        
    def update_data(self):
        """Get fresh pull from covid19api, update JSON file, update live data object"""
        proceed = input('This will download 10MB of data and should be used infrequently. '+
                       '\nAre you sure you want to proceed? (Y/N) : ')
        if proceed in ['y','Y','Yes','yes']:
            print('This will take some time.')
            file = self._get_new_data()
            self._initial_setup(file)
    def save(self):
        """Save current covid data into two files for quick access next time."""
        self._loc_info.to_pickle('location_info.pkl')
        np.save('covid_data.npy',self._covid_data)
    def load(self, file1, file2):
        """Load data from saved session for quick access
        Parameters
        ----------
        file1 : str , file name for the .pkl data
        file2 : str , file name for the .npy data
        
        Raises
        ------
        NameError
            Wrong file type given for file1 or file2"""
        if file1[-3:] == 'pkl' and file2[-3:] == 'npy':
            self._loc_info = pd.read_pickle(file1)
            self._covid_data = np.load(file2)
        else:
            raise NameError('File1 must be .pkl and File2 must be .npy')
                  


In [None]:
# Example session: build a Covid_Data object, filter it, and display the result.
#
# The commented-out line below downloads fresh data from the API. The API was
# timing out when this was last tested, so it may or may not work.

#covid = Covid_Data('New')

# Build the object from a previously downloaded covid_data.json instead.
# That file is missing the last several weeks of data.
covid = Covid_Data("covid_data.json")
# Location filter as [countries, provinces, cities]; the empty city list
# leaves cities unfiltered.
covid.locations=[['United States of America'],['Maryland'],[]]

# Alternatively, build the location filter interactively:
#covid.set_locations()

# Request April 2020 as the date window, then build the summary dataframe.
covid.set_start_date(2020,4,1)
covid.set_end_date(2020,4,30)
df = covid.inspect()
# A bare name on the last line of a notebook cell displays the dataframe.
df