In [1]:
import urllib.request 
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import datetime

In [2]:
## Download and unzip COVID cases
temporary_folder = os.path.join(os.getcwd(), 'tmp')
if not os.path.exists(temporary_folder):
    os.makedirs(temporary_folder)
    
remote_file_name = "https://github.com/CSSEGISandData/COVID-19/archive/master.zip"
base_name = os.path.basename(remote_file_name)
local_file_name = os.path.join(temporary_folder, base_name)

urllib.request.urlretrieve(remote_file_name, local_file_name)

data_folder = os.path.join(os.getcwd(), 'world_data') 
if not os.path.exists(data_folder):
    os.makedirs(data_folder)
    
with zipfile.ZipFile(local_file_name, 'r') as zip_ref:
    zip_ref.extractall(data_folder)
    
#!unzip master.zip

In [3]:
## Download world population
remote_file_name = "http://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=excel"
world_population_file = os.path.join(temporary_folder, "world_population.xls")

# The following statement is throwing an error but the download is being performed
urllib.request.urlretrieve(remote_file_name, world_population_file) 

('/Users/wcunha69/Documents/projects/COVID-19/tmp/world_population.xls',
 <http.client.HTTPMessage at 0x1a23d1d6d0>)

In [4]:
## Load world population DataFrame
world_population = pd.read_excel(world_population_file, sheet_name="Data", dtype=object, skiprows=3)
world_population = world_population[["Country Name", "Country Code", "2018"]]
world_population = world_population.rename(
    {"Country Name":"country","Country Code":"country_code", "2018":"population"}, 
    axis='columns') 

world_population['population'] = world_population['population'].fillna(0)
world_population['population'] = world_population['population'].astype('int')
print(world_population)


          country country_code  population
0           Aruba          ABW      105845
1     Afghanistan          AFG    37172386
2          Angola          AGO    30809762
3         Albania          ALB     2866376
4         Andorra          AND       77006
..            ...          ...         ...
259        Kosovo          XKX     1845300
260   Yemen, Rep.          YEM    28498687
261  South Africa          ZAF    57779622
262        Zambia          ZMB    17351822
263      Zimbabwe          ZWE    14439018

[264 rows x 3 columns]


In [5]:
## Load COVID cases 
covid_folder = os.path.join(data_folder, 'COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/')

li = []
for covid_file in os.listdir(covid_folder):
    if covid_file.find(".csv") > -1:
        df = pd.read_csv(os.path.join(covid_folder, covid_file), index_col=None, header=0)
        li.append(df)

covid_cases = pd.concat(li, axis=0, ignore_index=True)
covid_cases = covid_cases.rename(
    {
        "Province/State":"province_state",
        "Country/Region":"country", 
        "Last Update":"last_update",
        "Confirmed":"confirmed",
        "Deaths":"deaths",
        "Recovered":"recovered",
        "Latitude":"latitude",
        "Longitude":"longitude"
    }, axis='columns'
) 


print(covid_cases)
#print(recent)
#covid = pd.read_csv(covid)
#print(covid)

                            province_state         country  \
0                                    Hubei  Mainland China   
1                                Guangdong  Mainland China   
2                                    Henan  Mainland China   
3                                      NaN     South Korea   
4                                 Zhejiang  Mainland China   
...                                    ...             ...   
7921                    Northern Territory       Australia   
7922  Lackland, TX (From Diamond Princess)              US   
7923                 Montgomery County, TX              US   
7924     Omaha, NE (From Diamond Princess)              US   
7925    Travis, CA (From Diamond Princess)              US   

              last_update  confirmed  deaths  recovered  latitude  longitude  
0     2020-02-26T14:13:10    65187.0  2615.0    20969.0       NaN        NaN  
1     2020-02-26T10:33:02     1347.0     7.0      851.0       NaN        NaN  
2     2020-02-26T1

In [6]:
## Adjusting country names and merging dataframes
world_population = world_population.replace("Korea, Rep.", "South Korea")
world_population = world_population.replace("Hong Kong SAR, China", "Hong Kong")
world_population = world_population.replace("Iran, Islamic Rep.", "Iran")
world_population = world_population.replace("Russian Federation", "Russia")
world_population = world_population.replace("Macao SAR, China", "Macau")
world_population = world_population.replace("Venezuela, RB", "Venezuela")
world_population = world_population.replace("Bahamas, The", "The Bahamas")
world_population = world_population.replace("Egypt, Arab Rep.", "Egypt")
world_population = world_population.replace("Slovak Republic", "Slovakia")
covid_cases = covid_cases.replace("Mainland China", "China")
covid_cases = covid_cases.replace("US", "United States")
covid_cases = covid_cases.replace("UK", "United Kingdom")
covid_cases = covid_cases.replace("Iran (Islamic Republic of)", "Iran")
covid_cases = covid_cases.replace("Russian Federation", "Russia")
covid_cases = covid_cases.replace("Republic of Korea", "South Korea")
covid_cases = covid_cases.replace("Hong Kong SAR", "Hong Kong")
covid_cases = covid_cases.replace("Viet Nam", "Vietnam")
covid_cases = covid_cases.replace("Macao SAR", "Macau")
covid_cases = covid_cases.replace("Korea, South", "South Korea")
covid_cases = covid_cases.replace("North Ireland", "United Kingdom")
covid_cases = covid_cases.replace("Republic of Ireland", "Ireland")

covid_cases_pop = pd.merge(covid_cases, world_population, how='left', on='country')
print(((covid_cases_pop[covid_cases_pop['population'].isnull()])["country"]).drop_duplicates())

9                                 Others
39                                Taiwan
421                  Taipei and environs
436       occupied Palestinian territory
490                        French Guiana
506                  Republic of Moldova
516                           Martinique
518                         Saint Martin
527                               Brunei
531                             Holy See
541                     Saint Barthelemy
577                          Cruise Ship
616                              Czechia
634                              Taiwan*
747                     Congo (Kinshasa)
757                              Reunion
992                            Palestine
1144                        Vatican City
2345                          Guadeloupe
2399                          Kyrgyzstan
2405                             Mayotte
2416                        Bahamas, The
2424                 Congo (Brazzaville)
2446                         Saint Lucia
2462    Saint Vi

In [7]:
## Remove countries with no population 
covid_cases_pop['population'] = covid_cases_pop['population'].fillna(0)
covid_cases_pop = covid_cases_pop.loc[covid_cases_pop['population'] != 0]
covid_cases_pop['population'] = covid_cases_pop['population'].astype('int')

covid_cases_aggr = covid_cases_pop[["country", "last_update", "population", "confirmed", "deaths"]]
covid_cases_aggr["last_update"] = pd.to_datetime(covid_cases_aggr["last_update"])
covid_cases_aggr["last_update"] = covid_cases_aggr["last_update"].dt.date




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
## Aggregate by country, date and population
covid_cases_aggr = covid_cases_aggr.groupby(["country", "last_update", "population"]).sum()
covid_cases_aggr = covid_cases_aggr.reset_index()

In [9]:
days_difference = covid_cases_aggr["last_update"].max() - covid_cases_aggr["last_update"].min()
max_days = days_difference.days + 4

countries = ((covid_cases_aggr[covid_cases_aggr["country"] == "Brazil"])["country"]).drop_duplicates()

li = []

for country in countries:
    country_cases = covid_cases_aggr[covid_cases_aggr["country"] == country]
    country_cases = country_cases[country_cases["confirmed"] > 0]
    country_dates = pd.date_range(start=country_cases.last_update.min(), end=covid_cases_aggr.last_update.max())
    country_cases = country_cases.set_index('last_update').reindex(country_dates).fillna(method='pad').rename_axis('last_update').reset_index()
    country_cases = country_cases.reset_index()
    country_cases.rename(columns = {'index':'days_since_first_case'}, inplace = True) 
    country_cases["type"] = "actuals"
    
    
    #x = (country_cases["days_since_first_case"]).to_numpy().reshape((-1, 1))
    #y = (country_cases["confirmed"]).to_numpy().reshape((-1, 1))
    x = np.array((country_cases["days_since_first_case"]))
    y = np.array((country_cases["confirmed"]))
    confirmed_cases_pred = np.polyfit(x, np.log(y), 1)
    
    population = int((country_cases["population"]).tail(1))
    death_rate = float((country_cases["deaths"].tail(1) * 1.0000) / (country_cases["confirmed"].tail(1) * 1.0000))
    for i in range(country_cases["days_since_first_case"].max() + 1 , country_cases["days_since_first_case"].max() + 6):
        confirmed = np.exp(confirmed_cases_pred[1]) * np.exp(confirmed_cases_pred[0] * i) 
        deaths = confirmed * death_rate
        row = {
            "days_since_first_case": i,
            "last_update": (country_cases["last_update"].min() + datetime.timedelta(days=i)),
            "country": country,
            "population": population,
            "confirmed": round(confirmed),
            "deaths": round(deaths),
            "type": "forecast"
        }
        country_cases = country_cases.append(row, ignore_index=True)
        #print(predicted_row)
        #y_pred = np.exp(curve_fit[1]) * np.exp(curve_fit[0] * i)
        #print(y_pred)
        #print(row)
    
    #print(country_cases)
    
    #print(curve_fit[0], curve_fit[1])
    #
    #print(y_pred, y)
    li.append(country_cases)

covid_cases_final = pd.concat(li, axis=0, ignore_index=True)
print(covid_cases_final)


#print(covid_cases_aggr["population"])

    days_since_first_case last_update country   population  confirmed  deaths  \
0                       0  2020-02-26  Brazil  209469333.0        3.0     0.0   
1                       1  2020-02-27  Brazil  209469333.0        3.0     0.0   
2                       2  2020-02-28  Brazil  209469333.0        3.0     0.0   
3                       3  2020-02-29  Brazil  209469333.0        8.0     0.0   
4                       4  2020-03-01  Brazil  209469333.0        8.0     0.0   
5                       5  2020-03-02  Brazil  209469333.0        8.0     0.0   
6                       6  2020-03-03  Brazil  209469333.0        8.0     0.0   
7                       7  2020-03-04  Brazil  209469333.0        8.0     0.0   
8                       8  2020-03-05  Brazil  209469333.0        8.0     0.0   
9                       9  2020-03-06  Brazil  209469333.0       26.0     0.0   
10                     10  2020-03-07  Brazil  209469333.0       26.0     0.0   
11                     11  2

In [10]:
## Add rate columns
covid_cases_final["cases_per_million"] = covid_cases_final["confirmed"] / (covid_cases_final["population"] / 1000000)
covid_cases_final["fatalities_per_million"] = covid_cases_final["deaths"] / (covid_cases_final["population"] / 1000000)
covid_cases_final["fatalities_per_cases"] = covid_cases_final["deaths"] / covid_cases_final["confirmed"]
print(covid_cases_final)

    days_since_first_case last_update country   population  confirmed  deaths  \
0                       0  2020-02-26  Brazil  209469333.0        3.0     0.0   
1                       1  2020-02-27  Brazil  209469333.0        3.0     0.0   
2                       2  2020-02-28  Brazil  209469333.0        3.0     0.0   
3                       3  2020-02-29  Brazil  209469333.0        8.0     0.0   
4                       4  2020-03-01  Brazil  209469333.0        8.0     0.0   
5                       5  2020-03-02  Brazil  209469333.0        8.0     0.0   
6                       6  2020-03-03  Brazil  209469333.0        8.0     0.0   
7                       7  2020-03-04  Brazil  209469333.0        8.0     0.0   
8                       8  2020-03-05  Brazil  209469333.0        8.0     0.0   
9                       9  2020-03-06  Brazil  209469333.0       26.0     0.0   
10                     10  2020-03-07  Brazil  209469333.0       26.0     0.0   
11                     11  2