In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

## Reading the Covid, Zika and Dengue datasets here

In [6]:
import pandas as pd

# this is the link to the COVID-19 data
covid_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
covid_death_url_global = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'

raw_cases_df = pd.read_csv(covid_url, index_col=0, dtype='object')
raw_deaths_df = pd.read_csv(covid_death_url_global, index_col=0, dtype='object')

def clean_covid_dataframe(aDf, final_column_name):

    raw_df = aDf.copy()
    raw_df = raw_df.reset_index()
    temp_df = raw_df.rename(columns = {'Province/State': 'state',
                                       'Country/Region': 'country',
                                       'Lat': 'lat',
                                       'Long': 'long'
                                      })

    # removing the latitude and the logitude columns as they are not required
    temp_df = temp_df.drop(columns = ['lat', 'long'])

    # unpivoting the table to see the cases per country and the date
    df = pd.melt(temp_df, id_vars = ['country','state'], var_name='date', value_name = final_column_name)
    df.date = pd.to_datetime(df.date)
    df[final_column_name] = pd.to_numeric(df[final_column_name])

    return df

cases = clean_covid_dataframe(raw_cases_df, final_column_name ='no_cases')
deaths = clean_covid_dataframe(raw_deaths_df, final_column_name ='no_deaths')

covid = cases.set_index(['country','state','date']).join(
    deaths.set_index(['country','state','date']),
    how='left').reset_index()

covid['year'] = covid.date.dt.year
covid['month'] = covid.date.dt.month

covid.to_csv('Datasets/covid_19_data.csv.gz', index=False)


In [43]:
covid_data = pd.read_csv('Datasets/covid_19_data.csv.gz',parse_dates=[2])
dengue_data = pd.read_csv('Datasets/dengue_data.csv.gz',parse_dates={'date' : [14]})
zika_data = pd.read_csv('Datasets/zika16to18.csv',parse_dates=[0])
malaria_data = pd.read_csv('Datasets/malaria_data.csv.gz',parse_dates={'Date' : [2, 15, 16]})

In [37]:
zika_data.columns = ['_'.join(c.lower().split(' ')) for c in zika_data.columns]
zika_data.columns = ['_'.join(c.lower().split(' ')) for c in zika_data.columns]
zika_data = zika_data.rename(columns= {'noncumulative_confirmed': 'no_cases',
                                       'noncumulative_zika_case_deaths': 'no_deaths',
                                       'confirmed': 'cumulative_cases',
                                       'zika_case_deaths': 'cumulative_deaths',
                                       'country/territory': 'country'})

# can delete now within read csv
    # dengue_data['date'] = pd.to_datetime(dengue_data.epi_week_start_date)
    # covid_data['date'] = pd.to_datetime(covid_data.date)
    # zika_data['date'] = pd.to_datetime(zika_data.date)
    # malaria_data['date'] = pd.to_datetime(malaria_data.date)

### Adding the disease and transmission_mode columns

In [38]:
covid_data['disease'] = 'covid-19'
dengue_data['disease'] = 'dengue'
zika_data['disease'] = 'zika'
malaria_data['disease']='malaria'

covid_data['transmission_mode'] = 'air'
dengue_data['transmission_mode'] = 'mosquito'
zika_data['transmission_mode'] = 'mosquito'
malaria_data['transmission_mode']='mosquito'


### Checking if the must columns are present in the datasets are not

In [39]:
must_columns = ['country','date','year','month','day','transmission_mode',
                'no_cases','no_deaths','cumulative_cases','cumulative_deaths','disease']

assert pd.Series(must_columns).isin(covid_data.columns).all()
assert pd.Series(must_columns).isin(dengue_data.columns).all()
assert pd.Series(must_columns).isin(zika_data.columns).all()
assert pd.Series(must_columns).isin(malaria_data.columns).all()

AssertionError: 

In [31]:
# may be doing with the week number 
# clean up the country column
# Merge the malaria data
    ## Instead of the spikes by country and we just cateorize it by the continent - that way we only have the 7 thingst to  manupulate
    
# weekly counts is safer

In [51]:
zika_data

Unnamed: 0,Date,Country/Territory,Suspected,Confirmed,Imported Cases,Incidence Rate,Zika Case Deaths,Zika Congenital Syndrome,Population x1000e,Noncumulative Suspected,Noncumulative Confirmed,Noncumulative Zika Case Deaths,Noncumulative Zika Congenital Syndrome
0,2016-11-17,Paraguay,546,12,0.0,8.297398,0.0,2.0,6725,,,,
1,2016-11-17,Uruguay,0,0,1.0,0.000000,0.0,0.0,344,,,,
2,2016-11-17,Subtotal,2367,38,57.0,3.483084,0.0,3.0,69048,,,,
3,2016-11-17,Anguilla,40,5,1.0,264.705882,0.0,0.0,17,,,,
4,2016-11-17,Antigua and Barbuda,393,14,2.0,432.978723,0.0,0.0,94,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,2017-12-21,Subtotal,172905,17073,41.0,136.072772,0.0,276.0,139615,82718.0,-25163.0,-5.0,131.0
3654,2017-12-21,Brazil,231725,137288,0.0,176.095308,11.0,2952.0,209553,0.0,0.0,0.0,0.0
3655,2017-12-21,Argentina,539,278,41.0,1.854290,0.0,5.0,44060,0.0,0.0,0.0,0.0
3656,2017-12-21,Canada,0,0,544.0,0.000000,0.0,1.0,36284,0.0,0.0,0.0,0.0


In [52]:
covid_data['week'] = covid_data['date'].dt.week
covid_data['year'] = covid_data['date'].dt.year
dengue_data['week'] = dengue_data['date'].dt.week
covid_data['year'] = covid_data['date'].dt.year
malaria_data['week'] = malaria_data['Date'].dt.week
malaria_data['year'] = malaria_data['Date'].dt.year
zika_data['week'] = zika_data['Date'].dt.week
zika_data['year'] = zika_data['Date'].dt.year

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until
  """
  import sys


In [54]:
df = pd.concat([covid_data[must_columns],
                zika_data[must_columns],
                dengue_data[must_columns],
               malaria_data[must_columns]])

df['week'] = df.date.dt.week

KeyError: "['disease', 'transmission_mode', 'cumulative_deaths', 'day', 'cumulative_cases'] not in index"

In [33]:
df.head()

Unnamed: 0,country,date,year,month,day,transmission_mode,no_cases,no_deaths,cumulative_cases,cumulative_deaths,disease,week
0,Afghanistan,2020-01-22,2020,1,22,air,0.0,0.0,0.0,0.0,covid-19,4
1,Afghanistan,2020-01-23,2020,1,23,air,0.0,0.0,0.0,0.0,covid-19,4
2,Afghanistan,2020-01-24,2020,1,24,air,0.0,0.0,0.0,0.0,covid-19,4
3,Afghanistan,2020-01-25,2020,1,25,air,0.0,0.0,0.0,0.0,covid-19,4
4,Afghanistan,2020-01-26,2020,1,26,air,0.0,0.0,0.0,0.0,covid-19,4
