## Table of Contents
[00. Set up Connection](#00.-Set-up-Connection)

[01. Load Data](#01.-Load-Data)

[02. Study Data](#02.-Study-Data)


In [10]:
import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role

## 00. Set up Connection

In [2]:
# Create a boto3 S3 resource handle.
# NOTE(review): `s3` is not referenced again in the visible cells — confirm it is still needed.
s3 = boto3.resource('s3')
# List the available S3 buckets through the AWS CLI (IPython shell escape).
! aws s3 ls

2020-07-27 08:25:21 wwcode-covid-datasets


In [3]:
# Recursively list every object stored in the `wwcode-covid-datasets` bucket
# (IPython shell escape calling the AWS CLI).
! aws s3 ls wwcode-covid-datasets/ --recursive

2020-07-27 11:15:54    4702273 apple-mobility-trends-reports/dataset/apple-mobility-trends-reports.csv
2020-07-27 08:25:45    3180014 coronadatascraper/202007270201/coronadatascraper-timeseries__202007270201__202007270201.csv.gz


## 01. Load Data

In [4]:
def load_data_from_s3(bucket, data_key):
    """Read a CSV object from S3 into a pandas DataFrame.

    Args:
        bucket: Name of the S3 bucket holding the object.
        data_key: Key (path inside the bucket) of the CSV object.

    Returns:
        Whatever ``pd.read_csv`` returns for ``s3://<bucket>/<data_key>``.
    """
    uri_template = 's3://{}/{}'
    return pd.read_csv(uri_template.format(bucket, data_key))

In [5]:
# Fetch the SageMaker execution role.
# NOTE(review): `role` is not used by any visible cell — confirm it is needed.
role = get_execution_role()
# Bucket that holds both datasets loaded below.
bucket='wwcode-covid-datasets'

# COVID time-series figures (the key points at a .csv.gz object).
data_key = 'coronadatascraper/202007270201/coronadatascraper-timeseries__202007270201__202007270201.csv.gz'
covid_df = load_data_from_s3(bucket, data_key)

# Apple mobility trend figures; note that `data_key` is reassigned here,
# so after this cell it refers to the mobility file, not the COVID file.
data_key = 'apple-mobility-trends-reports/dataset/apple-mobility-trends-reports.csv'
mobility_df = load_data_from_s3(bucket, data_key)

  if (await self.run_code(code, result,  async_=asy)):


In [6]:
# Preview the first rows of the COVID time-series frame.
covid_df.head()

Unnamed: 0,name,level,city,county,state,country,population,lat,long,url,...,tested,hospitalized,hospitalized_current,discharged,icu,icu_current,growthfactor,date,last_updated_at,state_fips
0,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,,2020-01-22,2020-07-27T02:01:25,
1,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,1.0,2020-01-23,2020-07-27T02:01:25,
2,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,1.0,2020-01-24,2020-07-27T02:01:25,
3,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,1.0,2020-01-25,2020-07-27T02:01:25,
4,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,1.0,2020-01-26,2020-07-27T02:01:25,


In [7]:
# Preview the first rows of the mobility frame (one column per date).
mobility_df.head()

Unnamed: 0,geo_type,region,transportation_type,alternative_name,sub-region,country,2020-01-13,2020-01-14,2020-01-15,2020-01-16,...,2020-07-16,2020-07-17,2020-07-18,2020-07-19,2020-07-20,2020-07-21,2020-07-22,2020-07-23,2020-07-24,2020-07-25
0,country/region,Albania,driving,,,,100.0,95.3,101.43,97.2,...,153.0,169.21,178.32,185.37,166.63,162.41,164.63,169.36,190.83,206.45
1,country/region,Albania,walking,,,,100.0,100.68,98.93,98.46,...,137.78,128.27,137.61,107.59,136.33,127.42,128.96,131.3,133.39,137.95
2,country/region,Argentina,driving,,,,100.0,97.07,102.45,111.21,...,54.05,65.19,63.43,41.21,66.69,54.61,55.66,57.26,67.02,63.7
3,country/region,Argentina,walking,,,,100.0,95.11,101.37,112.67,...,39.18,47.81,43.67,32.06,50.69,43.16,41.49,42.23,49.1,46.25
4,country/region,Australia,driving,AU,,,100.0,102.98,104.21,108.63,...,106.12,103.27,84.93,88.73,89.78,92.33,93.33,98.55,98.5,75.84


In [12]:
# Sanity check: is the '2020-01-13' date column of the mobility frame a numeric dtype?
np.issubdtype(mobility_df['2020-01-13'].dtype, np.number)

True

In [8]:
# Show the distinct `geo_type` values present in the mobility frame.
# NOTE(review): the recorded output below is a 3625-row boolean Series, which
# does not look like the result of `drop_duplicates()` — it appears stale;
# re-run this cell to refresh it.
mobility_df['geo_type'].drop_duplicates()

0       False
1       False
2       False
3       False
4       False
        ...  
3620    False
3621    False
3622    False
3623    False
3624    False
Name: geo_type, Length: 3625, dtype: bool

## 02. Study Data

In [15]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

def get_data_info_by_columns(data, top_n=10):
    """Plot a quick visual summary of every column of a DataFrame.

    For each column a figure is drawn and shown:

    * numeric columns get a distribution plot next to a count plot of the
      ``top_n`` most frequent values;
    * all other columns get a count plot of the ``top_n`` most frequent
      values only.

    Args:
        data: pandas DataFrame whose columns are summarised.
        top_n: Maximum number of distinct values drawn in a count plot.
            Defaults to 10. Capping this keeps high-cardinality columns
            (e.g. free-text names) from producing one bar per distinct value.

    Returns:
        None. Figures are displayed with ``plt.show()``.
    """
    for col in data.columns:
        # str.format instead of '+' so non-string column labels do not raise.
        print('Displaying info for column "{}"...'.format(col))

        # Index with data[col]; attribute access (data.col) would look up a
        # column literally named "col".
        values = data[col].dropna()
        if values.empty:
            # Nothing to plot for an all-null column.
            print('Column "{}" has no non-null values; skipping plot.'.format(col))
            continue

        # value_counts() sorts by frequency, so the head is the most common values.
        top_values = values.value_counts().iloc[:top_n].index

        if np.issubdtype(values.dtype, np.number):
            fig, (dist_ax, count_ax) = plt.subplots(1, 2, figsize=(12, 4))
            # distplot takes the values positionally and has no `data=` keyword.
            # A KDE is only requested when the column is not constant.
            sns.distplot(values, kde=values.nunique() > 1, ax=dist_ax)
            sns.countplot(x=values, order=top_values, ax=count_ax)
        else:
            fig, count_ax = plt.subplots()
            sns.countplot(x=values, order=top_values, ax=count_ax)

        fig.suptitle(str(col))
        plt.show()
        # Release the figure so looping over many columns does not accumulate memory.
        plt.close(fig)
            
# Plot a summary of every column of the COVID frame.
# NOTE(review): the recorded run below was stopped with a KeyboardInterrupt
# while processing the "name" column.
get_data_info_by_columns(covid_df)

Displaying info for column "name"...


KeyboardInterrupt: 