# Visualization of Tokyo infections

Here I download data on the number of infections in Tokyo and combine it with the patients' ages and the ratio of untracked cases to make a more informative plot.

In [1]:
## import libs

import json
import pandas as pd
import plotly.express as px

The age-related daily infection data is obtained from the following link:

https://catalog.data.metro.tokyo.lg.jp/dataset/t000010d0000000068/resource/c2d997db-1450-43fa-8037-ebb11ec28d4c

The daily information on tracked and untracked cases is obtained from the following link:

https://github.com/tokyo-metropolitan-gov/covid19/tree/development/data

In [2]:
## load the age-related patient data from the CSV file in the working directory
covid_data_1 = pd.read_csv(filepath_or_buffer='130001_tokyo_covid19_patients.csv')

In [3]:
## the column names are in Japanese, but the age column ('患者_年代') is easy to spot
## (keys() on a DataFrame returns its column Index)
covid_data_1.keys()

Index(['No', '全国地方公共団体コード', '都道府県名', '市区町村名', '公表_年月日', '曜日', '発症_年月日',
       '患者_居住地', '患者_年代', '患者_性別', '患者_属性', '患者_状態', '患者_症状', '患者_渡航歴の有無フラグ',
       '備考', '退院済フラグ'],
      dtype='object')

In [4]:
## pull the leading run of digits out of the age-bracket label ('患者_年代').
## Slicing the first two characters would truncate a three-digit label such as
## '100歳以上' to '10' and raises TypeError on a missing (NaN) entry; the regex keeps
## every leading digit, and labels without leading digits (or NaN) become ''.
## The result stays a string column so the next cell's isdigit() check still applies.
## NOTE(review): a label like '10歳未満' (under 10) would still yield '10' — confirm
## whether such labels occur in the data and how they should be counted.
covid_data_1['Age'] = (
    covid_data_1['患者_年代']
    .str.extract(r'^(\d+)', expand=False)
    .fillna('')
)

In [5]:
## convert the age strings to integers; entries that are not purely digits
## (e.g. '不明' or '-') become 0, which serves as the "age unknown" marker.
## Rows carrying that marker are filtered out in a later cell.
covid_data_1['Age'] = [
    int(age_text) if age_text.isdigit() else 0
    for age_text in covid_data_1['Age']
]

In [6]:
## inspect the rows without age information (Age == 0) — there are only a few
covid_data_1[covid_data_1['Age'] == 0]

Unnamed: 0,No,全国地方公共団体コード,都道府県名,市区町村名,公表_年月日,曜日,発症_年月日,患者_居住地,患者_年代,患者_性別,患者_属性,患者_状態,患者_症状,患者_渡航歴の有無フラグ,備考,退院済フラグ,Age
2245,2187,130001,東京都,,2020-04-14,火,,都内,不明,男性,,,,,,1.0,0
2828,2769,130001,東京都,,2020-04-17,金,,都内,-,女性,,,,,,1.0,0
2839,2781,130001,東京都,,2020-04-17,金,,都内,不明,男性,,,,,,1.0,0
3662,3601,130001,東京都,,2020-04-24,金,,-,-,-,,,,,,1.0,0
3755,3695,130001,東京都,,2020-04-24,金,,都内,不明,女性,,,,,,1.0,0
3877,3807,130001,東京都,,2020-04-25,土,,都内,不明,男性,,,,,,1.0,0
4451,4339,130001,東京都,,2020-05-02,土,,都内,不明,女性,,,,,,1.0,0
4534,4425,130001,東京都,,2020-05-02,土,,都内,不明,女性,,,,,,1.0,0


In [7]:
## keep only the rows with a known age (Age != 0).
## .copy() makes the filtered result an independent DataFrame, so the column
## assignments in the following cells write to it directly instead of raising
## pandas' SettingWithCopyWarning for assigning into a filtered slice.
covid_data_1 = covid_data_1.loc[covid_data_1['Age'] != 0].copy()

In [8]:
## parse the '公表_年月日' date strings into a proper datetime column
covid_data_1['Date'] = covid_data_1['公表_年月日'].pipe(pd.to_datetime)

In [9]:
## split the date into day-of-month and month columns using the vectorized
## .dt accessor instead of looping over every timestamp in Python.
## NOTE: the integer width of the resulting columns depends on the pandas version.
covid_data_1['Day'] = covid_data_1['Date'].dt.day
covid_data_1['Month'] = covid_data_1['Date'].dt.month

In [10]:
## average age of the infected people for each day, indexed by (Month, Day)
## NOTE(review): the key carries no year, so identical month/day pairs from
## different years would be averaged together — fine only while the data
## covers a single year; confirm.
covid_data_avg_age = covid_data_1.groupby(['Month', 'Day'])[['Age']].mean()

In [11]:
## preview the first three daily averages
covid_data_avg_age.iloc[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Month,Day,Unnamed: 2_level_1
1,24,40.0
1,25,30.0
1,30,30.0


Work on the second dataset

In [12]:
## load the second source: the JSON file with the daily tracked/untracked counts.
## JSON text is UTF-8, so the encoding is stated explicitly instead of relying on
## the platform's default text encoding.
## NOTE(review): the path is relative to the working directory and points into a
## Desktop folder — adjust it to wherever the file was downloaded.
with open('../../Desktop/corona_japan.json', "r", encoding="utf-8") as fd:
    covid_data = json.load(fd)

In [13]:
## build one row per entry of covid_data['data'], keeping the diagnosis date and
## the total / untracked / tracked counts under shorter column names
df = pd.DataFrame({
    column: [record[key] for record in covid_data['data']]
    for column, key in (
        ('Dates', 'diagnosed_date'),
        ('Total', 'count'),
        ('Untracked', 'missing_count'),
        ('Tracked', 'reported_count'),
    )
})

In [14]:
## derive day-of-month and month from the date strings: parse the column once,
## reuse the parsed result for both fields, and read them through the vectorized
## .dt accessor rather than a Python-level loop.
## NOTE: the integer width of the resulting columns depends on the pandas version.
dates_parsed = pd.to_datetime(df['Dates'])
df['Day'] = dates_parsed.dt.day
df['Month'] = dates_parsed.dt.month

In [15]:
## ratio of untracked infections to the total infections of the day
df['Ratio'] = df['Untracked'].div(df['Total'])

In [16]:
## index the frame by (Month, Day) so it can be merged with the average-age
## frame on its index. The result is rebound to `df` rather than mutated with
## inplace=True, which keeps the step an ordinary expression and avoids
## in-place mutation of a frame earlier cells have already used.
df = df.set_index(['Month', 'Day'])

In [17]:
## preview the last three days of the tracking data
df.iloc[-3:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Dates,Total,Untracked,Tracked,Ratio
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8,12,2020-08-12,222,149.0,73.0,0.671171
8,13,2020-08-13,206,128.0,78.0,0.621359
8,14,2020-08-14,389,242.0,147.0,0.622108


In [18]:
## combine the daily average age with the daily counts by matching the
## (Month, Day) index of both frames (default inner join: only days present in both)
df_merged = covid_data_avg_age.merge(df, left_index=True, right_index=True)

In [19]:
## discard days with any missing value (e.g. no untracked/tracked count, hence no
## Ratio); the cleaned frame is rebound instead of mutated with inplace=True
df_merged = df_merged.dropna()
df_merged.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Dates,Total,Untracked,Tracked,Ratio
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8,12,33.918919,2020-08-12,222,149.0,73.0,0.671171
8,13,34.126214,2020-08-13,206,128.0,78.0,0.621359
8,14,34.421594,2020-08-14,389,242.0,147.0,0.622108


In [None]:
### scatter plot: one marker per day — x = date, y = daily infections,
### colour = average age, marker size = ratio of untracked cases

fig = px.scatter(
    df_merged,
    x="Dates",
    y="Total",
    size='Ratio',
    color="Age",
    labels=dict(
        Total="Infections",
        Age="Average age",
        Ratio='Ratio of untracked cases',
    ),
    height=500,
    width=1000,
)

## centre the title horizontally, anchored near the top of the figure
fig.update_layout(
    title=dict(
        text="Tokyo Infections",
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
    )
)

## display the figure, then save a standalone HTML copy
fig.show()
fig.write_html('tokyo_infections_.html')
