In [4]:
import re
import json
import requests
import datetime
import pandas as pd
from googletrans import Translator
from sqlalchemy import create_engine
# from collections import defaultdict

In [5]:
url = 'https://3g.dxy.cn/newh5/view/pneumonia'

* ### Get the China Province and City Data (just for preview)

In [6]:
# use regular expression to get the html infomation
raw_html = requests.get(url).content.decode('utf8')
match = re.search('window.getAreaStat = (.*?)}catch', raw_html)
raw_json = match.group(1)
result1 = json.loads(raw_json, encoding='utf8')

## Part 1: Get the World Wide Total Data

In [7]:
# use regular expression to get the html infomation
raw_html = requests.get(url).content.decode('utf8')
match = re.search('window.getStatisticsService = (.*?)}catch', raw_html)
raw_json = match.group(1)
result2 = json.loads(raw_json, encoding='utf8')

In [8]:
result2

{'id': 1,
 'createTime': 1579537899000,
 'modifyTime': 1580658390000,
 'infectSource': '野生动物，可能为中华菊头蝠',
 'passWay': '经呼吸道飞沫传播，亦可通过接触传播，存在粪-口传播可能性',
 'imgUrl': 'https://img1.dxycdn.com/2020/0201/450/3394153392393266839-135.png',
 'dailyPic': 'https://img1.dxycdn.com/2020/0202/725/3394327332126027029-135.png',
 'summary': '',
 'deleted': False,
 'countRemark': '',
 'confirmedCount': 14490,
 'suspectedCount': 19544,
 'curedCount': 434,
 'deadCount': 304,
 'seriousCount': 0,
 'suspectedIncr': 0,
 'confirmedIncr': 0,
 'curedIncr': 0,
 'deadIncr': 0,
 'seriousIncr': 0,
 'virus': '新型冠状病毒 2019-nCoV',
 'remark1': '易感人群：人群普遍易感。老年人及有基础疾病者感染后病情较重，儿童及婴幼儿也有发病',
 'remark2': '潜伏期：一般为 3～7 天，最长不超过 14 天，潜伏期内存在传染性',
 'remark3': '',
 'remark4': '',
 'remark5': '',
 'generalRemark': '疑似病例数来自国家卫健委数据，目前为全国数据，未分省市自治区等',
 'abroadRemark': '',
 'marquee': [{'id': 8,
   'marqueeLabel': '日报',
   'marqueeContent': ' 七日内治愈人数首次超越死亡人数',
   'marqueeLink': 'https://mama.dxy.com/japi/platform/200720055?index=20200202'}]}

#### __Table 1: Summary Dataframe__

In [24]:
df_summary = pd.DataFrame.from_dict([result2])
# df_summary["total"] = "t"
df_summary = df_summary[["confirmedCount","suspectedCount","curedCount","deadCount"]]
df_summary=df_summary.T.reset_index()
df_summary=df_summary.rename(columns={"index": "Count", 0: "Total"})

#### __Table 1: Summary Dataframe(Database)__

In [41]:
#example:
#group_export.to_sql(con=engine, name=example_table, if_exists='replace', 
#                     flavor='mysql', index=False)

# with engine.connect() as con:
#     con.execute('ALTER TABLE `example_table` ADD PRIMARY KEY (`ID_column`);')

In [25]:
# export df to sqlite
engine = create_engine('sqlite:///wuhan_pneumonia.sqlite')
df_summary.to_sql('summary', engine, index=False, if_exists='replace')

In [26]:
# Check if table in sqlite
df_summarydb = pd.read_sql_query('SELECT * FROM summary',engine)
df_summarydb

Unnamed: 0,Count,Total
0,confirmedCount,14490
1,suspectedCount,19544
2,curedCount,434
3,deadCount,304


## Part 2: Get the Province Data

In [59]:
# use regular expression to get the html infomation
raw_html = requests.get(url).content.decode('utf8')
# get the json info within html code
match = re.search('getListByCountryTypeService1 = (.*?)}catch', raw_html)
raw_json = match.group(1)
result3 = json.loads(raw_json, encoding='utf8')

In [60]:
result3[0]

{'id': 2980,
 'createTime': 1580576608000,
 'modifyTime': 1580631460000,
 'tags': '',
 'countryType': 1,
 'continents': '',
 'provinceId': '999',
 'provinceName': '待明确地区',
 'provinceShortName': '待明确地区',
 'cityName': '',
 'confirmedCount': 0,
 'suspectedCount': 19543,
 'curedCount': 0,
 'deadCount': 0,
 'comment': '',
 'sort': 0,
 'operator': '',
 'locationId': 0}

In [61]:
df_provinces_original = pd.DataFrame.from_dict(result3)
df_provinces_original.info()
print(df_provinces_original.columns)
df_provinces_original.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 18 columns):
id                   35 non-null int64
createTime           35 non-null int64
modifyTime           35 non-null int64
tags                 35 non-null object
countryType          35 non-null int64
continents           35 non-null object
provinceId           35 non-null object
provinceName         35 non-null object
provinceShortName    35 non-null object
cityName             35 non-null object
confirmedCount       35 non-null int64
suspectedCount       35 non-null int64
curedCount           35 non-null int64
deadCount            35 non-null int64
comment              35 non-null object
sort                 35 non-null int64
operator             35 non-null object
locationId           35 non-null int64
dtypes: int64(10), object(8)
memory usage: 5.0+ KB
Index(['id', 'createTime', 'modifyTime', 'tags', 'countryType', 'continents',
       'provinceId', 'provinceName', 'provinceShortName', 

Unnamed: 0,id,createTime,modifyTime,tags,countryType,continents,provinceId,provinceName,provinceShortName,cityName,confirmedCount,suspectedCount,curedCount,deadCount,comment,sort,operator,locationId
0,2980,1580576608000,1580631460000,,1,,999,待明确地区,待明确地区,,0,19543,0,0,,0,,0
1,1392,1580256888000,1580341773000,,1,,54,西藏自治区,西藏,,1,0,0,0,,0,zhuotingting,540000
2,30,1579663914000,1580617150000,确诊 2 例,1,,67,澳门,澳门,,8,0,0,0,,23,zyyun,820000
3,24,1579617167000,1580475589000,确诊 1 例,1,,68,台湾,台湾,,10,0,0,0,,27,huanshi,710000
4,182,1579860584000,1580621466000,,1,,63,青海省,青海,,11,0,0,0,待明确地区：确诊 2,40,xuyt,630000


In [62]:
# remove the last 2 rows of the dataframe
df_provinces_r = df_provinces_original.drop(df_provinces_original.head(1).index)
df_provinces_t = df_provinces_r[['provinceId','provinceShortName','confirmedCount', 'suspectedCount', 'curedCount', 'deadCount','locationId']]
df_provinces_t.head()

Unnamed: 0,provinceId,provinceShortName,confirmedCount,suspectedCount,curedCount,deadCount,locationId
1,54,西藏,1,0,0,0,540000
2,67,澳门,8,0,0,0,820000
3,68,台湾,10,0,0,0,710000
4,63,青海,11,0,0,0,630000
5,66,香港,14,0,0,0,810000


In [63]:
translator = Translator()
df_provinces_t["Provinces"] = df_provinces_t["provinceShortName"].map(lambda x: translator.translate(x, src="zh-CN", dest="en").text)
# Add the new translation column to the data frame and check if there is any error
df_provinces_t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,provinceId,provinceShortName,confirmedCount,suspectedCount,curedCount,deadCount,locationId,Provinces
1,54,西藏,1,0,0,0,540000,Tibet
2,67,澳门,8,0,0,0,820000,Macao
3,68,台湾,10,0,0,0,710000,Taiwan
4,63,青海,11,0,0,0,630000,Qinghai
5,66,香港,14,0,0,0,810000,Hong Kong
6,65,新疆,21,0,0,0,650000,Xinjiang
7,22,吉林,23,0,1,0,220000,Jilin
8,15,内蒙古,27,0,1,0,150000,Inner Mongolia
9,64,宁夏,28,0,0,0,640000,Ningxia
10,52,贵州,38,0,2,0,520000,Guizhou


In [65]:
# Since 陕西 and 山西 have the same translation, change 陕西 as Shaanxi
#df.iloc[0, df.columns.get_loc('col2')] = 100
df_provinces_t.iloc[12,df_provinces_t.columns.get_loc('Provinces') ] = 'Shaanxi'
# drop the Chinese Provinces names column
df_provinces_english=df_provinces_t.drop(columns='provinceShortName')

In [66]:
# remove the extra province in province name if the translation is not accurate
new = [i.replace(" Province","") for i in df_provinces_english["Provinces"]]

df_provinces_m=df_provinces_english.replace(list(df_provinces_t["Provinces"]),new)
df_provinces_m.head()

Unnamed: 0,provinceId,confirmedCount,suspectedCount,curedCount,deadCount,locationId,Provinces
1,54,1,0,0,0,540000,Tibet
2,67,8,0,0,0,820000,Macao
3,68,10,0,0,0,710000,Taiwan
4,63,11,0,0,0,630000,Qinghai
5,66,14,0,0,0,810000,Hong Kong


In [67]:
# Read the China provinces coordinates .csv file
df_China_coordinates = pd.read_csv("data/China.csv")
df_China_coordinates.Provinces

0           Shanghai
1            Beijing
2          Guangdong
3              Hubei
4            Tianjin
5          Chongqing
6           Liaoning
7            Sichuan
8            Shaanxi
9            Guangxi
10           Jiangsu
11           Guizhou
12      Heilongjiang
13            Fujian
14             Jilin
15          Shandong
16          Zhejiang
17            Yunnan
18            Shanxi
19             Henan
20             Hunan
21             Gansu
22             Hebei
23           Jiangxi
24          Xinjiang
25            Hainan
26    Inner Mongolia
27             Anhui
28           Qinghai
29           Ningxia
30             Tibet
Name: Provinces, dtype: object

In [17]:
# Since the coordinates does not have Macao, Taiwan, HongKong we add these to present great China area
listOfSeries = [pd.Series(['Macao', 'China', 22.1987, 113.5439], index=df_China_coordinates.columns ) ,
                pd.Series(['Taiwan', 'China', 23.6978, 120.9605], index=df_China_coordinates.columns) ,
                pd.Series(['Hong Kong', 'China', 22.3193, 114.1694], index=df_China_coordinates.columns) ]

In [18]:
df_great_China = df_China_coordinates.append(listOfSeries , ignore_index=True)

#### __Table 2: Great China Data__

In [68]:
# Merge the coordinates with dataframe for future use
df_China_provinces = pd.merge(df_provinces_m, df_great_China, how='outer', on = ["Provinces"])
df_China_provinces.columns
new_order = ['provinceId', 'Provinces','confirmedCount', 'suspectedCount', 'curedCount','deadCount', 'country', 'lat', 'lng', 'locationId']
df_China_provinces=df_China_provinces[new_order]
df_China_provinces

Unnamed: 0,provinceId,Provinces,confirmedCount,suspectedCount,curedCount,deadCount,country,lat,lng,locationId
0,54,Tibet,1,0,0,0,China,29.65,91.1,540000
1,67,Macao,8,0,0,0,China,22.1987,113.5439,820000
2,68,Taiwan,10,0,0,0,China,23.6978,120.9605,710000
3,63,Qinghai,11,0,0,0,China,36.625541,101.75739,630000
4,66,Hong Kong,14,0,0,0,China,22.3193,114.1694,810000
5,65,Xinjiang,21,0,0,0,China,43.807347,87.630506,650000
6,22,Jilin,23,0,1,0,China,43.88,125.322778,220000
7,15,Inner Mongolia,27,0,1,0,China,40.652222,109.822222,150000
8,64,Ningxia,28,0,0,0,China,38.468056,106.273056,640000
9,52,Guizhou,38,0,2,0,China,26.583333,106.716667,520000


#### __Table 2: Great China Data Table (Database)__

In [70]:
df_China_provinces.to_sql('Great_China', engine, index=False,if_exists='replace')

In [71]:
# Check if table in sqlite
df_Great_China_db = pd.read_sql_query('SELECT * FROM Great_China',engine)
df_Great_China_db.head()

Unnamed: 0,provinceId,Provinces,confirmedCount,suspectedCount,curedCount,deadCount,country,lat,lng,locationId
0,54,Tibet,1,0,0,0,China,29.65,91.1,540000
1,67,Macao,8,0,0,0,China,22.1987,113.5439,820000
2,68,Taiwan,10,0,0,0,China,23.6978,120.9605,710000
3,63,Qinghai,11,0,0,0,China,36.625541,101.75739,630000
4,66,Hong Kong,14,0,0,0,China,22.3193,114.1694,810000


## Part 3: Get the World Data

In [22]:
# use regular expression to get the html infomation
raw_html = requests.get(url).content.decode('utf8')
# get the json info within html code
match = re.search('getListByCountryTypeService2 = (.*?)}catch', raw_html)
raw_json = match.group(1)
result4 = json.loads(raw_json, encoding='utf8')

In [23]:
result4[0]

{'id': 953,
 'createTime': 1580027704000,
 'modifyTime': 1580561200000,
 'tags': '',
 'countryType': 2,
 'continents': '亚洲',
 'provinceId': '6',
 'provinceName': '日本',
 'provinceShortName': '',
 'cityName': '',
 'confirmedCount': 20,
 'suspectedCount': 0,
 'curedCount': 1,
 'deadCount': 0,
 'comment': '',
 'sort': 0,
 'operator': 'xuyt',
 'locationId': 951002}

In [24]:
df_world_original = pd.DataFrame.from_dict(result4)
df_world_original.info()
print(df_world_original.columns)
df_world_original.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 18 columns):
id                   23 non-null int64
createTime           23 non-null int64
modifyTime           23 non-null int64
tags                 23 non-null object
countryType          23 non-null int64
continents           23 non-null object
provinceId           23 non-null object
provinceName         23 non-null object
provinceShortName    23 non-null object
cityName             23 non-null object
confirmedCount       23 non-null int64
suspectedCount       23 non-null int64
curedCount           23 non-null int64
deadCount            23 non-null int64
comment              23 non-null object
sort                 23 non-null int64
operator             23 non-null object
locationId           23 non-null int64
dtypes: int64(10), object(8)
memory usage: 3.4+ KB
Index(['id', 'createTime', 'modifyTime', 'tags', 'countryType', 'continents',
       'provinceId', 'provinceName', 'provinceShortName', 

Unnamed: 0,id,createTime,modifyTime,tags,countryType,continents,provinceId,provinceName,provinceShortName,cityName,confirmedCount,suspectedCount,curedCount,deadCount,comment,sort,operator,locationId
0,953,1580027704000,1580561200000,,2,亚洲,6,日本,,,20,0,1,0,,0,xuyt,951002
1,949,1580027637000,1580495765000,,2,亚洲,2,泰国,,,19,0,5,0,,0,xuyt,952010
2,950,1580027655000,1580619404000,,2,亚洲,3,新加坡,,,18,0,0,0,,0,xuyt,952009
3,954,1580027721000,1580609979000,,2,亚洲,7,韩国,,,15,0,0,0,,0,xuyt,951004
4,958,1580027777000,1580620070000,,2,大洋洲,10,澳大利亚,,,12,0,2,0,,0,xuyt,990001


In [25]:
df_world_t = df_world_original[['id','provinceName','continents',  'confirmedCount','suspectedCount', 'curedCount', 'deadCount','locationId']]
df_world_t.head()

Unnamed: 0,id,provinceName,continents,suspectedCount,curedCount,deadCount,locationId
0,953,日本,亚洲,0,1,0,951002
1,949,泰国,亚洲,0,5,0,952010
2,950,新加坡,亚洲,0,0,0,952009
3,954,韩国,亚洲,0,0,0,951004
4,958,澳大利亚,大洋洲,0,2,0,990001


In [26]:
translator = Translator()
df_world_t["Country"] = df_world_t["provinceName"].map(lambda x: translator.translate(x, src="zh-CN", dest="en").text)
# Add the new translation column to the data frame and check if there is any error
df_world_t["Continent"] = df_world_t["continents"].map(lambda x: translator.translate(x, src="zh-CN", dest="en").text)
df_world_t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,id,provinceName,continents,suspectedCount,curedCount,deadCount,locationId,Country,Continent
0,953,日本,亚洲,0,1,0,951002,Japan,Asia
1,949,泰国,亚洲,0,5,0,952010,Thailand,Asia
2,950,新加坡,亚洲,0,0,0,952009,Singapore,Asia
3,954,韩国,亚洲,0,0,0,951004,Korea,Asia
4,958,澳大利亚,大洋洲,0,2,0,990001,Australia,Oceania
5,951,马来西亚,亚洲,0,0,0,952007,Malaysia,Asia
6,955,美国,北美洲,0,0,0,971002,United States,North America
7,1047,德国,欧洲,0,0,0,963003,Germany,Europe
8,952,法国,欧洲,0,0,0,961002,France,Europe
9,956,越南,亚洲,0,1,0,952011,Vietnam,Asia


In [27]:
df_world_english=df_world_t.drop(columns=['provinceName','continents'])
df_world_english.iloc[3,df_world_english.columns.get_loc('Country') ] = 'South Korea'

In [28]:
df_world_coordinates = pd.read_csv("data/world.csv")
df_world_coordinates

Unnamed: 0,country,latitude,longitude,name
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.939110,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla
...,...,...,...,...
240,YE,15.552727,48.516388,Yemen
241,YT,-12.827500,45.166244,Mayotte
242,ZA,-30.559482,22.937506,South Africa
243,ZM,-13.133897,27.849332,Zambia


In [29]:
df_world_coordinates=df_world_coordinates.rename(columns={"country": "Countryshort", "name": "Country"})
df_world_coordinates.head()

Unnamed: 0,Countryshort,latitude,longitude,Country
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


In [30]:
df_world_c = pd.merge(df_world_english, df_world_coordinates, how='inner', on = ["Country"])
df_world_c

Unnamed: 0,id,suspectedCount,curedCount,deadCount,locationId,Country,Continent,Countryshort,latitude,longitude
0,953,0,1,0,951002,Japan,Asia,JP,36.204824,138.252924
1,949,0,5,0,952010,Thailand,Asia,TH,15.870032,100.992541
2,950,0,0,0,952009,Singapore,Asia,SG,1.352083,103.819836
3,954,0,0,0,951004,South Korea,Asia,KR,35.907757,127.766922
4,958,0,2,0,990001,Australia,Oceania,AU,-25.274398,133.775136
5,951,0,0,0,952007,Malaysia,Asia,MY,4.210484,101.975766
6,955,0,0,0,971002,United States,North America,US,37.09024,-95.712891
7,1047,0,0,0,963003,Germany,Europe,DE,51.165691,10.451526
8,952,0,0,0,961002,France,Europe,FR,46.227638,2.213749
9,956,0,1,0,952011,Vietnam,Asia,VN,14.058324,108.277199


#### __Table 3: World Data__

In [31]:
df_world_c.columns
new_order2 = ['id', 'Country', 'Continent', 'Countryshort','suspectedCount', 'curedCount', 'deadCount','latitude', 'longitude','locationId']
df_world=df_world_c[new_order2]
df_world

Unnamed: 0,id,Country,Continent,Countryshort,suspectedCount,curedCount,deadCount,latitude,longitude,locationId
0,953,Japan,Asia,JP,0,1,0,36.204824,138.252924,951002
1,949,Thailand,Asia,TH,0,5,0,15.870032,100.992541,952010
2,950,Singapore,Asia,SG,0,0,0,1.352083,103.819836,952009
3,954,South Korea,Asia,KR,0,0,0,35.907757,127.766922,951004
4,958,Australia,Oceania,AU,0,2,0,-25.274398,133.775136,990001
5,951,Malaysia,Asia,MY,0,0,0,4.210484,101.975766,952007
6,955,United States,North America,US,0,0,0,37.09024,-95.712891,971002
7,1047,Germany,Europe,DE,0,0,0,51.165691,10.451526,963003
8,952,France,Europe,FR,0,0,0,46.227638,2.213749,961002
9,956,Vietnam,Asia,VN,0,1,0,14.058324,108.277199,952011


#### __Table 3: World Data Table (Database)__

In [32]:
df_world.to_sql('world', engine, index=False, if_exists='replace')

In [33]:
# Check if table in sqlite
df_worlddb = pd.read_sql_query('SELECT * FROM world',engine)
df_worlddb.head()

Unnamed: 0,id,Country,Continent,Countryshort,suspectedCount,curedCount,deadCount,latitude,longitude,locationId
0,953,Japan,Asia,JP,0,1,0,36.204824,138.252924,951002
1,949,Thailand,Asia,TH,0,5,0,15.870032,100.992541,952010
2,950,Singapore,Asia,SG,0,0,0,1.352083,103.819836,952009
3,954,South Korea,Asia,KR,0,0,0,35.907757,127.766922,951004
4,958,Australia,Oceania,AU,0,2,0,-25.274398,133.775136,990001
