## Phase 2 Data Processing

In [6]:
# import packages

# data processing
import pandas as pd
import numpy as np
from datetime import timedelta, datetime


import re

# data visualization
from tabulate import tabulate
import plotly.graph_objs as go
from plotly.graph_objs import Bar, Layout
from plotly import offline
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the whitegrid style and the default figure size in a single call.
# sns.set(...) is an alias of sns.set_theme(...), so a separate
# sns.set(rc=...) call would reset the style to seaborn's default and
# discard the "whitegrid" choice.
sns.set_theme(style="whitegrid", rc={'figure.figsize':(11.7,8.27)})

plt.rcParams['font.sans-serif']=['SimHei'] # so that Chinese labels are displayed correctly
plt.rcParams['axes.unicode_minus'] = False # so that the minus sign is displayed correctly

# change text color
import colorama
from colorama import Fore, Style

# IPython
from IPython.display import IFrame

from covid_report import getdata,data_processing,covid_visualization

### Data Pre-processing

Time series:

- time_series_covid19_confirmed_US.csv
- time_series_covid19_confirmed_global.csv
- time_series_covid19_deaths_US.csv
- time_series_covid19_deaths_global.csv
- time_series_covid19_recovered_global.csv

Cross-section data:

- csse_covid_19_daily_reports
- csse_covid_19_daily_reports_us

In [17]:
from getdata import GET_csse_covid_19_daily_reports,GET_csse_covid_19_time_series,GET_shanghai_data

# Fetch the three data sets used by the rest of the notebook:
#   1. daily-report (cross-section) snapshots: latest and previous, global and US
#   2. cumulative time series: confirmed/deaths for US and global, recovered for global
#   3. the Shanghai series
# NOTE(review): in the recorded run the Shanghai request failed with
# "HTTP Error 403: Forbidden", so ts_shanghai_covid was not assigned.
latest_data_global,prev_data_global,latest_data_us,prev_data_us = GET_csse_covid_19_daily_reports()
ts_confirmed_us,ts_confirmed_global,ts_deaths_us,ts_deaths_global,ts_recovered_global = GET_csse_covid_19_time_series()
ts_shanghai_covid = GET_shanghai_data(plot=True)  # includes the trend of new asymptomatic cases in Shanghai over the last 10 days

正在读取【横截面】数据......
读取完毕
正在读取【时间序列】数据......
读取完毕
正在获取并处理【上海】数据（数据来自上海卫健委）......


HTTPError: HTTP Error 403: Forbidden

### Time series Processing (China)

Global data: `ts_confirmed_global`.

In [None]:
# Preview the first rows of the global confirmed-cases time series.
ts_confirmed_global.head()

In [None]:
data1 = ts_confirmed_global.copy() # work on a copy; never modify the raw frame in place
# (1) Keep only the rows whose Country/Region is China.
data1 = data1[data1['Country/Region'] == 'China']
# (2) Use Province/State as the index.
data1 = data1.set_index('Province/State')
# (3) Drop 'Country/Region', 'Lat' and 'Long'; they are not needed for now.
data1 = data1.drop(['Country/Region','Lat','Long'],axis=1)
# (4) Turn the cumulative counts into daily increments: reading across each
#     row, subtract the previous day's value from the current day's.
#
#     1. axis=1 takes the difference across columns (dates).
#     2. The first date has no predecessor, so diff() leaves it as NaN;
#        fill that column with its original cumulative value.
#     3. fillna() is called without the `downcast` keyword: pandas 2.x
#        deprecates it and newer releases no longer accept it.
#
# IMPORTANT: whether to append .clip(lower=0) depends on whether negative
# growth should be ignored. A computed difference can be negative, and
# clip(lower=0) replaces every negative value with 0.
first_day = data1.columns[0]
data1 = data1.diff(axis=1).fillna({first_day: data1[first_day]}) # optionally: .clip(lower=0)


# (5) Order the provinces by the increment of the last (most recent) day,
#     largest first.
data1 = data1.sort_values(by = data1.columns[-1],ascending=False)
# (6) Transpose so that dates become the rows and provinces the columns.
data1 = data1.T
# (7) Remove the 'Unknown' column (a placeholder label in Province/State,
#     now a column after the transpose) and only then drop dates that still
#     contain NaN, so gaps in that placeholder cannot discard valid dates.
#     errors='ignore' keeps the cell working when no 'Unknown' row exists.
data1 = data1.drop('Unknown',axis=1,errors='ignore').dropna()
# (8) Convert the date labels in the index with pd.to_datetime.
data1.index = pd.to_datetime(data1.index)
# (9) Clear the name of the columns axis.

data1.columns.name = ''

ts_data_processed = data1
# The data is now processed.

# Uncomment the next line to display it:
# ts_data_processed

In [None]:
def ts_process_CHINA(ts_data,clip = False):
    """Turn a cumulative global time series into daily increments for China.

    Parameters
    ----------
    ts_data : pandas.DataFrame
        Frame with 'Province/State', 'Country/Region', 'Lat' and 'Long'
        columns. Every other column is treated as a date column holding
        cumulative counts, with a label that pd.to_datetime can parse.
        The input frame is not modified.
    clip : bool, default False
        If True, negative daily increments are replaced with 0.

    Returns
    -------
    ts_data : pandas.DataFrame
        Daily increments with dates as the (datetime) index and provinces as
        columns, ordered by the most recent day's increment, largest first.
        The 'Unknown' province is excluded and dates containing NaN are
        dropped.
    loc_data : pandas.DataFrame
        'Province/State', 'Lat' and 'Long' of the China rows, excluding
        'Unknown'.
    sorted_provinces : pandas.Index
        The columns of the returned ``ts_data``.
    """
    china = ts_data[ts_data['Country/Region'] == 'China']

    # Location table, without the 'Unknown' placeholder row.
    loc_data = china[['Province/State','Lat','Long']]
    loc_data = loc_data[loc_data['Province/State'] != 'Unknown']

    cumulative = china.set_index('Province/State').drop(['Country/Region','Lat','Long'],axis=1)

    # diff() leaves the first date as NaN (no previous day); restore it from
    # the cumulative value. The `downcast` keyword is deliberately not used:
    # pandas 2.x deprecates it and newer releases no longer accept it.
    first_day = cumulative.columns[0]
    increments = cumulative.diff(axis=1).fillna({first_day: cumulative[first_day]})
    if clip:
        increments = increments.clip(lower=0)

    increments = increments.sort_values(by = increments.columns[-1],ascending=False)

    # Remove 'Unknown' before dropping incomplete dates so that NaNs in that
    # placeholder column cannot wipe out valid dates; errors='ignore' covers
    # inputs that have no 'Unknown' row at all.
    increments = increments.T.drop('Unknown',axis=1,errors='ignore').dropna()
    increments.index = pd.to_datetime(increments.index)
    increments.columns.name = ''
    sorted_provinces = increments.columns
    return increments,loc_data,sorted_provinces

### USA

In [None]:
# Preview the first rows of the US confirmed-cases time series.
ts_confirmed_us.head()

In [None]:
def ts_process_US(ts_data, death = False, clip = False):
    """Turn a cumulative US county-level time series into daily increments per state.

    Parameters
    ----------
    ts_data : pandas.DataFrame
        Frame with at least 'UID', 'FIPS', 'Admin2', 'Province_State', 'Lat'
        and 'Long_' columns. Rows containing any NaN are discarded first.
        The input frame is not modified.
    death : bool, default False
        If True, a 'Population' column is expected among the columns kept by
        the positional slice below; it is summed per state, returned as a
        fourth value and removed before differencing.
    clip : bool, default False
        If True, negative daily increments are replaced with 0. Clipping is
        applied per (state, Admin2) row, before the per-state sum.

    Returns
    -------
    ts_data : pandas.DataFrame
        Daily increments with dates as the (datetime) index and states as
        columns, ordered by the most recent day's increment, largest first.
    loc_data : pandas.DataFrame
        'UID', 'FIPS', 'Admin2', 'Province_State', 'Lat', 'Long_' of the
        rows that survived the NaN filter.
    sorted_state : pandas.Index
        The columns of the returned ``ts_data``.
    population : pandas.Series
        Only returned when ``death`` is True: 'Population' summed per state.
    """
    complete = ts_data.dropna()
    loc_data = complete[['UID','FIPS','Admin2','Province_State','Lat','Long_']]

    # Positional slice: skips the first 9 columns that remain once the two
    # index columns are taken out.
    # NOTE(review): presumably these are the remaining metadata columns of
    # the CSV - verify against the file layout.
    counts = complete.set_index(['Province_State','Admin2']).iloc[:,9:]
    if death:
        population = counts.groupby('Province_State')['Population'].sum()
        counts = counts.drop('Population',axis=1)

    # diff() leaves the first date as NaN (no previous day); restore it from
    # the cumulative value. The `downcast` keyword is deliberately not used:
    # pandas 2.x deprecates it and newer releases no longer accept it.
    first_day = counts.columns[0]
    increments = counts.diff(axis=1).fillna({first_day: counts[first_day]})
    if clip:
        increments = increments.clip(lower=0)

    increments = increments.groupby('Province_State').sum()
    increments = increments.sort_values(by = increments.columns[-1],ascending=False)
    increments = increments.T
    increments.index = pd.to_datetime(increments.index)
    increments.columns.name = ''

    sorted_state = increments.columns

    if death:
        return increments,loc_data,sorted_state,population
    return increments,loc_data,sorted_state

### China & USA cross-section data 

Example: latest_data_global

In [None]:
# Preview the first rows of the latest global daily report.
latest_data_global.head()

In [None]:
def daily_process(daily_data, country = 'China'):
    """Reduce a daily-report frame to its key columns, indexed by province/state.

    Parameters
    ----------
    daily_data : pandas.DataFrame
        Daily report containing 'Province_State', 'Last_Update', 'Confirmed',
        'Deaths', 'Incident_Rate' and 'Case_Fatality_Ratio' (plus
        'Country_Region' when ``country`` is 'China').
    country : str, default 'China'
        'China' keeps only rows whose 'Country_Region' is 'China' and whose
        'Province_State' is not 'Unknown'. Any other value, including 'US',
        leaves the rows unfiltered.

    Returns
    -------
    pandas.DataFrame
        The six key columns, indexed by the 'Province_State' values (the
        column itself is kept and the index name is an empty string), with
        rows containing any NaN removed.
    """
    wanted = ['Province_State','Last_Update','Confirmed','Deaths','Incident_Rate','Case_Fatality_Ratio']

    rows = daily_data
    if country == 'China':
        in_china = rows['Country_Region'] == 'China'
        known_province = rows['Province_State'] != 'Unknown'
        rows = rows[in_china & known_province]

    # drop=False keeps 'Province_State' as a regular column next to the index.
    return (
        rows[wanted]
        .set_index('Province_State', drop=False)
        .rename_axis("")
        .dropna()
    )

In [None]:
# Example: run daily_process on the latest global daily report with country='China'.
daily_process(latest_data_global, country = 'China')

In [None]:
# NOTE(review): this import rebinds ts_process_CHINA, ts_process_US and
# daily_process, so the cells below use the versions from the data_processing
# module rather than the functions defined earlier in this notebook.
from data_processing import ts_process_CHINA,ts_process_US,daily_process

In [None]:
# Process the global confirmed, deaths and recovered series for China (no clipping).
# The location table and province order are kept from the confirmed-series
# call only; the other two calls discard them (`_`).
ts_confirmed_CHINA_incre, loc_data_CHINA, sorted_provinces = ts_process_CHINA(ts_confirmed_global,clip=False)
ts_deaths_CHINA_incre, _, _ = ts_process_CHINA(ts_deaths_global,clip=False)
ts_recovered_CHINA_incre, _, _ = ts_process_CHINA(ts_recovered_global,clip=False)

In [None]:
# Process the US confirmed and deaths series (no clipping).
# With death=True the call is unpacked into four values; the fourth is kept as `population`.
ts_confirmed_US_incre,loc_data_us,sorted_state = ts_process_US(ts_confirmed_us,clip=False)
ts_deaths_US_incre,_,_,population = ts_process_US(ts_deaths_us,death = True,clip=False)

In [None]:
# Process the latest and previous daily reports: the global reports with
# country='China', the US reports with country='US'.
latest_data_CHINA = daily_process(latest_data_global, country = 'China')
prev_data_CHINA = daily_process(prev_data_global, country = 'China')
latest_data_US = daily_process(latest_data_us, country = 'US')
prev_data_US = daily_process(prev_data_us, country = 'US')