# Library

In [9]:
import warnings
# NOTE(review): 'ignore' silences every warning category for the whole
# notebook, including deprecation notices from pandas/numpy. Consider
# narrowing the filter so API removals are noticed before an upgrade.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os

# Load data

In [10]:
# Read the raw 2022-03-08 snapshot and preview its first rows.
df = pd.read_csv('../data/2022-03-08-unclean.csv')
df.head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population,Continent
0,China,111520,325,4636,,102832,110,4052,9,77,3,160000000,111163,1439323776,Asia
1,USA,81024903,33615,988208,1299.0,55221462,202659,24815233,5421,242395,2956,960233929,2872645,334268263,North America
2,India,42975883,4575,515386,145.0,42413566,7416,46931,8944,30635,367,774310567,551964,1402828373,Asia
3,Brazil,29144964,75495,652936,518.0,27344949,165757,1147079,8318,135494,3035,63776166,296493,215101451,South America
4,France,23164872,93050,139618,167.0,21836839,98559,1188415,2484,353573,2131,246629975,3764393,65516532,Europe


# Check data type

In [11]:
# Inspect column dtypes and non-null counts; in the raw file every column,
# including the numeric ones, is read as `object`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Country,Other     218 non-null    object
 1   TotalCases        218 non-null    object
 2   NewCases          167 non-null    object
 3   TotalDeaths       218 non-null    object
 4   NewDeaths         113 non-null    object
 5   TotalRecovered    208 non-null    object
 6   NewRecovered      146 non-null    object
 7   ActiveCases       208 non-null    object
 8   Serious,Critical  162 non-null    object
 9   Tot Cases/1M pop  218 non-null    object
 10  Deaths/1M pop     212 non-null    object
 11  TotalTests        209 non-null    object
 12  Tests/1M pop      209 non-null    object
 13  Population        218 non-null    object
 14  Continent         218 non-null    object
dtypes: object(15)
memory usage: 25.7+ KB


# Strip spaces

In [12]:
# Trim leading/trailing whitespace from the two text columns.
for text_col in ['Country,Other', 'Continent']:
    df[text_col] = df[text_col].str.strip()

# Convert numeric columns to floats

In [13]:
def convertNumber(x):
    """Parse one scraped cell into a float.

    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged. Any other value is converted to text, every "+" and ","
    is removed, surrounding whitespace is trimmed, and the result is handed
    to ``float``.

    Returns:
        The parsed ``float``; the original value if it was null; ``np.nan``
        if the cleaned text is not a valid number.
    """
    if pd.isnull(x):
        return x
    text = str(x).replace("+", "").replace(",", "").strip()
    try:
        return float(text)
    except ValueError:
        # Only a failed parse is expected here. A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return np.nan

# Everything except the two text columns is treated as numeric and parsed
# cell by cell with convertNumber.
nonNumeric = ["Country,Other", "Continent"]
numColumns = df.columns.drop(nonNumeric)
for col in numColumns:
    parsed = df[col].apply(convertNumber)
    df[col] = parsed

df.head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population,Continent
0,China,111520.0,325.0,4636.0,,102832.0,110.0,4052.0,9.0,77.0,3.0,160000000.0,111163.0,1439324000.0,Asia
1,USA,81024903.0,33615.0,988208.0,1299.0,55221462.0,202659.0,24815233.0,5421.0,242395.0,2956.0,960233929.0,2872645.0,334268300.0,North America
2,India,42975883.0,4575.0,515386.0,145.0,42413566.0,7416.0,46931.0,8944.0,30635.0,367.0,774310567.0,551964.0,1402828000.0,Asia
3,Brazil,29144964.0,75495.0,652936.0,518.0,27344949.0,165757.0,1147079.0,8318.0,135494.0,3035.0,63776166.0,296493.0,215101500.0,South America
4,France,23164872.0,93050.0,139618.0,167.0,21836839.0,98559.0,1188415.0,2484.0,353573.0,2131.0,246629975.0,3764393.0,65516530.0,Europe


# Check data type again

In [14]:
# Re-check dtypes after conversion: the 13 numeric columns are now float64.
# TotalDeaths drops from 218 to 212 non-null, i.e. six raw values could not
# be parsed and were turned into NaN by convertNumber.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country,Other     218 non-null    object 
 1   TotalCases        218 non-null    float64
 2   NewCases          167 non-null    float64
 3   TotalDeaths       212 non-null    float64
 4   NewDeaths         113 non-null    float64
 5   TotalRecovered    208 non-null    float64
 6   NewRecovered      146 non-null    float64
 7   ActiveCases       208 non-null    float64
 8   Serious,Critical  162 non-null    float64
 9   Tot Cases/1M pop  218 non-null    float64
 10  Deaths/1M pop     212 non-null    float64
 11  TotalTests        209 non-null    float64
 12  Tests/1M pop      209 non-null    float64
 13  Population        218 non-null    float64
 14  Continent         218 non-null    object 
dtypes: float64(13), object(2)
memory usage: 25.7+ KB


In [15]:
def cleanData(filename):
    """Load one raw snapshot CSV and return a cleaned DataFrame.

    Applies the same steps as the exploratory cells earlier in the notebook:
    whitespace is stripped from both text columns ("Country,Other" and
    "Continent"), and every other column is parsed with ``convertNumber``.

    Args:
        filename: Path of the raw CSV file to read.

    Returns:
        The cleaned ``pandas.DataFrame``; nothing is written to disk here.
    """
    df = pd.read_csv(filename)

    # Strip *both* text columns. Previously only "Country,Other" was trimmed,
    # so "Continent" kept stray whitespace in the saved files.
    textColumns = ["Country,Other", "Continent"]
    for col in textColumns:
        df[col] = df[col].str.strip()

    numColumns = df.columns.drop(textColumns)
    for col in numColumns:
        df[col] = df[col].apply(convertNumber)
    return df

# Clean every raw "-unclean.csv" dump found in ../data and save the result
# alongside it, with "unclean" swapped for "clean" in the file name.
files = list(filter(lambda name: name.endswith("-unclean.csv"), os.listdir('../data')))

for unclean in files:
    source_path = '../data/' + unclean
    target_path = '../data/' + unclean.replace('unclean', 'clean')
    df = cleanData(source_path)
    df.to_csv(target_path, index=False)

# Generating time-series data

In [16]:
# Stack every cleaned snapshot into one long table, tagging each row with the
# date taken from its file name (the text before the last "-").
# sorted() makes the row order reproducible: os.listdir returns entries in
# arbitrary order, and the date-prefixed names sort chronologically.
files = sorted(clean for clean in os.listdir('../data') if clean.endswith("-clean.csv"))
frames = []
for path in files:
    crawlDate=path[:path.rfind('-')]
    df_one = pd.read_csv(os.path.join('../data',path))
    df_one['Date'] = pd.to_datetime(crawlDate)
    frames.append(df_one)
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration; collect the pieces and concatenate once instead.
# pd.concat raises on an empty list, so keep the old "empty frame" result.
df_all = pd.concat(frames) if frames else pd.DataFrame()
df_all.index = range(1,len(df_all)+1)
print(df_all.head())
df_all.to_csv('../data/time_series.csv',index=False)

  Country,Other  TotalCases  NewCases  TotalDeaths  NewDeaths  TotalRecovered  \
1         China    111520.0     325.0       4636.0        NaN        102832.0   
2           USA  81024903.0   33615.0     988208.0     1299.0      55221462.0   
3         India  42975883.0    4575.0     515386.0      145.0      42413566.0   
4        Brazil  29144964.0   75495.0     652936.0      518.0      27344949.0   
5        France  23164872.0   93050.0     139618.0      167.0      21836839.0   

   NewRecovered  ActiveCases  Serious,Critical  Tot Cases/1M pop  \
1         110.0       4052.0               9.0              77.0   
2      202659.0   24815233.0            5421.0          242395.0   
3        7416.0      46931.0            8944.0           30635.0   
4      165757.0    1147079.0            8318.0          135494.0   
5       98559.0    1188415.0            2484.0          353573.0   

   Deaths/1M pop   TotalTests  Tests/1M pop    Population      Continent  \
1            3.0  160000000.