In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_country = pd.read_csv('country_wise_latest.csv')
df_covid = pd.read_csv('covid_19_clean_complete.csv')
df_day = pd.read_csv('day_wise.csv')
df_fgroup = pd.read_csv('full_grouped.csv')
df_world = pd.read_csv('worldometer_data.csv')

In [3]:
all_dfs = [df_country, df_covid, df_day, df_fgroup, df_world]
for df in all_dfs:
    print(f'-------{df.head()}----------\n\n\n\n\n')

-------  Country/Region  Confirmed  Deaths  Recovered  Active  New cases  New deaths  \
0    Afghanistan      36263    1269      25198    9796        106          10   
1        Albania       4880     144       2745    1991        117           6   
2        Algeria      27973    1163      18837    7973        616           8   
3        Andorra        907      52        803      52         10           0   
4         Angola        950      41        242     667         18           1   

   New recovered  Deaths / 100 Cases  Recovered / 100 Cases  \
0             18                3.50                  69.49   
1             63                2.95                  56.25   
2            749                4.16                  67.34   
3              0                5.73                  88.53   
4              0                4.32                  25.47   

   Deaths / 100 Recovered  Confirmed last week  1 week change  \
0                    5.04                35526            737 

### Data Cleaning

In [4]:
df.shape

(209, 16)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country/Region    209 non-null    object 
 1   Continent         208 non-null    object 
 2   Population        208 non-null    float64
 3   TotalCases        209 non-null    int64  
 4   NewCases          4 non-null      float64
 5   TotalDeaths       188 non-null    float64
 6   NewDeaths         3 non-null      float64
 7   TotalRecovered    205 non-null    float64
 8   NewRecovered      3 non-null      float64
 9   ActiveCases       205 non-null    float64
 10  Serious,Critical  122 non-null    float64
 11  Tot Cases/1M pop  208 non-null    float64
 12  Deaths/1M pop     187 non-null    float64
 13  TotalTests        191 non-null    float64
 14  Tests/1M pop      191 non-null    float64
 15  WHO Region        184 non-null    object 
dtypes: float64(12), int64(1), object(3)
memory u

In [6]:
df_fgroup.describe()

Unnamed: 0,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered
count,35156.0,35156.0,35156.0,35156.0,35156.0,35156.0,35156.0
mean,23566.63,1234.068239,11048.13,11284.43,469.36375,18.603339,269.315593
std,149981.8,7437.238354,64546.4,89971.49,3005.86754,115.706351,2068.063852
min,0.0,0.0,0.0,-2.0,0.0,-1918.0,-16298.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,250.0,4.0,33.0,85.0,2.0,0.0,0.0
75%,3640.25,78.25,1286.25,1454.0,75.0,1.0,20.0
max,4290259.0,148011.0,1846641.0,2816444.0,77255.0,3887.0,140050.0


In [7]:
df.head()

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


### Checking missing/null values

In [8]:
df_fgroup.isnull().sum()

Date              0
Country/Region    0
Confirmed         0
Deaths            0
Recovered         0
Active            0
New cases         0
New deaths        0
New recovered     0
WHO Region        0
dtype: int64

In [9]:
df_country.isnull().sum()

Country/Region            0
Confirmed                 0
Deaths                    0
Recovered                 0
Active                    0
New cases                 0
New deaths                0
New recovered             0
Deaths / 100 Cases        0
Recovered / 100 Cases     0
Deaths / 100 Recovered    0
Confirmed last week       0
1 week change             0
1 week % increase         0
WHO Region                0
dtype: int64

In [10]:
df_covid.isnull().sum()

Province/State    34404
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed             0
Deaths                0
Recovered             0
Active                0
WHO Region            0
dtype: int64

In [11]:
df_covid['Province/State'] = df_covid['Province/State'].fillna(' ')
df_covid.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active,WHO Region
0,,Afghanistan,33.93911,67.709953,2020-01-22,0,0,0,0,Eastern Mediterranean
1,,Albania,41.1533,20.1683,2020-01-22,0,0,0,0,Europe
2,,Algeria,28.0339,1.6596,2020-01-22,0,0,0,0,Africa
3,,Andorra,42.5063,1.5218,2020-01-22,0,0,0,0,Europe
4,,Angola,-11.2027,17.8739,2020-01-22,0,0,0,0,Africa


In [12]:
df_world.isnull().sum()

Country/Region        0
Continent             1
Population            1
TotalCases            0
NewCases            205
TotalDeaths          21
NewDeaths           206
TotalRecovered        4
NewRecovered        206
ActiveCases           4
Serious,Critical     87
Tot Cases/1M pop      1
Deaths/1M pop        22
TotalTests           18
Tests/1M pop         18
WHO Region           25
dtype: int64

In [13]:
happening = ['NewCases','NewDeaths','NewRecovered','Serious,Critical','TotalDeaths','Tests/1M pop','Deaths/1M pop', 'TotalTests','ActiveCases','TotalRecovered']
df_world[happening] = df_world[happening].fillna(0)
#handling the region names missing values by filling them with ''
df_world['WHO Region'] = df_world['WHO Region'].fillna('')
df_world.head()

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,0.0,162804.0,0.0,2576668.0,0.0,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,0.0,98644.0,0.0,2047660.0,0.0,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,0.0,41638.0,0.0,1377384.0,0.0,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,0.0,14606.0,0.0,676357.0,0.0,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,0.0,9604.0,0.0,387316.0,0.0,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


In [14]:
df_day.isnull().sum()

Date                      0
Confirmed                 0
Deaths                    0
Recovered                 0
Active                    0
New cases                 0
New deaths                0
New recovered             0
Deaths / 100 Cases        0
Recovered / 100 Cases     0
Deaths / 100 Recovered    0
No. of countries          0
dtype: int64

### Checking the duplicates

In [15]:
df_covid.duplicated().sum()

np.int64(0)

In [16]:
df_country.duplicated().sum()

np.int64(0)

In [17]:
df_day.duplicated().sum()

np.int64(0)

In [18]:
df_fgroup.duplicated().sum()

np.int64(0)

In [19]:
df_world.duplicated().sum()

np.int64(0)

No duplicate rows

In [20]:
#finding the common columns in between the two datasets 
covid_columns = set(df_covid.columns)
country_columns = set(df_country.columns)
common_columns = covid_columns & country_columns
print(common_columns)

{'Active', 'WHO Region', 'Country/Region', 'Recovered', 'Deaths', 'Confirmed'}


In [21]:
commons = ['WHO Region', 'Recovered', 'Country/Region', 'Deaths', 'Active', 'Confirmed']
merged_covcountry = pd.merge(df_covid, df_country, on=commons, how='inner')
merged_covcountry.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active,WHO Region,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase
0,,Timor-Leste,-8.874217,125.727539,2020-04-24,24,0,0,24,South-East Asia,0,0,0,0.0,0.0,0.0,24,0,0.0
1,,Timor-Leste,-8.874217,125.727539,2020-04-25,24,0,0,24,South-East Asia,0,0,0,0.0,0.0,0.0,24,0,0.0
2,,Timor-Leste,-8.874217,125.727539,2020-04-26,24,0,0,24,South-East Asia,0,0,0,0.0,0.0,0.0,24,0,0.0
3,,Timor-Leste,-8.874217,125.727539,2020-04-27,24,0,0,24,South-East Asia,0,0,0,0.0,0.0,0.0,24,0,0.0
4,,Timor-Leste,-8.874217,125.727539,2020-04-28,24,0,0,24,South-East Asia,0,0,0,0.0,0.0,0.0,24,0,0.0


Khám phá các xu hướng toàn cầu và khu vực về các trường hợp COVID-19, bao gồm các trường hợp được xác nhận, phục hồi, tử vong và các trường hợp đang hoạt động. Điều này giúp nhóm hiểu được quy mô và tác động chung của đại dịch.

Bộ dữ liệu df_country cung cấp thông tin có giá trị về nhân khẩu học và cơ sở hạ tầng chăm sóc sức khỏe của các quốc gia. Dữ liệu này có thể được sử dụng để liên hệ các nguồn lực chăm sóc sức khỏe với kết quả của đại dịch.

Sử dụng bộ dữ liệu df_day để tiến hành phân tích chuỗi thời gian, theo dõi tiến trình của các trường hợp, phục hồi và tử vong theo thời gian. Cho phép xác định các giai đoạn tăng trưởng và phục hồi nhanh chóng.

Nhóm các quốc gia theo khu vực của WHO và phân tích các số liệu chính như tỷ lệ tử vong và tỷ lệ phục hồi. Điều này cung cấp thông tin chi tiết về các biến thể theo khu vực trong quản lý đại dịch.

Bộ dữ liệu df_fgroup cho phép phân tích so sánh bằng cách phân loại các quốc gia thành các nhóm cụ thể. Điều này cho phép so sánh tác động của đại dịch đối với các quốc gia tương tự.