In [1]:
import numpy as np
import pandas as pd

### Importing Raw Data Set

In [2]:
df = pd.read_csv("CovidData3-18-2021.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3981 entries, 0 to 3980
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FIPS                 3264 non-null   float64
 1   Admin2               3269 non-null   object 
 2   Province_State       3810 non-null   object 
 3   Country_Region       3981 non-null   object 
 4   Last_Update          3981 non-null   object 
 5   Lat                  3893 non-null   float64
 6   Long_                3893 non-null   float64
 7   Confirmed            3981 non-null   int64  
 8   Deaths               3981 non-null   int64  
 9   Recovered            706 non-null    float64
 10  Active               704 non-null    float64
 11  Combined_Key         3981 non-null   object 
 12  Incident_Rate        3893 non-null   float64
 13  Case_Fatality_Ratio  3940 non-null   float64
dtypes: float64(7), int64(2), object(5)
memory usage: 435.5+ KB


### Removing Unneccesary and Incomplete Info

In [3]:
df = df.drop(columns = ["FIPS","Admin2", "Province_State", "Lat", "Long_", "Combined_Key"])
df = df.rename(columns= {"Country_Region": "Country"})
df = df.dropna()
df

Unnamed: 0,Country,Last_Update,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio
0,Afghanistan,2021-03-19 05:26:09,56044,2462,49664.0,3918.0,143.967091,4.392977
1,Albania,2021-03-19 05:26:09,119528,2106,83264.0,34158.0,4153.450553,1.761930
2,Algeria,2021-03-19 05:26:09,115842,3051,80347.0,32444.0,264.171596,2.633760
3,Andorra,2021-03-19 05:26:09,11393,113,10904.0,376.0,14745.356889,0.991837
4,Angola,2021-03-19 05:26:09,21558,522,20032.0,1004.0,65.593088,2.421375
...,...,...,...,...,...,...,...,...
3976,Vietnam,2021-03-19 05:26:09,2570,35,2198.0,337.0,2.640269,1.361868
3977,West Bank and Gaza,2021-03-19 05:26:09,218061,2358,193578.0,22125.0,4274.519075,1.081349
3978,Yemen,2021-03-19 05:26:09,3126,723,1520.0,883.0,10.480800,23.128599
3979,Zambia,2021-03-19 05:26:09,85889,1175,82527.0,2187.0,467.195418,1.368045


### Aggregating Data By Country

In [4]:
df = df.groupby("Country").agg({"Last_Update": np.max, "Confirmed" : "sum", "Deaths": "sum", "Recovered": "sum", "Incident_Rate": "mean"})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189 entries, Afghanistan to Zimbabwe
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Last_Update    189 non-null    object 
 1   Confirmed      189 non-null    int64  
 2   Deaths         189 non-null    int64  
 3   Recovered      189 non-null    float64
 4   Incident_Rate  189 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 8.9+ KB


In [5]:
df["Active"] = df["Confirmed"] - df["Recovered"] - df["Deaths"]
df["Case_Fatality_Ratio"] = (df["Deaths"]/df["Confirmed"]) * 100
df.head()

Unnamed: 0_level_0,Last_Update,Confirmed,Deaths,Recovered,Incident_Rate,Active,Case_Fatality_Ratio
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,2021-03-19 05:26:09,56044,2462,49664.0,143.967091,3918.0,4.392977
Albania,2021-03-19 05:26:09,119528,2106,83264.0,4153.450553,34158.0,1.76193
Algeria,2021-03-19 05:26:09,115842,3051,80347.0,264.171596,32444.0,2.63376
Andorra,2021-03-19 05:26:09,11393,113,10904.0,14745.356889,376.0,0.991837
Angola,2021-03-19 05:26:09,21558,522,20032.0,65.593088,1004.0,2.421375


## Adding Continent Data

In [42]:
df2 = pd.read_csv("countryContinent.csv")

for index, value in df.iterrows():
    hold = df2.loc[df2["country"] == str(index),"continent"]

### Converting to .csv

In [81]:
df.to_csv("covid-19.csv", index = True)

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189 entries, Afghanistan to Zimbabwe
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Last_Update          189 non-null    object 
 1   Confirmed            189 non-null    int64  
 2   Deaths               189 non-null    int64  
 3   Recovered            189 non-null    float64
 4   Incident_Rate        189 non-null    float64
 5   Active               189 non-null    float64
 6   Case_Fatality_Ratio  189 non-null    float64
dtypes: float64(4), int64(2), object(1)
memory usage: 11.8+ KB
