### Import libraries

In [4]:
import numpy as np
import pandas as pd
import requests
import urllib.parse

## 1.Data explorations
1. How many rows (observations) and columns (attributes) does the data have? 
2. Are there any duplicate rows in the data?
3. What do the columns in the data mean?
4. What data type does each column currently have? (.dtype)numeric datatype


### 1.1 How many rows (observations) and columns (attributes) does the data have for each day?

In [5]:
# Read
df1 = pd.read_csv('../data/covid_3_2_2023.csv')
df1.shape

(239, 21)

#### Because we need data for one week, we need to combine the data from each day into one dataframe.
- Read in the data for each day
- Add a column to the dataframe that indicates the day of the week
- Combine the data into one dataframe

In [6]:
# Read all data
df1 = pd.read_csv('../data/covid_3_2_2023.csv')
df2 = pd.read_csv('../data/covid_3_3_2023.csv')
df3 = pd.read_csv('../data/covid_3_4_2023.csv')
df4 = pd.read_csv('../data/covid_3_5_2023.csv')
df5 = pd.read_csv('../data/covid_3_6_2023.csv')
df6 = pd.read_csv('../data/covid_3_7_2023.csv')
df7 = pd.read_csv('../data/covid_3_8_2023.csv')
df8 = pd.read_csv('../data/covid_3_9_2023.csv')
df9 = pd.read_csv('../data/covid_3_10_2023.csv')

In [7]:
# Add Date column 
def add_date(df, date):
    df['Date'] = pd.to_datetime(date, format='%Y-%m-%d')
    
    # move the date column to second position
    cols = df.columns.tolist()
    cols = cols[:1] + cols[-1:] + cols[1:-1]
    return df[cols]

df1 = add_date(df1, '2023-03-02')
df2 = add_date(df2, '2023-03-03')
df3 = add_date(df3, '2023-03-04')
df4 = add_date(df4, '2023-03-05')
df5 = add_date(df5, '2023-03-06')
df6 = add_date(df6, '2023-03-07')
df7 = add_date(df7, '2023-03-08')
df8 = add_date(df8, '2023-03-09')
df9 = add_date(df9, '2023-03-10')

In [8]:
# Concatenate all dataframes
new_df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9], axis=0, ignore_index=True)

# Sort rows by country name and date
new_df = new_df.sort_values(by=['Country,Other', 'Date'])

In [9]:
# Shape of new dataframe 
new_df.shape

(2151, 22)

In [11]:
# save the new dataframe
new_df.to_csv('../data/combined_data.csv', index=False)
new_df.head(10)

Unnamed: 0,"Country,Other",Date,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",...,TotalTests,Tests/\r\n1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
121,Afghanistan,2023-03-02,209359,19,7896,,191166,10,10297,45,...,1201475.0,29481.0,40754388.0,Asia,195.0,5161.0,34.0,0.5,,253.0
360,Afghanistan,2023-03-03,209362,3,7896,,191170,4,10296,45,...,1201744.0,29487.0,40754388.0,Asia,195.0,5161.0,34.0,0.07,,253.0
599,Afghanistan,2023-03-04,209370,4,7896,,191181,7,10293,45,...,1202018.0,29494.0,40754388.0,Asia,195.0,5161.0,34.0,0.1,,253.0
838,Afghanistan,2023-03-05,209390,20,7896,,191212,31,10282,45,...,1202290.0,29501.0,40754388.0,Asia,195.0,5161.0,34.0,0.5,,252.0
1077,Afghanistan,2023-03-06,209394,24,7896,,191233,52,10265,45,...,1202290.0,29501.0,40754388.0,Asia,195.0,5161.0,34.0,0.6,,252.0
1316,Afghanistan,2023-03-07,209415,21,7896,,191243,10,10276,45,...,1202868.0,29515.0,40754388.0,Asia,195.0,5161.0,34.0,0.5,,252.0
1555,Afghanistan,2023-03-08,209441,26,7896,,191262,19,10283,45,...,1203414.0,29528.0,40754388.0,Asia,195.0,5161.0,34.0,0.6,,252.0
1794,Afghanistan,2023-03-09,209451,10,7896,,191272,10,10283,45,...,1203807.0,29538.0,40754388.0,Asia,195.0,5161.0,34.0,0.2,,252.0
2033,Afghanistan,2023-03-10,209484,2,7896,,191284,4,10304,45,...,1204573.0,29557.0,40754388.0,Asia,195.0,5161.0,34.0,0.05,,253.0
5,Africa,2023-03-02,12795801,732,258590,,12076247,132,460964,548,...,,,,Africa,,,,,,


### 1.2 Are there any duplicate rows in the data?

In [12]:
new_df[new_df.duplicated()].shape

(0, 22)

- The data has no duplicate rows

### 1.3 What do the columns in the data mean?
- The columns in the data are the following:
    - **Country**: The country 
    - **Date**: The date 
    - **TotalCases**: The total cases 
    - **NewCases**: The new cases 
    - **TotalDeaths**: The total deaths 
    - **NewDeaths**: The new deaths
    - **TotalRecovered**: The total recovered
    - **NewRecovered**: The new recovered
    - **ActiveCases**: The active cases
    - **Serious,Critical**: The serious, critical
    - **TotalCases/1M pop**: The total cases per 1 million population
    - **Deaths/1M pop**: The total deaths per 1 million population
    - **TotalTests**: The total tests
    - **Tests/1M pop**: The total tests per 1 million population
    - **Population**: The population
    - **Continent**: The continent
    - **1 Caseevery X ppl**: The ratio for every X people. for example, 1 Caseevery X ppl is 3 means that 3 people have 1 case
    - **1 Deathevery X ppl**: The ratio deaths for every people.
    - **1 Testevery  X ppl**: The ratio tests for every people.
    - **New Cases/1M pop**: the new cases per 1 million population
    - **New Deaths/1M pop**: The new deaths per 1 million population
    - **Active Cases/1M pop**: The active cases per 1 million population

### 1.4 What data type does each column currently have? 

In [13]:
new_df.dtypes

Country,Other                  object
Date                   datetime64[ns]
TotalCases                     object
NewCases                       object
TotalDeaths                    object
NewDeaths                     float64
TotalRecovered                 object
NewRecovered                   object
ActiveCases                    object
Serious,Critical               object
Tot Cases/1M pop               object
Deaths/1M pop                  object
TotalTests                     object
Tests/\r\n1M pop               object
Population                     object
Continent                      object
1 Caseevery X ppl              object
1 Deathevery X ppl             object
1 Testevery X ppl             float64
New Cases/1M pop               object
New Deaths/1M pop             float64
Active Cases/1M pop            object
dtype: object

**Issues that need to be preprocessed:**
- Rename the columns to be more descriptive
- Handle missing values
- Change the data type of the columns to be numeric
- Handle values such as +19, +1 or have ',' in values.
- Delete columns that are dependent attributes. 

## 2. Preprocessing

### 2.1 Rename the columns to be more descriptive

In [14]:
# Rename the col    
new_df = new_df.rename(columns={'Country,Other': 'Country'})
new_df = new_df.rename(columns={'Tot\xa0Cases/1M pop': 'TotalCases/1M pop'})
new_df = new_df.rename(columns={'Tests/\r\n1M pop': 'Tests/1M pop'})

### 2.2 Handle missing values

In [15]:
new_df.isnull().sum()

Country                   9
Date                      0
TotalCases                0
NewCases               1555
TotalDeaths              54
NewDeaths              1846
TotalRecovered          189
NewRecovered           1599
ActiveCases             171
Serious,Critical        945
TotalCases/1M pop        81
Deaths/1M pop           135
TotalTests              234
Tests/1M pop            234
Population               90
Continent                27
1 Caseevery X ppl        90
1 Deathevery X ppl      144
1 Testevery X ppl       234
New Cases/1M pop       1616
New Deaths/1M pop      1902
Active Cases/1M pop     163
dtype: int64

#### 2.2.1 Delete rows with missing Country values and Continents values are null

In [16]:
# delete the rows with null country names
new_df = new_df.dropna(subset=['Country'])
new_df.shape

(2142, 22)

In [17]:
new_df[new_df['Continent'].isnull()]

Unnamed: 0,Country,Date,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",...,TotalTests,Tests/1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
234,Diamond Princess,2023-03-02,712,,13,,699,,0,,...,,,,,,,,,,
473,Diamond Princess,2023-03-03,712,,13,,699,,0,,...,,,,,,,,,,
712,Diamond Princess,2023-03-04,712,,13,,699,,0,,...,,,,,,,,,,
951,Diamond Princess,2023-03-05,712,,13,,699,,0,,...,,,,,,,,,,
1190,Diamond Princess,2023-03-06,712,,13,,699,,0,,...,,,,,,,,,,
1429,Diamond Princess,2023-03-07,712,,13,,699,,0,,...,,,,,,,,,,
1668,Diamond Princess,2023-03-08,712,,13,,699,,0,,...,,,,,,,,,,
1907,Diamond Princess,2023-03-09,712,,13,,699,,0,,...,,,,,,,,,,
2146,Diamond Princess,2023-03-10,712,,13,,699,,0,,...,,,,,,,,,,
237,MS Zaandam,2023-03-02,9,,2,,7,,0,,...,,,,,,,,,,


In [18]:
# So need to delete 'Diamond Princess', 'MS Zaandam' country 
new_df = new_df[~new_df['Country'].str.contains('Diamond Princess')]
new_df = new_df[~new_df['Country'].str.contains('MS Zaandam')]
new_df.shape

(2124, 22)

In [19]:
new_df.isnull().sum()

Country                   0
Date                      0
TotalCases                0
NewCases               1528
TotalDeaths              54
NewDeaths              1819
TotalRecovered          189
NewRecovered           1572
ActiveCases             171
Serious,Critical        927
TotalCases/1M pop        54
Deaths/1M pop           108
TotalTests              207
Tests/1M pop            207
Population               63
Continent                 0
1 Caseevery X ppl        63
1 Deathevery X ppl      117
1 Testevery X ppl       207
New Cases/1M pop       1589
New Deaths/1M pop      1875
Active Cases/1M pop     136
dtype: int64

#### 2.2.2 Delete rows missing population values 

In [20]:
new_df[new_df['Population'].isnull()]

Unnamed: 0,Country,Date,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",...,TotalTests,Tests/1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
5,Africa,2023-03-02,12795801,+732,258590,,12076247,+132,460964,548,...,,,,Africa,,,,,,
244,Africa,2023-03-03,12796326,+525,258590,,12076332,+85,461404,548,...,,,,Africa,,,,,,
483,Africa,2023-03-04,12796404,+78,258590,,12076449,+117,461365,548,...,,,,Africa,,,,,,
722,Africa,2023-03-05,12796571,+167,258593,3.0,12076616,+167,461362,548,...,,,,Africa,,,,,,
961,Africa,2023-03-06,12796571,+167,258593,3.0,12076616,+167,461362,548,...,,,,Africa,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,World,2023-03-06,680724121,+62130,6805799,311.0,653529398,+71513,20388924,40663,...,,,,All,,,,,,
1202,World,2023-03-07,680788906,+64785,6806351,552.0,653647296,+117898,20335259,40671,...,,,,All,,,,,,
1441,World,2023-03-08,680980079,+102639,6808001,577.0,653826909,+178244,20345169,40532,...,,,,All,,,,,,
1680,World,2023-03-09,681100271,+120192,6808650,649.0,654004006,+177097,20287615,40463,...,,,,All,,,,,,


- Null population because there are rows that calculate the total number in each continent such as Asia, Europe, North America, etc, we can remove these rows.

In [21]:
# remove the rows with null population
new_df = new_df[~new_df['Population'].isnull()]
new_df.dtypes

Country                        object
Date                   datetime64[ns]
TotalCases                     object
NewCases                       object
TotalDeaths                    object
NewDeaths                     float64
TotalRecovered                 object
NewRecovered                   object
ActiveCases                    object
Serious,Critical               object
TotalCases/1M pop              object
Deaths/1M pop                  object
TotalTests                     object
Tests/1M pop                   object
Population                     object
Continent                      object
1 Caseevery X ppl              object
1 Deathevery X ppl             object
1 Testevery X ppl             float64
New Cases/1M pop               object
New Deaths/1M pop             float64
Active Cases/1M pop            object
dtype: object

#### 2.2.3. Delete dependent attributes such as TotalCases/1M pop, TotalDeaths/1M pop, etc. Because we can calculate these values from TotalCases, TotalDeaths and population. For example, TotalCases/1M pop = TotalCases / Population 

In [23]:
new_df = new_df.drop(columns=['TotalCases/1M pop', 'Tests/1M pop', 'Deaths/1M pop', 'New Cases/1M pop', 'New Deaths/1M pop'])
new_df.head(5)

Unnamed: 0,Country,Date,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,Active Cases/1M pop
121,Afghanistan,2023-03-02,209359,19,7896,,191166,10,10297,45,1201475,40754388,Asia,195,5161,34.0,253
360,Afghanistan,2023-03-03,209362,3,7896,,191170,4,10296,45,1201744,40754388,Asia,195,5161,34.0,253
599,Afghanistan,2023-03-04,209370,4,7896,,191181,7,10293,45,1202018,40754388,Asia,195,5161,34.0,253
838,Afghanistan,2023-03-05,209390,20,7896,,191212,31,10282,45,1202290,40754388,Asia,195,5161,34.0,252
1077,Afghanistan,2023-03-06,209394,24,7896,,191233,52,10265,45,1202290,40754388,Asia,195,5161,34.0,252


In [24]:
new_df.isnull().sum()

Country                   0
Date                      0
TotalCases                0
NewCases               1526
TotalDeaths              54
NewDeaths              1809
TotalRecovered          189
NewRecovered           1568
ActiveCases             171
Serious,Critical        927
TotalTests              144
Population                0
Continent                 0
1 Caseevery X ppl         0
1 Deathevery X ppl       54
1 Testevery X ppl       144
Active Cases/1M pop      73
dtype: int64

### 2.3 Handle values such as +19, +1 or have ',' in values.

In [None]:
# remove all , in values
new_df['TotalCases'] = new_df['TotalCases'].str.replace(',', '')
new_df['NewCases'] = new_df['NewCases'].str.replace(r'[+,]', '')
new_df['TotalDeaths'] = new_df['TotalDeaths'].str.replace(',', '')
new_df['TotalRecovered'] = new_df['TotalRecovered'].str.replace(',', '')
new_df['NewRecovered'] = new_df['NewRecovered'].str.replace(r'[+,]', '')
new_df['ActiveCases'] = new_df['ActiveCases'].str.replace(',', '')
new_df['Serious,Critical'] = new_df['Serious,Critical'].str.replace(',', '')
new_df['TotalTests'] = new_df['TotalTests'].str.replace(',', '')
new_df['Population'] = new_df['Population'].str.replace(',', '')
new_df['Active Cases/1M pop'] = new_df['Active Cases/1M pop'].str.replace(',', '')

  new_df['NewCases'] = new_df['NewCases'].str.replace(r'[+,]', '')
  new_df['NewRecovered'] = new_df['NewRecovered'].str.replace(r'[+,]', '')


In [None]:
# fill the missing values with 0 
new_df[['NewCases', 'NewDeaths', 'NewRecovered','Serious,Critical', 'Active Cases/1M pop']] = new_df[['NewCases', 'NewDeaths','NewRecovered','Serious,Critical', 'Active Cases/1M pop']].fillna(0)
new_df.isnull().sum()

Country                  0
Date                     0
TotalCases               0
NewCases                 0
TotalDeaths             54
NewDeaths                0
TotalRecovered         189
NewRecovered             0
ActiveCases            171
Serious,Critical         0
TotalTests             144
Population               0
Continent                0
1 Caseevery X ppl        0
1 Deathevery X ppl      54
1 Testevery X ppl      144
Active Cases/1M pop      0
dtype: int64

In [None]:
# change the data type of columns to numeric
new_df['ActiveCases'] = pd.to_numeric(new_df['ActiveCases'])
new_df['Active Cases/1M pop'] = pd.to_numeric(new_df['Active Cases/1M pop'])
new_df['Population'] = pd.to_numeric(new_df['Population'])
new_df['TotalCases'] = pd.to_numeric(new_df['TotalCases'])
new_df['TotalDeaths'] = pd.to_numeric(new_df['TotalDeaths'])
new_df['TotalRecovered'] = pd.to_numeric(new_df['TotalRecovered'])
new_df['NewCases'] = pd.to_numeric(new_df['NewCases'])
new_df['NewDeaths'] = pd.to_numeric(new_df['NewDeaths'])
new_df['NewRecovered'] = pd.to_numeric(new_df['NewRecovered'])
new_df['Serious,Critical'] = pd.to_numeric(new_df['Serious,Critical'])
new_df['TotalTests'] = pd.to_numeric(new_df['TotalTests'])
    
new_df.dtypes

Country                        object
Date                   datetime64[ns]
TotalCases                      int64
NewCases                        int64
TotalDeaths                   float64
NewDeaths                     float64
TotalRecovered                float64
NewRecovered                    int64
ActiveCases                   float64
Serious,Critical                int64
TotalTests                    float64
Population                      int64
Continent                      object
1 Caseevery X ppl              object
1 Deathevery X ppl             object
1 Testevery X ppl             float64
Active Cases/1M pop           float64
dtype: object

#### 2.3.0 Delete rows with outliers such as population < 1000000 or total deaths could not be updated for a long time.

In [None]:
# we can remove the rows with null values in TotalDeaths and population < 1000000
new_df = new_df[~new_df['TotalDeaths'].isnull()]
new_df = new_df[~(new_df['Population'] < 1000000)]

#### 2.3.1. Fill in missing ActiveCase with formula: **ActiveCase = ActiveCase/1M pop * Population / 1000000**

In [None]:
new_df['ActiveCases'] = new_df['ActiveCases'].fillna(round(new_df['Active Cases/1M pop'] * new_df['Population'] / 1000000, 0))

#### 2.3.2 Fill missing TotalRecovered with formula: **TotalCases = TotalRecovered + ActiveCases + TotalDeaths**

In [None]:
new_df['TotalRecovered'] = new_df['TotalRecovered'].fillna(round(new_df['TotalCases'] - new_df['TotalDeaths'] - new_df['ActiveCases'], 0))

#### 2.3.3 Fill missing values of 1 test every X ppl by mean of 1 test every X ppl in the same continent

In [None]:
# fill the missing values with the mean of each continent
new_df['1 Testevery X ppl'] = new_df.groupby('Continent')['1 Testevery X ppl'].transform(lambda x: x.fillna(round(x.mean(),0)))

#### 2.3.4 Fill missing values of TotalTests by formula: **TotalTests = Population / 1 Testevery X ppl**

In [None]:
# replace all missing values in TotalTests with division of population and 1 Testevery X ppl
new_df['TotalTests'] = new_df['TotalTests'].fillna(round(new_df['Population'] / new_df['1 Testevery X ppl'], 0))

In [None]:
new_df.isnull().sum()

Country                0
Date                   0
TotalCases             0
NewCases               0
TotalDeaths            0
NewDeaths              0
TotalRecovered         0
NewRecovered           0
ActiveCases            0
Serious,Critical       0
TotalTests             0
Population             0
Continent              0
1 Caseevery X ppl      0
1 Deathevery X ppl     0
1 Testevery X ppl      0
Active Cases/1M pop    0
dtype: int64

### 2.5 Delete columns that are not necessary such as dependent attributes.

- There are 4 columns that can calculate from another columns such as 1 Caseevery X ppl, 1 Deathevery X ppl, 1 Testevery  X ppl, Active Cases/1M pop. We can delete these columns.

In [None]:
# delete 4 last columns
new_df = new_df.drop(columns=['1 Testevery X ppl', '1 Deathevery X ppl', '1 Testevery X ppl','Active Cases/1M pop'])
new_df.head(10)

Unnamed: 0,Country,Date,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Population,Continent,1 Caseevery X ppl
121,Afghanistan,2023-03-02,209359,19,7896.0,0.0,191166.0,10,10297.0,45,1201475.0,40754388,Asia,195
360,Afghanistan,2023-03-03,209362,3,7896.0,0.0,191170.0,4,10296.0,45,1201744.0,40754388,Asia,195
599,Afghanistan,2023-03-04,209370,4,7896.0,0.0,191181.0,7,10293.0,45,1202018.0,40754388,Asia,195
838,Afghanistan,2023-03-05,209390,20,7896.0,0.0,191212.0,31,10282.0,45,1202290.0,40754388,Asia,195
1077,Afghanistan,2023-03-06,209394,24,7896.0,0.0,191233.0,52,10265.0,45,1202290.0,40754388,Asia,195
1316,Afghanistan,2023-03-07,209415,21,7896.0,0.0,191243.0,10,10276.0,45,1202868.0,40754388,Asia,195
1555,Afghanistan,2023-03-08,209441,26,7896.0,0.0,191262.0,19,10283.0,45,1203414.0,40754388,Asia,195
1794,Afghanistan,2023-03-09,209451,10,7896.0,0.0,191272.0,10,10283.0,45,1203807.0,40754388,Asia,195
2033,Afghanistan,2023-03-10,209484,2,7896.0,0.0,191284.0,4,10304.0,45,1204573.0,40754388,Asia,195
109,Albania,2023-03-02,334408,0,3596.0,0.0,329152.0,0,1660.0,0,1941032.0,2866374,Europe,9


### 2.7 Change the country code to country name

In [None]:
#change CAR to Central African Republic
new_df.loc[new_df['Country'] == 'CAR', 'Country'] = 'Central African Republic'
#chang DPRK to North Korea
new_df.loc[new_df['Country'] == 'DPRK', 'Country'] = 'North Korea'
#change DRC to Democratic Republic of the Congo
new_df.loc[new_df['Country'] == 'DRC', 'Country'] = 'Democratic Republic of the Congo'
#change S. Korea to South Korea
new_df.loc[new_df['Country'] == 'S. Korea', 'Country'] = 'South Korea'
#change TL to Timor-Leste
new_df.loc[new_df['Country'] == 'TL', 'Country'] = 'Timor-Leste'
#change UAE to United Arab Emirates
new_df.loc[new_df['Country'] == 'UAE', 'Country'] = 'United Arab Emirates'
#change UK to United Kingdom
new_df.loc[new_df['Country'] == 'UK', 'Country'] = 'United Kingdom'

### 2.6 Add longitude and latitude columns

In [None]:
country = ""
response = []
for index, row in new_df.iterrows():
    newcountry = row['Country']
    if newcountry == country:
        lat = response[0]['lat']
        lon = response[0]['lon']
        # add the lat and lon to the dataframe
        new_df.loc[index, 'lat'] = lat
        new_df.loc[index, 'lon'] = lon
        continue
    country = newcountry
    # get the latitude and longitude of each country
    url = "https://nominatim.openstreetmap.org/?addressdetails=1&q=" + country +"&format=json&limit=1"
    response = requests.get(url).json()
    lat = response[0]['lat']
    lon = response[0]['lon']
    # add the lat and lon to the dataframe
    new_df.loc[index, 'lat'] = lat
    new_df.loc[index, 'lon'] = lon

new_df.head(10)

Unnamed: 0,Country,Date,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Population,Continent,1 Caseevery X ppl,lat,lon
121,Afghanistan,2023-03-02,209359,19,7896.0,0.0,191166.0,10,10297.0,45,1201475.0,40754388,Asia,195,33.7680065,66.2385139
360,Afghanistan,2023-03-03,209362,3,7896.0,0.0,191170.0,4,10296.0,45,1201744.0,40754388,Asia,195,33.7680065,66.2385139
599,Afghanistan,2023-03-04,209370,4,7896.0,0.0,191181.0,7,10293.0,45,1202018.0,40754388,Asia,195,33.7680065,66.2385139
838,Afghanistan,2023-03-05,209390,20,7896.0,0.0,191212.0,31,10282.0,45,1202290.0,40754388,Asia,195,33.7680065,66.2385139
1077,Afghanistan,2023-03-06,209394,24,7896.0,0.0,191233.0,52,10265.0,45,1202290.0,40754388,Asia,195,33.7680065,66.2385139
1316,Afghanistan,2023-03-07,209415,21,7896.0,0.0,191243.0,10,10276.0,45,1202868.0,40754388,Asia,195,33.7680065,66.2385139
1555,Afghanistan,2023-03-08,209441,26,7896.0,0.0,191262.0,19,10283.0,45,1203414.0,40754388,Asia,195,33.7680065,66.2385139
1794,Afghanistan,2023-03-09,209451,10,7896.0,0.0,191272.0,10,10283.0,45,1203807.0,40754388,Asia,195,33.7680065,66.2385139
2033,Afghanistan,2023-03-10,209484,2,7896.0,0.0,191284.0,4,10304.0,45,1204573.0,40754388,Asia,195,33.7680065,66.2385139
109,Albania,2023-03-02,334408,0,3596.0,0.0,329152.0,0,1660.0,0,1941032.0,2866374,Europe,9,41.000028,19.9999619


In [None]:
# shape of new dataframe after cleaning
new_df.shape

(1422, 16)

In [None]:
# save to csv
new_df.to_csv('../data/final_data.csv', index=False)