In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
deaths_filepath = os.path.join('.', 'covid19-data', 'Deaths', 'OpenData_Slovakia_Covid_Deaths_AgeGroup_District.csv')

In [3]:
# encoding for Windows-1252
deaths_df = pd.read_csv(deaths_filepath, delimiter=';', encoding='cp1252')

date_splitter = '.'
deaths_df['Day'] = deaths_df['Date'].apply(lambda datum: int(datum.split(date_splitter)[0]))
deaths_df['Month'] = deaths_df['Date'].apply(lambda datum: int(datum.split(date_splitter)[1]))
deaths_df['Year'] = deaths_df['Date'].apply(lambda datum: int(datum.split(date_splitter)[-1]))

deaths_df['Region'] = deaths_df['Region'].apply(lambda x: x if x != 'Tren?iansky' else 'Trenčiansky')

# remove nan (type(nan) is float)
districts = list(filter(lambda reg: type(reg) is str, deaths_df["District"].unique()))
invalid_districts = list(filter(lambda dist: '?' in dist, districts))

In [5]:
#print(invalid_districts)
fixed_districts = ['Trenčín', 'Lučenec', 'Čadca', 'Piešňany', 'Vranov nad Topľou', 'Stará Ľubovňa', 'Topoľčany', 'Rožňava', 'Veľký Krtíš', 'Levoča', 'Bytča', 'Šaľa']

deaths_df['District'] = deaths_df['District'].apply(lambda dist: fixed_districts[invalid_districts.index(dist)] if dist in invalid_districts else dist)

deaths_df.head()
#deaths_df.info()

Unnamed: 0,Date,Gender,District,AgeGroup,Type,Region,Day,Month,Year
0,30.3.2020,M,Prievidza,60,Doma,Tren?iansky,30,3,2020
1,4.4.2020,M,Pezinok,55,DSS,Bratislavský,4,4,2020
2,6.4.2020,F,Bratislava,65,Nemocnica,Bratislavský,6,4,2020
3,14.4.2020,M,Trnava,85,Nemocnica,Trnavský,14,4,2020
4,15.4.2020,M,Bratislava,60,Nemocnica,Bratislavský,15,4,2020


In [6]:
print(deaths_df['Gender'].unique())
print(deaths_df['District'].unique())
print(deaths_df['AgeGroup'].unique())
print(deaths_df['Type'].unique())
print(deaths_df['Region'].unique())
print(deaths_df['Type'].isnull().sum())
print(deaths_df['District'].isnull().sum())
print(deaths_df['Region'].isnull().sum())

['M' 'F']
['Prievidza' 'Pezinok' 'Bratislava' 'Trnava' 'Prešov' 'Poprad' 'Martin'
 'Trenčín' 'Dunajská Streda' 'Košice' 'Ružomberok' 'Nitra' 'Michalovce'
 'Lučenec' 'Galanta' 'Nové Zámky' 'Žiar nad Hronom' nan 'Partizánske'
 'Humenné' 'Čadca' 'Kežmarok' 'Komárno' 'Sobrance' 'Brezno' 'Skalica'
 'Bardejov' 'Žarnovica' 'Banská Bystrica' 'Piešňany' 'Trebišov' 'Zvolen'
 'Vranov nad Topľou' 'Dolný Kubín' 'Liptovský Mikuláš' 'Levice' 'Svidník'
 'Námestovo' 'Tvrdošín' 'Stará Ľubovňa' 'Medzilaborce' 'Rimavská Sobota'
 'Považská Bystrica' 'Topoľčany' 'Žilina' 'Myjava' 'Snina' 'Rožňava'
 'Veľký Krtíš' 'Levoča' 'Spišská Nová Ves' 'Bánovce nad Bebravou' 'Senica'
 'Detva' 'Poltár' 'Sabinov' 'Kysucké Nové Mesto' 'Zlaté Moravce' 'Revúca'
 'Hlohovec' 'Nové Mesto n.Váhom' 'Banská Štiavnica' 'Bytča' 'Malacky'
 'Ilava' 'Púchov' 'Krupina' 'Senec' 'Stropkov' 'Gelnica' 'Šaľa']
[ 60  55  65  85  80  90  95  70  75  15  50  30  45  40  25  35 100  20
   0  10]
['Doma' 'DSS' 'Nemocnica' 'Sanitka' nan]
['Tren?ia

In [8]:
deaths_df['Month'].unique()

array([ 3,  4,  5,  7,  8,  9, 10, 11, 12,  1,  2,  6], dtype=int64)

In [10]:
final_death_df = deaths_df.drop(['Date'], axis='columns')

In [11]:
final_death_df.head()

Unnamed: 0,Gender,District,AgeGroup,Type,Region,Day,Month,Year
0,M,Prievidza,60,Doma,Tren?iansky,30,3,2020
1,M,Pezinok,55,DSS,Bratislavský,4,4,2020
2,F,Bratislava,65,Nemocnica,Bratislavský,6,4,2020
3,M,Trnava,85,Nemocnica,Trnavský,14,4,2020
4,M,Bratislava,60,Nemocnica,Bratislavský,15,4,2020


In [12]:
final_death_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14434 entries, 0 to 14433
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Gender    14434 non-null  object
 1   District  14350 non-null  object
 2   AgeGroup  14434 non-null  int64 
 3   Type      14388 non-null  object
 4   Region    14350 non-null  object
 5   Day       14434 non-null  int64 
 6   Month     14434 non-null  int64 
 7   Year      14434 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 902.2+ KB
