# Data Cleaning

The data are records of COVID-19 tests carried out on 9th November 2020, taken from the Israeli Ministry of Health dataset (13,762 entries after cleaning).

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw test records. 'test_date' holds DD/MM/YYYY strings, and the last
# column ('Unnamed: 10') appears empty in the preview below.
df = pd.read_csv('israeli_ministry_of_health.csv')
df 

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication,Unnamed: 10
0,12/11/2020,0,0,0,0,0,negative,No,male,Other,
1,12/11/2020,0,1,0,0,0,negative,No,male,Other,
2,12/11/2020,0,0,0,0,0,negative,Yes,female,Other,
3,12/11/2020,0,0,0,0,0,negative,No,male,Other,
4,12/11/2020,0,1,0,0,0,negative,No,male,Contact with confirmed,
...,...,...,...,...,...,...,...,...,...,...,...
1048570,10/09/2020,0,0,0,0,0,negative,No,female,Other,
1048571,10/09/2020,0,0,0,0,0,negative,No,female,Other,
1048572,10/09/2020,0,0,0,0,0,negative,No,male,Other,
1048573,10/09/2020,0,0,0,0,0,negative,No,female,Other,


In [3]:
# Keep only the tests carried out on 9th November 2020 ('test_date' is a DD/MM/YYYY string).
# Columns are selected and renamed by NAME rather than by position, so an extra or
# reordered column in the CSV (such as the trailing 'Unnamed: 10') cannot silently shift
# the labels onto the wrong data. 'test_date' (redundant after the filter),
# 'age_60_and_above', 'gender' and 'Unnamed: 10' are not carried forward.
kept_columns = {
    'cough': 'Cough',
    'fever': 'Fever',
    'sore_throat': 'Sore Throat',
    'shortness_of_breath': 'Shortness of Breath',
    'head_ache': 'Headache',
    'corona_result': 'test_result',
    'test_indication': 'Contact with Infectious',
}
df = (
    df.loc[df['test_date'] == '09/11/2020', list(kept_columns)]
      .rename(columns=kept_columns)
)

In [4]:
# Count the missing entries in every column
null_values = df.isna().sum()

In [5]:
null_values # Every column reports 0 missing values, so nothing needs dropping or imputing

Cough                      0
Fever                      0
Sore Throat                0
Shortness of Breath        0
Headache                   0
test_result                0
Contact with Infectious    0
dtype: int64

In [6]:
df['Cough'].unique() # Sanity check: the flag is binary, only 0 and 1 occur

array([0, 1])

In [7]:
df['Fever'].unique() # Sanity check: the flag is binary, only 0 and 1 occur

array([0, 1])

In [8]:
df['Sore Throat'].unique() # Sanity check: the flag is binary, only 0 and 1 occur

array([0, 1])

In [9]:
df['Shortness of Breath'].unique() # Sanity check: the flag is binary, only 0 and 1 occur

array([0, 1])

In [10]:
df['Headache'].unique() # Sanity check: the flag is binary, only 0 and 1 occur

array([0, 1])

In [11]:
df['test_result'].unique() # Three labels: 'negative', 'positive' and 'other'; the 'other' rows are removed below

array(['negative', 'positive', 'other'], dtype=object)

In [12]:
# How many records carry each result label (shows how many 'other' rows will be dropped)
df['test_result'].value_counts()

negative    13381
positive      381
other         119
Name: test_result, dtype: int64

In [13]:
# Keep only the records with a negative or positive result; rows labelled 'other' are discarded
is_conclusive = df['test_result'] != 'other'
df = df.loc[is_conclusive]

In [14]:
df['test_result'].unique() # Confirms the 'other' rows are gone: only 'negative' and 'positive' remain

array(['negative', 'positive'], dtype=object)

In [15]:
# Bring the result column in line with the other title-cased headers. Renaming by name
# touches only this one column, so the remaining labels cannot be shifted onto the
# wrong data if the column order ever differs from what is expected.
df = df.rename(columns={'test_result': 'Test Result'})

In [16]:
df['Contact with Infectious'].unique() # Three categories; they are collapsed to Yes/No by infectious_mapping further down

array(['Other', 'Contact with confirmed', 'Abroad'], dtype=object)

In [17]:
# Human-readable labels are assigned first so that the visualisations make sense.
# All values are converted to numerical ones afterwards, before ML.

# Symptom flags: 1 -> 'Yes', 0 -> 'No'
binary_mapping = {
    1: 'Yes',
    0: 'No',
}

# Test outcome: capitalise the label
result_mapping = dict(positive='Positive', negative='Negative')

# Test indication: only contact with a confirmed case counts as 'Yes';
# both 'Other' and 'Abroad' collapse to 'No'
infectious_mapping = {
    'Contact with confirmed': 'Yes',
    'Other': 'No',
    'Abroad': 'No',
}

In [18]:
# Relabel the 0/1 symptom flags as 'No'/'Yes'. The replacement is restricted to the
# symptom columns (nested-dict form of replace), so a 0 or 1 that happens to appear
# in any other column is left untouched.
symptom_columns = ['Cough', 'Fever', 'Sore Throat', 'Shortness of Breath', 'Headache']
df = df.replace({column: binary_mapping for column in symptom_columns})

In [19]:
# Capitalise the result labels. Only the 'Test Result' column is rewritten, so the
# same strings in any other column would not be altered.
df = df.replace({'Test Result': result_mapping})

In [20]:
# Collapse the test indication to Yes/No. Only the 'Contact with Infectious' column is
# rewritten; a frame-wide replace would also turn an 'Other' or 'Abroad' found in any
# other column into 'No'.
df = df.replace({'Contact with Infectious': infectious_mapping})

In [21]:
# Preview the cleaned frame; the index still holds the row labels from the raw file
df

Unnamed: 0,Cough,Fever,Sore Throat,Shortness of Breath,Headache,Test Result,Contact with Infectious
43720,No,No,No,No,No,Negative,No
43721,No,No,No,No,No,Positive,No
43722,No,No,No,No,No,Negative,No
43723,No,No,No,No,No,Positive,No
43724,No,No,No,No,No,Positive,Yes
...,...,...,...,...,...,...,...
57596,No,No,No,No,No,Negative,No
57597,No,No,No,No,No,Negative,No
57598,No,No,No,No,No,Negative,No
57599,No,No,No,No,No,Negative,No


In [22]:
# Save the cleaned records. index=True (the pandas default, spelled out here) writes the
# original row labels as an unnamed first column; a plain pd.read_csv of this file will
# show it as 'Unnamed: 0' unless index_col=0 is passed.
df.to_csv('FinalData.csv', index=True)