# Exercise 1 - Train a Classifier

## Task 1 - Let's observe and prepare the data

In [None]:
#DEPENDENCIES----------------------------------------
import subprocess
import sys
import os
import pip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import sklearn

def installFromNotebook(package):
    """Install `package` with pip, using the interpreter that runs this notebook.

    The pip module is launched through `sys.executable` in a child process.
    `subprocess.check_call` raises `subprocess.CalledProcessError` when the
    command exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install"]
    pip_command.append(package)
    subprocess.check_call(pip_command)

def upgradeFromNotebook(package):
    """Install or upgrade `package` with pip, using the notebook's interpreter.

    Same as a plain pip install, with the `--upgrade` flag added.
    `subprocess.check_call` raises `subprocess.CalledProcessError` when the
    command exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "--upgrade"]
    pip_command.append(package)
    subprocess.check_call(pip_command)

#Verification that the SDK is up to date
#The pip upgrade is run before the azureml imports below, so keep this order.
upgradeFromNotebook("azureml-sdk")
import azureml.core
from azureml.core import Dataset
from azureml.core import Workspace
#NOTE(review): presumably reads the workspace details from a local config file - confirm
ws = Workspace.from_config()

#Installing other dependencies
#plotly is installed first, then imported, so keep this order as well.
installFromNotebook("plotly")
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
#Select the "classic" matplotlib style for the plots that follow
plt.style.use("classic")

#Create the outputs/models folder (to store our models) and the sources folder.
#Each folder is created independently and exist_ok=True makes the call a no-op
#when the folder is already there, so an existing ./outputs/models/ no longer
#prevents ./sources/ from being created, and real errors (e.g. permissions)
#are no longer silently swallowed.
for folder in (r'./outputs/models/', r'./sources/'):
    os.makedirs(folder, exist_ok=True)

#Check on version and Workspace name
#One label/value pair per line; print() joins all arguments with single spaces.
print(
    "Libs version control --",
    "Azure ML SDK Version:", azureml.core.VERSION,
    "Pandas:", pd.__version__,
    "pip:", pip.__version__,
    "seaborn:", sns.__version__,
    'Scikit-learn:', sklearn.__version__,  #label fixed (was misspelled 'Scikit-kearn')
    'Workspace:', ws.name,
)
#----------------------------------------------------

# Dataset overview: Covid-19 Case Surveillance Public Use Dataset

## General considerations

https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Public-Use-Data/vbim-akqf

The COVID-19 case surveillance system database includes individual-level data reported to U.S. states and autonomous reporting entities, including New York City and the District of Columbia (D.C.), as well as U.S. territories and states. On April 5, 2020, COVID-19 was added to the Nationally Notifiable Condition List and classified as “immediately notifiable, urgent (within 24 hours)” by a Council of State and Territorial Epidemiologists (CSTE) Interim Position Statement (Interim-20-ID-01). CSTE updated the position statement on August 5, 2020 to clarify the interpretation of antigen detection tests and serologic test results within the case classification. The statement also recommended that all states and territories enact laws to make COVID-19 reportable in their jurisdiction, and that jurisdictions conducting surveillance should submit case notifications to CDC. COVID-19 case surveillance data are collected by jurisdictions and shared voluntarily with CDC.

For more information: wwwn.cdc.gov/nndss/conditions/coronavirus-disease-2019-covid-19/case-definition/2020/08/05/.

The dataset contains 13.4 million rows of deidentified patient data (more than 3 GB). To run this lab smoothly, we will use an extract of 100,000 data points.

## Data points description

| Variable        | Description      | Source        | Values        | Type	        | Calculation (if applicable)      |
| ------|-----|-----|-----|-----|-----|
| **cdc_report_dt**  	| Date case was first reported to the CDC 	| Calculated 	| YYYY-MM-DD 	| Date 	| Deprecated; CDC recommends researchers use cdc_case_earliest_dt in time series and other analyses. This date was populated using the date at which a case record was first submitted to the database. If missing, then the report date entered on the case report form was used. If missing, then the date at which the case first appeared in the database was used. If none available, then left blank.  	|
| **cdc_case_earliest_dt**  	| The earlier of the Clinical Date (date related to the illness or specimen collection) or the Date Received by CDC 	| Calculated 	| YYYY-MM-DD 	| Date 	| Cdc_case_earliest_dt uses the best available date from the set of dates related to illness/specimen collection and the set of dates related to when a case is reported. It is an option to end-users who need a date variable with optimized completeness. The logic of cdc_case_earliest_dt is to use the non-null date of one variable when the other is null and to use the earliest valid date when both dates are available.  If no date available, then left blank. 	|
| **pos_spec_dt**  	| Date of first positive specimen collection 	| Case Report Form 	| YYYY-MM-DD 	| Date 	|  	|
| **onset_dt**  	| Date of symptom onset 	| Case Report Form 	| YYYY-MM-DD 	| Date 	|  	|
| **current_status**  	| What is the current status of this person? 	| Case Report Form 	| [Laboratory-confirmed case - Probable case] 	| String 	| Please see the latest CSTE case definition for more information. 	|
| **sex**  	| Gender 	| Case Report Form 	| [Male - Female - Unknown - Other - Missing - NA] 	| String 	|  	|
| **age_group**  	| Age group categories 	| Calculated 	| [0 - 9 Years - 10 - 19 Years - 20 - 29 Years - 30 - 39 Years - 40 - 49 Years - 50 - 59 Years - 60 - 69 Years - 70 - 79 Years - 80 + Years - Missing - NA] 	| String 	| The age group categorizations were populated using the age value that was reported on the case report form. Date of birth was used to fill in missing/unknown age values using the difference in time between date of birth and onset date. 	|
| **race_ethnicity_combined**  	| Race and Ethnicity (combined) 	| Calculated 	| [American Indian/Alaska Native, Non-Hispanic - Asian, Non-Hispanic - Black, Non-Hispanic - Multiple/Other, Non-Hispanic - Native Hawaiian/Other Pacific Islander, Non-Hispanic - White, Non-Hispanic - Hispanic/Latino - Unknown - Missing - NA] 	| String 	| If more than one race was reported, race was categorized into multiple/other races. 	|
| **hosp_yn**  	| Was the patient hospitalized? 	| Case Report Form 	| [Yes - No - Unknown - Missing] 	| Character 	|  	|
| **icu_yn**  	| Was the patient admitted to an intensive care unit (ICU)? 	| Case Report Form 	| [Yes - No - Unknown - Missing] 	| Character 	|  	|
| **death_yn**  	| Did the patient die as a result of this illness? 	| Case Report Form 	| [Yes - No - Unknown - Missing] 	| Character 	|  	|
| **medcond_yn**  	| Pre-existing medical conditions? 	| Case Report Form 	| [Yes - No - Unknown - Missing] 	| Character 	|  	|

In [None]:
#LOAD DATA AND HAVE A FIRST OVERVIEW-----------------
from io import StringIO

#Get a fraction of the dataset (to observe the first rows rapidly)
#A timeout keeps the cell from hanging forever if the server does not answer
Covid19_CSPUD_JSON = requests.get('https://data.cdc.gov/resource/vbim-akqf.json', timeout=60)
#Stop here on an HTTP error instead of trying to parse an error page as data
Covid19_CSPUD_JSON.raise_for_status()
#Wrap the text in StringIO: passing a literal JSON string to read_json is
#deprecated in recent pandas versions
data = pd.read_json(StringIO(Covid19_CSPUD_JSON.text))
#Observe top 5 lines
data.head(5)
#----------------------------------------------------

In [None]:
#DATA GENERAL DESCRIPTION-----------------
#Get a bigger version of the dataset from GitHub (must be 'raw' version of the file and on a public repo)
#url = 'https://raw.githubusercontent.com/solliancenet/azure-ml-data-science-400/blob/master/day-02/data/COVID-19_Case_Surveillance_Public_Use_Data_shuffled_100000.csv'
url = 'https://raw.githubusercontent.com/pauldenoyes/tool-kit/master/COVID-19_Case_Surveillance_Public_Use_Data_shuffled_100000.csv'
#Malformed lines are skipped with a warning. 'error_bad_lines' was deprecated
#in pandas 1.3 and removed in 2.0; 'on_bad_lines' is its replacement. Older
#pandas versions reject the new keyword with a TypeError, so fall back to the
#old one in that case.
try:
    data = pd.read_csv(url, on_bad_lines='warn')
except TypeError:
    data = pd.read_csv(url, error_bad_lines=False)

#View columns where many data is missing
print('>> Null count by variable \r\n')
print(data.isnull().sum())

#In the data description, we have learned that 'cdc_report_dt' is deprecated
#and 'pos_spec_dt' and 'onset_dt' are way too incomplete, in regard to a total
#of 100,000 rows, to be used. So we remove those columns.
data = data.drop(['cdc_report_dt', 'pos_spec_dt', 'onset_dt'], axis=1)

#Now let's have a summarized look at our data
print('\r\n>> Data general description \r\n')
print(data.describe().transpose())
#----------------------------------------------------

In [None]:
#OBSERVE DATA----------------------------------------

##CONFIRMED CASES
#Count the rows per status once; the index of the result holds the labels in
#the same order as the counts, so names and values can never get out of step
status_counts = data['current_status'].value_counts()
values = status_counts.tolist()
#Print unique status values
print('\r\n\t', '>> Unique cases status values:', data['current_status'].unique())
#Use short names for those values for display purposes. The lookup is done on
#the actual label (case-insensitive); an unexpected label is shown as-is
#instead of being silently given the wrong name.
short_status_names = {
    'laboratory-confirmed case': 'Confirmed',
    'probable case': 'Probable',
}
names = [
    short_status_names.get(str(status).strip().lower(), str(status))
    for status in status_counts.index
]

#Make pie chart
fig = px.pie(
    names=names,
    values=values,
    title="Case Status Pie Chart",
    color_discrete_sequence=px.colors.sequential.RdBu,
)
#Display the pie chart
fig.show()

##GENDER DISTRIBUTION
#Take labels and counts from the same value_counts() result so every slice is
#labelled with the category it really counts, whatever the ordering or the
#number of categories in the data
sex_counts = data['sex'].value_counts()
values = sex_counts.tolist()
print('\r\n\t', '>> Unique cases gender values:', data['sex'].unique())
names = sex_counts.index.astype(str).tolist()
fig = px.pie(
    names=names,
    values=values,
    title="Gender Status Distribution",
    color_discrete_sequence=px.colors.sequential.Bluyl_r,
)
fig.show()

##AGE GROUP DISTRIBUTION
#Take labels and counts from the same value_counts() result so every bar is
#labelled with the age group it really counts
age_counts = data['age_group'].value_counts()
values = age_counts.tolist()
print('\r\n\t', '>> Age bins:', data['age_group'].unique())
names = age_counts.index.astype(str).tolist()
fig = px.bar(
    x=names,
    y=values,
    title="Age Group Distribution",
    labels={
        'x': 'Age Group',
        'y': 'Number of Patients'
    },
    color=values
)
fig.show()

#ETHNICITY
#Take labels and counts from the same value_counts() result so every slice is
#labelled with the category it really counts (the former hand-typed list also
#contained misspelled labels)
ethnicity_counts = data['race_ethnicity_combined'].value_counts()
values = ethnicity_counts.tolist()
print('\r\n\t', '>> Race and ethnicity (combined):', data['race_ethnicity_combined'].unique())
names = ethnicity_counts.index.astype(str).tolist()
fig = px.pie(
    names=names,
    values=values,
    title="Distribution of Races and Ethnicities",
    color_discrete_sequence=px.colors.sequential.Electric,
)
fig.show()

#DEATH STATUS BY GENDER
#Keep only the rows whose gender is one of the three named categories
named_gender_rows = data[data['sex'].isin(['Male', 'Female', 'Other'])]
plt.figure(figsize=(9, 7))
plt.style.use("fivethirtyeight")
#Horizontal count plot: one group of bars per death status, one bar per gender
death_axes = sns.countplot(y="death_yn", hue='sex', data=named_gender_rows)
death_axes.set_xlabel("Count")
death_axes.set_ylabel("Death Status")
death_axes.set_title('Death Status by Gender')
plt.show()
#----------------------------------------------------

In [None]:
#ENCODE DATA FOR THE MACHINE (Categorical to numeric)
from sklearn.preprocessing import LabelEncoder
lb_encoder = LabelEncoder()
data['current_status'] = lb_encoder.fit_transform(data['current_status'])
data['age_group'] = lb_encoder.fit_transform(data['age_group'])
data['race_ethnicity_combined'] = lb_encoder.fit_transform(data['race_ethnicity_combined'])
data['sex'] = lb_encoder.fit_transform(data['sex'])
data['hosp_yn'] = lb_encoder.fit_transform(data['hosp_yn'])
data['icu_yn'] = lb_encoder.fit_transform(data['icu_yn'])
data['death_yn'] = lb_encoder.fit_transform(data['death_yn'])
data['medcond_yn'] = lb_encoder.fit_transform(data['medcond_yn'])
#Store exploitable data variable to use it in other NoteBooks
%store data
data.head(5)
#----------------------------------------------------