In [None]:
import pandas as pd
from helper1 import *


EDA process for the cases dataset (cases_train.csv)

In [None]:
# Load the data frame returned by get_data_frame() called with its default
# arguments (the banner below labels it cases_train.csv -- presumably the cases
# training set; confirm against helper1).
df = get_data_frame()
print("---- Dataset -> cases_train.csv --------------------")
# The two returned values are unpacked as column names and their missing-value
# figures; both are handed to plot_bargraph in the next cell.
col_names, col_na = print_num_of_missing_vals(df)


In [None]:
# Bar chart of the missing-value figures collected for the cases frame,
# one bar per attribute.
plot_bargraph(
    'Missing Values (cases_train)',
    'Attributes',
    'Total percentage of values missing',
    col_names,
    col_na,
)

In [None]:
# Restrict the cases to the five countries with the most rows; the resulting
# frame feeds the country-level plots in the following cells.
country_frequencies = df['country'].value_counts()
top_5_countries = country_frequencies.nlargest(5).index
is_top_country = df['country'].isin(top_5_countries)
country_df = df[is_top_country]

In [None]:
# Outcome breakdown per country, limited to the five most frequent countries.
plot_countplot(
    country_df,
    'Top 5 Countries_vs_Outcome',
    'Countries',
    'Outcome',
    x_attribute='country',
    hue='outcome',
)

In [None]:
# Row count per country for the five most frequent countries (no hue split).
plot_countplot(
    country_df,
    'Top 5 Countries Frequency wise',
    'Countries',
    'Frequency',
    x_attribute='country',
)

In [None]:
# Sex split by outcome.
# NOTE(review): this plots country_df, i.e. only rows from the five most
# frequent countries, while the title refers to cases_train as a whole --
# confirm which frame is intended.
plot_countplot(
    country_df,
    'Sex vs Outcome (cases_train)',
    'Sex',
    'Outcome',
    x_attribute='sex',
    hue='outcome',
)

In [None]:
# Scatter of case coordinates: longitude on the x axis, latitude on the y axis.
plot_scatterplot(
    df=df,
    title='Longitude and Latitude (cases_train)',
    x_label='Longitude',
    y_label='Latitude',
    column_x='longitude',
    column_y='latitude',
)

In [None]:
# Top 3 provinces of each of the top 5 countries.
# Rows are selected per (country, province) pair so that a province name shared
# by two countries is only counted for the country in which it ranks in the
# top 3.
top_provs = []
top_prov_rows = pd.Series(False, index=country_df.index)
for c in top_5_countries:
    in_country = country_df['country'] == c
    # value_counts() ignores missing provinces; nlargest(3) matches the
    # "Top 3" promised by the plot title.
    top_states = country_df.loc[in_country, 'province'].value_counts().nlargest(3).index
    for state in top_states:
        # keep hue_order free of duplicates when countries share a province name
        if state not in top_provs:
            top_provs.append(state)
    top_prov_rows |= in_country & country_df['province'].isin(top_states)
top_provs_df = country_df[top_prov_rows]
plot_countplot(top_provs_df, title='Top 3 Provinces in Top 5 countries (cases_train)',
               x_label='Country and Provinces', y_label='Count', x_attribute='country', hue='province',
               hue_order=top_provs, class_order=top_5_countries)

In [None]:
# plot age frequency
# Keep only rows whose age is present and consists purely of digits.
isDigit_age_df = df[df['age'].notna()]
isDigit_age_df = isDigit_age_df.loc[isDigit_age_df['age'].str.isdigit()]
# Ages are held as strings, so a plain sort would be lexicographic
# ('10' before '2'); order the rows by their integer value instead.
numeric_rank = isDigit_age_df['age'].astype(int).to_numpy().argsort(kind='stable')
isDigit_age_df = isDigit_age_df.iloc[numeric_rank]
# The class order lists each distinct age once, in ascending numeric order,
# rather than repeating an entry for every row.
age_order = isDigit_age_df['age'].drop_duplicates()
plot_countplot(df=isDigit_age_df, title='Age Frequency (cases_train)', x_label='Age', y_label='Frequency',
               x_attribute='age', width=25, class_order=age_order)

In [None]:
# plot month frequency
# Parse the confirmation dates; values that cannot be parsed become NaT and
# those rows are dropped.
df['date_confirmation'] = pd.to_datetime(df['date_confirmation'], errors='coerce')
df = df[df['date_confirmation'].notna()]
# Only confirmations dated in 2020 are counted.
df_f = df.loc[df['date_confirmation'].dt.year == 2020]
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Count rows per calendar month on the 2020-only frame; months with no rows
# are reported as 0 so the list always lines up with month_names.
cases_per_month = df_f['date_confirmation'].dt.month.value_counts()
month_counts = cases_per_month.reindex(range(1, 13), fill_value=0).tolist()
plot_bargraph(title='Month Frequency (cases_train)', x_label='Month', y_label='Frequency', x_attribute=month_names,
              y_attribute=month_counts)

EDA process for the location dataset (location.csv)

In [None]:
# Load the data frame selected by the 'location' argument (the banner below
# labels it location.csv -- confirm the mapping in helper1).
location_df = get_data_frame('location')
print("---- Dataset -> location.csv --------------------")
# Rebinds col_names / col_na to the location frame's missing-value summary;
# both are handed to plot_bargraph in the next cell.
col_names, col_na = print_num_of_missing_vals(location_df)

In [None]:
# Bar chart of the missing-value figures collected for the location frame,
# one bar per attribute.
plot_bargraph(
    'Missing Values (location)',
    'Attributes',
    'Total percentage of values missing',
    col_names,
    col_na,
)

In [None]:
# Scatter of location coordinates: the Long_ column on the x axis and the Lat
# column on the y axis.
plot_scatterplot(
    df=location_df,
    title='Longitude and Latitude (location)',
    x_label='Longitude',
    y_label='Latitude',
    column_x='Long_',
    column_y='Lat',
)

In [None]:
# Ten countries with the largest summed 'Confirmed' value across their rows.
confirmed_per_country = location_df.groupby(['Country_Region'])['Confirmed'].sum()
top_10_countries_confirmed = confirmed_per_country.sort_values(ascending=False).nlargest(10)
plot_bargraph(
    title='Top 10 Confirmed cases countries (location)',
    x_label='Country',
    y_label='# of Confirmed cases',
    x_attribute=top_10_countries_confirmed.index,
    y_attribute=top_10_countries_confirmed.values,
)

In [None]:
# Top 10 countries with max Deaths
# The totals get their own name instead of reusing the "confirmed" variable,
# so the confirmed-case result is not silently overwritten with death counts.
# nlargest() already returns its result in descending order, so no separate
# sort is needed.
top_10_countries_deaths = location_df.groupby(['Country_Region'])['Deaths'].sum().nlargest(10)
plot_bargraph(title='Top 10 Deaths cases countries (location)', x_label='Country', y_label='# of Deaths',
              x_attribute=top_10_countries_deaths.index, y_attribute=top_10_countries_deaths.values)

In [None]:
# Top 10 countries with max Recovered cases
# The totals get their own name instead of reusing the "confirmed" variable,
# so the name always matches the column being summed.
# nlargest() already returns its result in descending order, so no separate
# sort is needed.
top_10_countries_recovered = location_df.groupby(['Country_Region'])['Recovered'].sum().nlargest(10)
plot_bargraph(title='Top 10 Recovered cases countries (location)', x_label='Country',
              y_label='# of Recovered cases', x_attribute=top_10_countries_recovered.index,
              y_attribute=top_10_countries_recovered.values)

In [None]:
# Top 10 countries with max Active cases
# The totals get their own name instead of reusing the "confirmed" variable,
# so the name always matches the column being summed.
# nlargest() already returns its result in descending order, so no separate
# sort is needed.
top_10_countries_active = location_df.groupby(['Country_Region'])['Active'].sum().nlargest(10)
plot_bargraph(title='Top 10 Active cases countries (location)', x_label='Country', y_label='# of Active cases',
              x_attribute=top_10_countries_active.index, y_attribute=top_10_countries_active.values)


In [None]:
# Ten countries that occur in the most rows of the location frame.
country_row_counts = location_df['Country_Region'].value_counts()
top_10_countries = country_row_counts.nlargest(10)
plot_bargraph(
    title='Top 10 Most Frequent countries (location)',
    x_label='Countries',
    y_label='Frequency',
    x_attribute=top_10_countries.index,
    y_attribute=top_10_countries.values,
)


In [None]:
# Top 3 most frequent provinces of each of the top 10 most frequent countries.
# Rows are selected per (country, province) pair and only from the top-10
# countries, so a province name that also exists in another country does not
# leak into the plot.
top_countries_df = location_df[location_df['Country_Region'].isin(top_10_countries.index)]
top_provs = []
top_prov_rows = pd.Series(False, index=top_countries_df.index)
for c in top_10_countries.index:
    in_country = top_countries_df['Country_Region'] == c
    # value_counts() ignores rows with a missing Province_State
    provs = top_countries_df.loc[in_country, 'Province_State'].value_counts().nlargest(3).index
    for prov in provs:
        # keep hue_order free of duplicates when countries share a province name
        if prov not in top_provs:
            top_provs.append(prov)
    top_prov_rows |= in_country & top_countries_df['Province_State'].isin(provs)
top_provs_df = top_countries_df[top_prov_rows]
plot_countplot(df=top_provs_df, title="Top Countries' Top 3 provinces (location)", x_label='Countries',
               y_label='Frequency', x_attribute='Country_Region', hue='Province_State', hue_order=top_provs,
               class_order=top_10_countries.index)

In [None]:
# Five rows of the location frame with the highest Incidence_Rate, labelled by
# their Combined_Key.
incidence_ranked = location_df.sort_values(by='Incidence_Rate', ascending=False)
top_incidence = incidence_ranked.head(5)
# plot_bargraph is handed plain Python floats for the bar heights
incidence_values = list(map(float, top_incidence['Incidence_Rate']))
plot_bargraph(
    title='Top 5 Incidence Rate regions (location)',
    x_label='Region',
    y_label='Incidence rate',
    x_attribute=top_incidence['Combined_Key'],
    y_attribute=incidence_values,
)

In [None]:
# Top case fatality ratio regions: the five rows of the location frame with the
# highest Case-Fatality_Ratio, labelled by their Combined_Key.
# Stored under its own name so the incidence-rate result from the previous
# cell is not overwritten with unrelated data.
top_fatality = location_df.sort_values(by='Case-Fatality_Ratio', ascending=False).head(5)
plot_bargraph(title='Top 5 Case Fatality Rate regions (location)', x_label='Region',
              y_label='Case Fatality Ratio rate', x_attribute=top_fatality['Combined_Key'],
              y_attribute=[float(i) for i in top_fatality['Case-Fatality_Ratio']])

