<a href="https://colab.research.google.com/github/pkmariya/Scaler01/blob/master/Apollo_CaseStudy_Mariya_DSML22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objective
### Apollo wants to know:
*   which variables are significant in predicting the reason for hospitalization for different regions
*   how well some variables like viral load, smoking, severity level describe the hospitalization charges

#### Import Packages

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import datetime as dt
import warnings
# Silence every warning category for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')

#### Import & Load data

In [14]:
# Source CSV for the Apollo Hospitals case study, read straight from the hosted URL
DATA_URL = "https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/001/681/original/scaler_apollo_hospitals.csv"
df = pd.read_csv(DATA_URL)

# Statistical Summary

### Meta-data of dataset

In [15]:
# Per-column non-null counts and dtypes, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               1338 non-null   int64  
 1   age                      1338 non-null   int64  
 2   sex                      1338 non-null   object 
 3   smoker                   1338 non-null   object 
 4   region                   1338 non-null   object 
 5   viral load               1338 non-null   float64
 6   severity level           1338 non-null   int64  
 7   hospitalization charges  1338 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 83.8+ KB


### Description of the dataframe

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0.1,Unnamed: 0,age,viral load,severity level,hospitalization charges
count,1338.0,1338.0,1338.0,1338.0,1338.0
mean,668.5,39.207025,10.221233,1.094918,33176.058296
std,386.391641,14.04996,2.032796,1.205493,30275.029296
min,0.0,18.0,5.32,0.0,2805.0
25%,334.25,27.0,8.7625,0.0,11851.0
50%,668.5,39.0,10.13,1.0,23455.0
75%,1002.75,51.0,11.5675,2.0,41599.5
max,1337.0,64.0,17.71,5.0,159426.0


In [None]:
# Summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

Unnamed: 0.1,Unnamed: 0,age,sex,smoker,region,viral load,severity level,hospitalization charges
count,1338.0,1338.0,1338,1338,1338,1338.0,1338.0,1338.0
unique,,,2,2,4,,,
top,,,male,no,southeast,,,
freq,,,676,1064,364,,,
mean,668.5,39.207025,,,,10.221233,1.094918,33176.058296
std,386.391641,14.04996,,,,2.032796,1.205493,30275.029296
min,0.0,18.0,,,,5.32,0.0,2805.0
25%,334.25,27.0,,,,8.7625,0.0,11851.0
50%,668.5,39.0,,,,10.13,1.0,23455.0
75%,1002.75,51.0,,,,11.5675,2.0,41599.5


In [None]:
# Summary (count, unique, top, freq) restricted to the object-dtype columns
df.describe(include='object')

Unnamed: 0,sex,smoker,region
count,1338,1338,1338
unique,2,2,4
top,male,no,southeast
freq,676,1064,364


### Shape of the data

In [None]:
# Report the dataset dimensions; df.shape is (rows, columns) and is unpacked into the message
print("Apollo dataset has {} rows and {} columns.".format(*df.shape))

Apollo dataset has 1338 rows and 8 columns.


### Column Names

In [None]:
# Column names
df.columns

Index(['Unnamed: 0', 'age', 'sex', 'smoker', 'region', 'viral load',
       'severity level', 'hospitalization charges'],
      dtype='object')

### Column data types

In [None]:
# Data type of each column
df.dtypes

Unnamed: 0                   int64
age                          int64
sex                         object
smoker                      object
region                      object
viral load                 float64
severity level               int64
hospitalization charges      int64
dtype: object

### Size & Count

In [None]:
# Total number of elements (rows x columns) -- this is not the number of records
df.size

10704

# Data Pre-processing

### Check for Missing Values

In [16]:
# Number of missing values in each column
df.isnull().sum()

Unnamed: 0                 0
age                        0
sex                        0
smoker                     0
region                     0
viral load                 0
severity level             0
hospitalization charges    0
dtype: int64

In [18]:
# Total number of missing values across the whole dataset
df.isnull().sum().sum()

0

#### **Observation**: *There are no missing values in any attribute of the given dataset*

### Check for duplicates

In [19]:
# Count rows that are exact duplicates of an earlier row (all columns are compared)
# NOTE(review): `Unnamed: 0` is part of the comparison and looks like a unique row id,
# which would keep any two rows from matching -- consider re-checking without that column.
df.duplicated().sum()

0

#### **Observation**: *There are no duplicate rows in the given dataset*

In [20]:
# NOTE(review): the result is only displayed, not assigned back, so `df` is left unchanged;
# the missing-value check earlier in this notebook reported zero nulls -- confirm this step is still needed.
df.interpolate()

Unnamed: 0.1,Unnamed: 0,age,sex,smoker,region,viral load,severity level,hospitalization charges
0,0,19,female,yes,southwest,9.30,0,42212
1,1,18,male,no,southeast,11.26,1,4314
2,2,28,male,no,southeast,11.00,3,11124
3,3,33,male,no,northwest,7.57,0,54961
4,4,32,male,no,northwest,9.63,0,9667
...,...,...,...,...,...,...,...,...
1333,1333,50,male,no,northwest,10.32,3,26501
1334,1334,18,female,no,northeast,10.64,0,5515
1335,1335,18,female,no,southeast,12.28,0,4075
1336,1336,21,female,no,southwest,8.60,0,5020


In [21]:
# Preview the first five rows
df.head()

Unnamed: 0.1,Unnamed: 0,age,sex,smoker,region,viral load,severity level,hospitalization charges
0,0,19,female,yes,southwest,9.3,0,42212
1,1,18,male,no,southeast,11.26,1,4314
2,2,28,male,no,southeast,11.0,3,11124
3,3,33,male,no,northwest,7.57,0,54961
4,4,32,male,no,northwest,9.63,0,9667


### Delete irrelevant columns

### Dataset Copy

In [38]:
# Work on an independent (deep) copy so the originally loaded frame stays untouched
work_df = df.copy(deep=True)

### Delete irrelevant columns

In [51]:
# Delete the columns that are not required/significant.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once
# 'Unnamed: 0' has already been removed.
work_df = work_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [52]:
# Dimensions of the working copy after the column drop
work_df.shape

(1338, 7)

### Data categories

In [53]:
# Categorical features: names of the columns whose dtype is `object`
cat_features = [
    col_name
    for col_name, col_dtype in work_df.dtypes.items()
    if col_dtype == 'object'
]
cat_features

['sex', 'smoker', 'region']

In [54]:
# Numeric features: every column that is neither text (object dtype) nor a datetime.
# The datetime test uses pandas' dtype helper rather than a string comparison, which
# previously relied on a malformed literal ('datetime64[ns') that could never match.
num_features = [
    col_name
    for col_name, col_dtype in work_df.dtypes.items()
    if col_dtype != 'object'
    and not pd.api.types.is_datetime64_any_dtype(col_dtype)
]
num_features

['age', 'viral load', 'severity level', 'hospitalization charges']

In [56]:
# Datetime features: columns holding datetime64 values (timezone-naive or timezone-aware).
# Comparing the dtype with the string 'datetime' does not match datetime64 columns,
# so the dtype helper from pandas is used instead.
date_features = [
    col_name
    for col_name, col_dtype in work_df.dtypes.items()
    if pd.api.types.is_datetime64_any_dtype(col_dtype)
]
date_features

[]