<a href="https://colab.research.google.com/github/olavocarvlho/ironhack-codelabs/blob/master/Axur_Data_Science_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and helpers


In [None]:
# core libraries
import numpy as np
import pandas as pd
pd.pandas.set_option('display.max_columns', None)


# dataviz
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# google drive connector
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/drive')

## Others
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


# Data loading and cleaning

In [None]:
# Load the raw encounter table. The file is ";"-delimited, and every "?" cell
# is read as a missing value (NaN) because of na_values="?".
df_diabetes = pd.read_csv(
    "/content/drive/MyDrive/DS_CHALLENGE/diabetic_data.csv",
    sep=";",
    na_values="?",
)
print(df_diabetes.shape)

In [None]:
# Parse the diagnosis-code lookup file into a two-column frame.
# Each line has its tuple punctuation removed by the ordered substitutions
# below and is then split on commas.
punctuation_rules = (
    ("('", ""),
    ("', ", ","),
    (",(", ","),
    ("),", ""),
)

list_codes = []
with open("/content/drive/MyDrive/DS_CHALLENGE/diag_codes.txt", "r") as diag_codes:
  for line in diag_codes:
    stripped_line = line.strip()
    # Order matters: later patterns operate on the output of earlier ones.
    for old_text, new_text in punctuation_rules:
      stripped_line = stripped_line.replace(old_text, new_text)
    list_codes.append(stripped_line.split(','))

df_codes = pd.DataFrame(list_codes, columns=["diagnostic", "range"])

df_codes.head(20)

In [None]:
def collapse_diag(df, column):
    """Collapse a single diagnosis code into a coarse diagnostic group.

    Intended to be used row-wise, e.g.
    ``frame.apply(collapse_diag, axis=1, column="diag_1")``.

    Parameters
    ----------
    df : mapping-like (a DataFrame row / Series / dict)
        Object indexable by ``column``.
    column : hashable
        Key of the diagnosis code inside ``df``.

    Returns
    -------
    str or numpy.float64
        ``"Other"`` for missing codes and for codes starting with "E" or "V";
        otherwise the group label whose upper bound is the first one not
        exceeded by the truncated numeric code. Codes above 999 are returned
        as their truncated numeric value (kept for backward compatibility).

    Raises
    ------
    ValueError
        If the code is a string that is neither E/V-prefixed nor numeric.
    """
    value = df[column]

    # NaN never compares equal to anything (including itself), so the former
    # `value == np.nan` test was always False and missing codes fell through
    # to the numeric branch, returning NaN. pd.isna detects NaN/None properly.
    if pd.isna(value):
        return "Other"

    # Supplementary "E" and "V" codes are not numeric and are not grouped.
    if str(value)[0] in ("E", "V"):
        return "Other"

    code = np.trunc(float(value))

    # (inclusive upper bound, group label), in ascending order.
    groups = (
        (139, "infections"),
        (239, "neoplasms"),
        (279, "endocrine"),
        (289, "blood"),
        (319, "mental"),
        (359, "nervous"),
        (389, "sense"),
        (459, "circulatory"),
        (519, "respiratory"),
        (579, "digestive"),
        (629, "genitourinary"),
        (679, "pregnancy"),
        (709, "skin"),
        (739, "musculoskeletal"),
        (759, "congenital"),
        (779, "perinatal"),
        (799, "ill-defined"),
        (999, "injury"),
    )
    for upper_bound, label in groups:
        if code <= upper_bound:
            return label

    # Out-of-range codes are passed through unchanged, as before.
    return code

## Pre analysis

The dataset contains 101,766 observations and 21 features.


In [None]:
df_diabetes.info()

In [None]:
df_diabetes.isnull().sum()

In [None]:
df_diabetes.sample(7).T 

In [None]:
for column in df_diabetes.columns:
  print(column, df_diabetes[column].unique())
  print('\n') 

## Drop bad data

In [None]:
df_diabetes["gender"].value_counts()

In [None]:
df_diabetes.drop(df_diabetes.loc[df_diabetes["gender"]=="Unknown/Invalid"].index, axis=0, inplace=True)

In [None]:
# Drop encounters with an unknown race. The CSV is read with na_values="?",
# so the "?" placeholder is already NaN here and an equality test against "?"
# alone would match nothing; missing values are therefore matched as well.
unknown_race = df_diabetes["race"].isna() | (df_diabetes["race"] == "?")
df_diabetes.drop(df_diabetes.loc[unknown_race].index, axis=0, inplace=True)

In [None]:
df_diabetes.drop("weight", axis=1, inplace=True)

In [None]:
df_diabetes.drop("citoglipton", axis=1, inplace=True)

In [None]:
df_diabetes.drop("max_glu_serum", axis=1, inplace=True)

## Collapse features

### Age

In [None]:
df_diabetes["age"] = df_diabetes["age"].str[1:3] 
df_diabetes["age"] = df_diabetes["age"].replace("0-", "0")
df_diabetes["age"].value_counts()

In [None]:
# Fold every age bucket below 40 into the "40" bucket in a single pass.
under_40_buckets = ["0", "10", "20", "30"]
df_diabetes["age"] = df_diabetes["age"].replace(under_40_buckets, "40")

### Admission Type ID

In [None]:
# Group the eight admission-type codes into four labels.
# Any code not listed here becomes NaN, because Series.map is used.
admission_type_groups = {
    "Emergency": (1.0, 2.0, 7.0),
    "Elective": (3.0,),
    "New Born": (4.0,),
    "Other": (5.0, 6.0, 8.0),
}
admission_type_lookup = {
    code: label
    for label, codes in admission_type_groups.items()
    for code in codes
}
df_diabetes["admission_type_id"] = df_diabetes["admission_type_id"].map(admission_type_lookup)

print(df_diabetes["admission_type_id"].value_counts())

### Medical Specialty

In [None]:
print(df_diabetes["medical_specialty"].value_counts(normalize=True))
print(df_diabetes["medical_specialty"].value_counts().nlargest(10))

In [None]:
# Specialties kept as their own category (presumably chosen from the
# value_counts output of the previous cell -- TODO confirm).
top_9 = [
    "InternalMedicine",
    "Emergency/Trauma",
    "Family/GeneralPractice",
    "Cardiology",
    "Surgery-General",
    "Nephrology",
    "Orthopedics",
    "Orthopedics-Reconstructive",
    "Radiologist",
]

# Everything outside the list -- missing values included -- is relabelled "Other".
is_kept_specialty = df_diabetes["medical_specialty"].isin(top_9)
df_diabetes["medical_specialty"] = df_diabetes["medical_specialty"].where(is_kept_specialty, "Other")
print(df_diabetes["medical_specialty"].value_counts(normalize=True))


### Diagnosis 1 and 2

In [None]:
# Derive a "<column>_collapse" group column for each diagnosis column by
# running collapse_diag row by row.
for diag_column in ("diag_1", "diag_2"):
    df_diabetes[f"{diag_column}_collapse"] = df_diabetes.apply(
        collapse_diag, axis=1, column=diag_column
    )

In [None]:
df_diabetes["diag_1_collapse"].value_counts(normalize=True)

In [None]:
df_diabetes["diag_2_collapse"].value_counts(normalize=True)

In [None]:
# The raw codes are superseded by the *_collapse columns, so remove them.
# NOTE(review): cells in the "Data Balancing" section still index
# df_diabetes["diag_1"] / ["diag_2"] after this point -- confirm the intended order.
df_diabetes.drop(["diag_1", "diag_2"], axis=1, inplace=True)

### Max Glu Serum

In [None]:
# Merge the two elevated readings into a single "High" level.
# "max_glu_serum" is dropped in the "Drop bad data" section, so on a
# top-to-bottom run an unguarded lookup raises KeyError; only recode the
# column when it is still present.
if "max_glu_serum" in df_diabetes.columns:
    df_diabetes["max_glu_serum"] = df_diabetes["max_glu_serum"].replace({">200":"High",
                                                                         ">300":"High"})

## Drop duplicates

# Exploration and Visualization

## Readmitted
###### Type: **Categorical**
Days to inpatient readmission. Represents our **target feature**.
* No readmission  (~ 54%)
* Readmitted within 30 days (~11%)
* Readmitted after more than 30 days (~35%)


Our target variable is imbalanced. Readmissions within 30 days (**"<30"**) are far fewer than **"NO"** and **">30"**.



In [None]:
sns.countplot(x="readmitted", data=df_diabetes)
plt.title("Distribution of Target Values")
plt.show()

print(df_diabetes["readmitted"].value_counts(normalize=True))

## Encounter ID
###### Type: **Numeric**


## Patient NBR
###### Type: **Numeric**


## Race
###### Type: **Categorical**
There are 5 distinct race values:
- Caucasian          
- AfricanAmerican    
- Hispanic            
- Asian
- Other 


In [None]:
def plotRace():
  """Show a count plot of the "race" column of the global df_diabetes.

  Tick labels on the x axis are rotated 90 degrees. The function reads
  df_diabetes at call time, so it reflects the frame's current contents.
  """
  sns.countplot(data=df_diabetes, x="race")
  plt.xticks(rotation=90)
  plt.show()

plotRace()
print("Proportion of Race")
print(df_diabetes["race"].value_counts(normalize=True))

- **76 percent** of patients are Caucasian 
- Other **24 percent** is divided into African Americans, Hispanics, Asians and Others.

Here we decided to collapse into **Caucasian**, **African American** and **Other**.

In [None]:
# Fold the small groups into "Other". Series.map with a partial dict turns
# every value that is not a key (Caucasian, AfricanAmerican, Other) into NaN,
# wiping out most of the column; replace only touches the listed values and
# leaves the rest unchanged.
df_diabetes["race"] = df_diabetes["race"].replace({"Asian":"Other","Hispanic":"Other"})
plotRace()

## Gender
###### Type: **Categorical**
- We see a nearly equal distribution of **Gender**.

In [None]:
sns.countplot(x="gender", data=df_diabetes)
plt.show()

print(df_diabetes["gender"].value_counts(normalize=True))

## Age
###### Type: **Categorical**
- We can state that we have **elderly** patients

In [None]:
df_diabetes['age'].hist()

print(df_diabetes["age"].value_counts(normalize=True))

## Weight
##### Type: **Numeric**<br>
Description: Contains ~98% of missing values so there is no significance in filling those missing values, **dropped**.

## Admission Type ID
###### Type: **Categorical**
Integer identifier corresponding to:
- **1** Emergency
- **2** Urgent
- **3** Elective
- **4** Newborn
- **5** Not Available
- **6** NULL
- **7** Trauma Center
- **8** Not Mapped

We mapped **Urgent** and **Trauma Center** values as **Emergency** since they are all non elective admissions.

## Time in hospital
###### Type: **Categorical**
Number of days between admission and discharge.

In [None]:
sns.countplot(x="time_in_hospital", data=df_diabetes)
plt.show()

print(df_diabetes["time_in_hospital"].value_counts(normalize=True))

## Medical Specialty
###### Type: **Categorical**


In [None]:
sns.countplot(x="medical_specialty", data=df_diabetes)
plt.xticks(rotation=90)
plt.show() 



## Number of Lab Procedures
###### Type: **Numeric**
Number of lab tests during the encounter

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(16, 9)

sns.countplot(x="num_lab_procedures", data=df_diabetes, ax=ax)
plt.show() 

print(df_diabetes["num_lab_procedures"].value_counts(normalize=True))

In [None]:
sns.boxplot(x="num_lab_procedures", y="readmitted", data=df_diabetes)

## Number of Procedures
###### Type: **Numeric**
Number of procedures during the encounter

In [None]:
sns.countplot(x="num_procedures", data=df_diabetes)
plt.show() 

print(df_diabetes["num_procedures"].value_counts(normalize=True))

## Number of Medications
###### Type: **Numeric**


In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(16, 9)

sns.countplot(x="num_medications", data=df_diabetes, ax=ax)
plt.show() 

print(df_diabetes["num_medications"].value_counts(normalize=True))

In [None]:
sns.boxplot(x="num_medications", y="readmitted", data=df_diabetes)

## Number of Emergency
###### Type: **Numeric**


## Diagnoses 1 and 2
###### Type: **Categorical**



## Max Glu Serum
###### Type: **Categorical**
Indicates the range of the result or if the test was not taken.<br> 
Values: 
- “>200,” 
- “>300,”
- “normal,” 
- “none” if not measured<br>

Since ~94% of the values are "none", there is little value in keeping this feature, so it was **dropped**.

## Number of diagnoses
###### Type: **Numeric**


## Citoglipton
###### Type: **Categorical**


## Insulin
###### Type: **Categorical**


## Diabetes Medication
###### Type: **Categorical**
Indicates if there was any diabetic medication prescribed.

In [None]:
sns.countplot(x="diabetesMed", data=df_diabetes)
plt.show()

# Feature Engineering

## Re-encoding

In [None]:
# Encode the target as integers: NO -> 1, <30 -> 2, >30 -> 3.
# The replacement is scoped to "readmitted"; a frame-wide replace would also
# rewrite any identical string that happens to occur in another column.
df_diabetes["readmitted"] = df_diabetes["readmitted"].replace({"NO":1, "<30":2, ">30":3})

## Data Balancing

### Display numeric variables and the percentage of missing values

In [None]:
# Collect the int64 columns and, for each one, print its name, its number of
# missing values and the missing share as a percentage.

num_features = []
total_rows = df_diabetes.shape[0]
for column, column_dtype in df_diabetes.dtypes.items():
    if column_dtype != "int64":
        continue
    num_features.append(column)
    count_null = df_diabetes[column].isnull().sum()
    missing_pct = np.round(count_null/total_rows,2)*100
    print(column, count_null, missing_pct)

In [None]:
# exploring unique values in each column
for col in df_diabetes.columns:
  print(col, df_diabetes[col].unique())


In [None]:
# Select encounters whose primary and secondary diagnosis both equal the "?" placeholder.
# NOTE(review): the CSV is read with na_values="?", so "?" cells are presumably NaN by now
# and these equality tests would match nothing. "diag_1"/"diag_2" are also dropped in the
# "Diagnosis 1 and 2" section, so on a top-to-bottom run this lookup raises KeyError -- confirm.
df_no_diag = df_diabetes[(df_diabetes['diag_1'] == '?') & (df_diabetes['diag_2'] == '?')]

# Index label of the first selected row; raises IndexError when the selection is empty.
df_no_diag.index[0]

In [None]:
# Keep only encounters where neither diag_1 nor diag_2 equals the "?" placeholder.
# NOTE(review): with na_values="?" at load time the placeholder is presumably NaN, and NaN != "?"
# is True, so this filter would keep every row; the columns are also dropped in the
# "Diagnosis 1 and 2" section, which makes this raise KeyError on a top-to-bottom run -- confirm.
df_diabetes_clean = df_diabetes[(df_diabetes["diag_1"] != "?") & (df_diabetes["diag_2"] != "?")]
# Filtered row count minus original row count: zero or negative (rows removed).
print(df_diabetes_clean.shape[0] - df_diabetes.shape[0])

###  Look for discrete features

In [None]:
# A numeric feature counts as discrete when it has fewer than 30 distinct values.
discrete_features = []
for feature in num_features:
    distinct_values = df_diabetes_clean[feature].unique()
    if len(distinct_values) < 30:
        discrete_features.append(feature)

print(discrete_features)

#### Discrete features
'admission_type_id'
'time_in_hospital'
'num_procedures'
'number_diagnoses'

#### 

In [None]:
# Medical_specialty has 49% missing values; will drop for now and revisit if necessary.
# Weight has 97% missing values; the best thing is to just drop it.
# Primary (diag_1), secondary (diag_2) and additional (diag_3) diagnoses have very few missing values. Technically, if all three are missing, that is bad data.
# Since we are trying to predict readmissions, patients who died during this hospital admission have zero probability of readmission.

# "weight" is already dropped in place in the "Drop bad data" section, so a
# plain drop raises KeyError on a fresh top-to-bottom run; errors="ignore"
# makes this a no-op when the column is gone.
# NOTE(review): this rebinds df_diabetes_clean from df_diabetes, discarding the
# diagnosis filter assigned to the same name in an earlier cell -- confirm intended.
df_diabetes_clean = df_diabetes.drop(columns=["weight"], errors="ignore")