In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Duplicate key in file '/Users/martin/.matplotlib/matplotlibrc', line 2 ('backend: TkAgg')


In [2]:
def get_project_path():
    """
    Return the absolute project path.

    The path is read from the ``PWD`` environment variable. ``PWD`` is a
    shell convention and is not guaranteed to exist (it is typically absent
    on Windows, for example), so when it is missing or empty the current
    working directory is returned instead of ``None``.

    Returns
    -------
    str
        Value of ``PWD`` when set and non-empty, otherwise ``os.getcwd()``.
    """
    # `or` also covers an empty-string PWD; returning None or "" here would
    # make later os.path.join() calls fail or silently build relative paths.
    return os.environ.get("PWD") or os.getcwd()

def add_project_path():
    """
    Make the project root importable: append the path returned by
    get_project_path() to sys.path unless it is already listed there.
    """
    root = get_project_path()
    if root in sys.path:
        return  # already registered, nothing to do
    sys.path.append(root)
        
# Put the project root on sys.path before importing from the `src` package.
add_project_path()
# With the root registered, the absolute `src.*` imports below resolve.
from src.helper import transform_cat, is_cat
from src.helper import fix_age, fix_gender, fix_pressure

In [3]:
# Directory holding the diabetes CSV files, resolved under the project root.
data_path = os.path.join(get_project_path(), "data/diabetes")

In [4]:
def _read_indexed(filename):
    """Read one CSV from data_path and index the frame by its 'id' column."""
    return pd.read_csv(os.path.join(data_path, filename)).set_index('id')


# Analysis columns and the separate per-id info columns, for train and test.
df_train = _read_indexed("diabetes_train_analysis.csv")
df_test = _read_indexed("diabetes_test_analysis.csv")
df_train_info = _read_indexed("diabetes_train_info.csv")
df_test_info = _read_indexed("diabetes_test_info.csv")

# Show which columns each kind of file contributes.
for loaded in (df_train, df_train_info):
    print(loaded.columns)

Index(['cholesterol', 'gluc', 'smoke', 'alco', 'active', 'pressure',
       'diabetes'],
      dtype='object')
Index(['age', 'height', 'weight', 'gender'], dtype='object')


In [5]:
# Peek at the first rows of the analysis frame, then of the info frame.
for preview_source in (df_train, df_train_info):
    print(preview_source.head())

      cholesterol gluc  smoke  alco  active pressure  diabetes
id                                                            
62538         low  low      0     0       1   100/80         0
49159         low  low      0     0       1   120/82         0
60683         low  low      0     0       1   120/80         0
42924         low  low      0     0       0   120\80         0
52888         low  low      0     0       0   120/80         0
      age  height  weight gender
id                              
0      50     168    62.0      f
1      55     156    85.0      m
2   18857     165    64.0   male
3   17623     169    82.0      f
4      47     156    56.0      m


In [6]:
# Attach the info columns (age, height, weight, gender) to the analysis columns.
# Both frames were indexed by 'id' at load time and join() aligns on the index;
# with its default arguments it is a left join, so every analysis row is kept.
diabetes_train = df_train.join(df_train_info)
diabetes_test = df_test.join(df_test_info)

In [7]:
# Stack train and test into a single frame, tagging each row's origin in the
# "ind" column so the two sets can be told apart after the shared cleaning.
diabetes = pd.concat([diabetes_train.assign(ind="train"), diabetes_test.assign(ind="test")])
# To split the frame back into its two parts:
# train, test = diabetes[diabetes["ind"].eq("train")], diabetes[diabetes["ind"].eq("test")]

In [8]:
# Build the null mask once and report on it: the fraction of missing values
# per column first, then a describe() summary of the boolean mask.
null_mask = diabetes.isna()
print(null_mask.mean())
print(null_mask.describe(include='all'))

# Keep only the rows whose weight is present.
diabetes = diabetes.dropna(subset=["weight"])

cholesterol    0.000000
gluc           0.000000
smoke          0.000000
alco           0.000000
active         0.000000
pressure       0.000000
diabetes       0.000000
age            0.000000
height         0.000000
weight         0.033114
gender         0.000000
ind            0.000000
dtype: float64
       cholesterol   gluc  smoke   alco active pressure diabetes    age  \
count        70000  70000  70000  70000  70000    70000    70000  70000   
unique           1      1      1      1      1        1        1      1   
top          False  False  False  False  False    False    False  False   
freq         70000  70000  70000  70000  70000    70000    70000  70000   

       height weight gender    ind  
count   70000  70000  70000  70000  
unique      1      2      1      1  
top     False  False  False  False  
freq    70000  67682  70000  70000  


In [9]:
# Text-valued columns paired with the category labels handed to transform_cat.
categorical_features = [
    ("cholesterol", ['low', 'medium', 'high']),
    ("gluc", ['low', 'medium', 'high'])
]
for column, levels in categorical_features:
    if is_cat(diabetes[column]):
        continue  # skip columns that is_cat() already accepts
    as_category = diabetes[column].astype("category")
    diabetes[column] = transform_cat(as_category, levels)

# Clean the remaining raw columns with the project helpers.
diabetes["age"] = fix_age(diabetes["age"])
diabetes["gender"] = fix_gender(diabetes["gender"])
# fix_pressure yields two values, unpacked into the high and low columns.
pressure_parts = fix_pressure(diabetes["pressure"])
diabetes["pressure_high"], diabetes["pressure_low"] = pressure_parts
diabetes["height"] = diabetes["height"].astype("float")

In [10]:
# Check column dtypes and non-null counts after the cleaning step.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67682 entries, 62538 to 94772
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cholesterol    67682 non-null  int8   
 1   gluc           67682 non-null  int8   
 2   smoke          67682 non-null  int64  
 3   alco           67682 non-null  int64  
 4   active         67682 non-null  int64  
 5   pressure       67682 non-null  object 
 6   diabetes       67682 non-null  int64  
 7   age            67682 non-null  float64
 8   height         67682 non-null  float64
 9   weight         67682 non-null  float64
 10  gender         67682 non-null  int64  
 11  ind            67682 non-null  object 
 12  pressure_high  67682 non-null  float64
 13  pressure_low   67682 non-null  float64
dtypes: float64(5), int64(5), int8(2), object(2)
memory usage: 8.9+ MB


In [22]:
# The raw "pressure" text column has been split into pressure_high and
# pressure_low, so it is left out of the cleaned dataset.
clean_diabetes_dataset = diabetes.drop(columns="pressure")

In [23]:
# Display the cleaned frame; the notebook renders a truncated view of it.
clean_diabetes_dataset

Unnamed: 0_level_0,cholesterol,gluc,smoke,alco,active,diabetes,age,height,weight,gender,ind,pressure_high,pressure_low
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
62538,0,0,0,0,1,0,54.000000,169.0,76.0,1,train,100.0,80.0
49159,0,0,0,0,1,0,49.000000,165.0,65.0,0,train,120.0,82.0
60683,0,0,0,0,1,0,60.169863,170.0,56.0,0,train,120.0,80.0
42924,0,0,0,0,0,0,55.580822,169.0,62.0,0,train,120.0,80.0
52888,0,0,0,0,0,0,44.389041,166.0,67.0,0,train,120.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95012,0,0,0,0,0,0,52.010959,168.0,78.0,0,test,120.0,80.0
87194,1,0,0,0,1,0,52.000000,160.0,79.0,0,test,120.0,80.0
92108,0,0,1,1,1,0,55.000000,189.0,72.0,1,test,130.0,80.0
89873,0,0,0,0,1,0,52.000000,159.0,65.0,0,test,120.0,80.0


In [24]:
# Write the cleaned dataset next to the input files.
output_csv = os.path.join(data_path, "diabetes_v3.csv")
clean_diabetes_dataset.to_csv(output_csv)