In [41]:
#Librerías necesarias
import pandas as pd
pd.set_option('display.max_columns', 25) # Número máximo de columnas a mostrar
pd.set_option('display.max_rows', 50) # Numero máximo de filas a mostar
# Ranom seed
import numpy as np
np.random.seed(3301)

# Seaborn
import seaborn as sns 

# Matplolib
%matplotlib inline
import matplotlib.pyplot as plt

#SKLearn
from sklearn import preprocessing as prc


In [2]:
# Ubicacion de la base de datos
db_location = "data/202120_Laboratorio_1_datos_SaludAlpes_diagnosticos_dataset.csv"

In [3]:
# Leer los datos
df = pd.read_csv(db_location, sep=";")

In [4]:
# Dimensiones de los datos
df.shape

(768, 11)

In [5]:
df.dtypes

Hair color                  object
Pregnancies                 object
Glucose                     object
City                        object
BloodPressure               object
SkinThickness               object
Insulin                     object
BMI                          int64
DiabetesPedigreeFunction    object
Age                          int64
Outcome                     object
dtype: object

In [6]:
# Seleccionar las columnas numericas
# Error: para comparar arreglos entrada a entrada hay que usar | y &
#number_cols = df_tracks.dtypes[df_tracks.dtypes == np.int64 or df_tracks.dtypes == np.float64].index
number_cols = df.dtypes[(df.dtypes == np.int64) | (df.dtypes == np.float64)].index
number_cols = df.select_dtypes(include = ['int','float']).columns
number_cols

Index(['BMI', 'Age'], dtype='object')

In [47]:
df["Hair color"].unique()

array(['Red', 'Black', 'Blue', nan], dtype=object)

In [8]:
df["Pregnancies"].unique()

array(['6', '1', '8', '0', '5', '3', '10', '2', '4', '7', '9', '11', '13',
       '15', '17', '12', '14', '-'], dtype=object)

Nos dimos cuenta que en las columnas, hay valores que deberían ser numéricos pero aparecen como objetos. En el caso de Pregnancies es porque hay valores que son '-' que en este caso asumiremos que son lo mismo que 0. 

In [9]:
df["Pregnancies"] = df["Pregnancies"].replace("-",0)
df["Pregnancies"] = pd.to_numeric(df["Pregnancies"])
df["Pregnancies"].unique()

array([ 6,  1,  8,  0,  5,  3, 10,  2,  4,  7,  9, 11, 13, 15, 17, 12, 14])

In [10]:
df["Glucose"].unique()

array(['148', '85', '183', '89', '137', '116', '78', '115', '197', '125',
       '110', '168', '139', '189', '166', '100', '118', '107', '103',
       '126', '99', '196', '119', '143', '147', '97', '145', '117', '109',
       '158', '88', '92', '122', '138', '102', '90', '111', '180', '133',
       '106', '171', '159', '146', '71', '105', '101', '176', '150', '73',
       '187', '84', '44', '141', '114', '95', '129', '79', '0', '62',
       '131', '112', '113', '74', '83', '136', '80', '123', '81', '134',
       '142', '144', '93', '163', '151', '96', '155', '76', '160', '124',
       '162', '132', '120', '173', '170', '128', '108', '154', '57',
       '156', '153', '188', '152', '104', '87', '75', '179', '130', '194',
       '181', '135', '184', '140', '177', '164', '91', '165', '86', '193',
       '191', '161', '167', '77', '182', '157', '178', '61', '98', '127',
       '82', '72', '172', '94', '175', '195', '68', '186', '198', '121',
       '-', '67', '174', '199', '56', '169', '149

Después de darnos 

In [11]:
df["Glucose"] = pd.to_numeric(df["Glucose"], errors = "coerce")

In [12]:
df["City"].value_counts()

New York    767
-             1
Name: City, dtype: int64

Tomamos la decisión de eliminar la columna de ciudad, ya que todos los valores son New York, y por lo tanto no es una columna con valores relevantes para el diagnóstico de pacientes con diabetes. 

In [13]:
del df["City"]

In [14]:
df["BloodPressure"].unique()

array(['72', '66', '64', '40', '74', '50', '0', '70', '96', '92', '80',
       '60', '84', '30', '88', '90', '94', '76', '82', '75', '58', '78',
       '68', '110', '56', '62', '85', '86', '48', '44', '65', '108', '55',
       '122', '54', '52', '98', '104', '95', '46', '102', '100', '61',
       '-', '24', '38', '106', '114'], dtype=object)

In [15]:
df["BloodPressure"] = pd.to_numeric(df["BloodPressure"], errors = "coerce")
df["BloodPressure"] = df["BloodPressure"].replace(0,np.nan)

In [16]:
df["SkinThickness"].value_counts()

0     226
32     31
30     27
27     23
23     22
     ... 
99      1
-       1
63      1
51      1
56      1
Name: SkinThickness, Length: 52, dtype: int64

In [38]:
df["SkinThickness"] = pd.to_numeric(df["SkinThickness"], errors = "coerce")

In [17]:
df["Insulin"].value_counts()

0      373
105     11
140      9
130      9
120      8
      ... 
270      1
310      1
478      1
545      1
321      1
Name: Insulin, Length: 187, dtype: int64

In [18]:
df["Insulin"] = pd.to_numeric(df["Insulin"], errors = "coerce")

In [19]:
df["Insulin"].unique()

array([  0.,  94., 168.,  88., 543., 846., 175., 230.,  83.,  96., 235.,
       146., 115., 140., 110., 245.,  54., 192., 207.,  70., 240.,  82.,
        36.,  23., 300., 342., 304., 142., 128.,  38., 100.,  90., 270.,
        71., 125., 176.,  48.,  64., 228.,  76., 220.,  40., 152.,  18.,
       135., 495.,  37.,  51.,  99., 145., 225.,  49.,  50.,  92., 325.,
        63., 284., 119., 204., 155., 485.,  53., 114., 105., 285., 156.,
        78., 130.,  55.,  58., 160., 210., 318.,  44., 190., 280.,  87.,
       271., 129., 120., 478.,  56.,  32., 744., 370.,  45., 194., 680.,
       402., 258., 375., 150.,  67.,  57., 116., 278., 122., 545.,  75.,
        74., 182., 360., 215., 184.,  42., 132., 148., 180., 205.,  85.,
       231.,  29.,  68.,  52., 255., 171.,  73., 108.,  43., 167., 249.,
       293.,  66., 465.,  89., 158.,  84.,  72.,  59.,  81., 196., 415.,
       275., 165., 579., 310.,  61., 474., 170., 277.,  60.,  14.,  95.,
       237., 191., 328., 250., 480., 265., 193.,  7

In [20]:
df["BMI"].value_counts()

32     13
316    12
312    12
0      11
324    10
       ..
311     1
307     1
303     1
302     1
671     1
Name: BMI, Length: 248, dtype: int64

In [21]:
df["BMI"] = df["BMI"].replace(0,np.nan)

In [22]:
df["DiabetesPedigreeFunction"].describe()

count     768
unique    514
top       254
freq        6
Name: DiabetesPedigreeFunction, dtype: object

In [23]:
df["DiabetesPedigreeFunction"] = pd.to_numeric(df["DiabetesPedigreeFunction"], errors = "coerce")

In [24]:
df["Age"].describe()

count     768.000000
mean       38.011719
std       117.825600
min        21.000000
25%        24.000000
50%        29.000000
75%        41.000000
max      3256.000000
Name: Age, dtype: float64

In [28]:
df["Age"].value_counts()

22.0    72
21.0    61
25.0    48
24.0    46
23.0    38
        ..
70.0     1
81.0     1
68.0     1
72.0     1
64.0     1
Name: Age, Length: 52, dtype: int64

In [27]:
df.loc[df["Age"] > 82] = np.nan

In [35]:
df["Outcome"].value_counts(normalize = True)

0    0.649673
1    0.350327
Name: Outcome, dtype: float64

In [34]:
df["Outcome"] = df["Outcome"].replace("-",np.nan)

In [50]:
df = df.dropna()

In [51]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,726.0,726.0,726.0,726.0,726.0,726.0,726.0,726.0
mean,3.860882,121.132231,72.393939,21.550964,84.173554,294.61708,436.257576,33.34022
std,3.357534,32.293424,12.385271,15.698874,116.961061,111.174653,335.754678,11.755144
min,0.0,0.0,24.0,0.0,0.0,20.0,1.0,21.0
25%,1.0,99.0,64.0,0.0,0.0,255.0,207.0,24.0
50%,3.0,117.0,72.0,24.0,47.0,312.0,344.5,29.0
75%,6.0,141.75,80.0,33.0,130.0,359.0,597.75,41.0
max,17.0,199.0,122.0,99.0,846.0,671.0,2329.0,81.0


In [None]:
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2]...)
>>> le.inverse_transform([0, 0, 1, 2])
array([1, 1, 2, 6])

In [52]:
le = prc.LabelEncoder()
le.fit(df["Hair color"])

LabelEncoder()

In [54]:
le.classes_

array(['Black', 'Blue', 'Red'], dtype=object)

In [56]:
df["Hair color"] = le.fit_transform(df["Hair color"])

In [57]:
df

Unnamed: 0,Hair color,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,6.0,148.0,72.0,35.0,0.0,336.0,627.0,50.0,1
1,0,1.0,85.0,66.0,29.0,0.0,266.0,351.0,31.0,0
2,2,8.0,183.0,64.0,0.0,0.0,233.0,672.0,32.0,1
3,0,1.0,89.0,66.0,23.0,94.0,281.0,167.0,21.0,0
4,0,0.0,137.0,40.0,35.0,168.0,431.0,2288.0,33.0,1
...,...,...,...,...,...,...,...,...,...,...
763,0,10.0,101.0,76.0,48.0,180.0,329.0,171.0,63.0,0
764,0,2.0,122.0,70.0,27.0,0.0,368.0,34.0,27.0,0
765,2,5.0,121.0,72.0,23.0,112.0,262.0,245.0,30.0,0
766,0,1.0,126.0,60.0,0.0,0.0,301.0,349.0,47.0,1
