In [2]:
import pandas as pd
import numpy as np
## Render DataFrames as plain-text tables rather than HTML in cell outputs.
pd.set_option("display.notebook_repr_html", False)

In [3]:
## Load the iris table from the remote CSV file. The parser options are
## spelled out: comma-separated fields, no thousands separator and "." as
## the decimal mark.
IRIS_URL = "https://raw.githubusercontent.com/jdvelasq/datalabs/master/datasets/iris.csv"
df = pd.read_csv(IRIS_URL, sep=",", thousands=None, decimal=".")

## Preview the first rows.
df.head()

   Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

## Variables dummy

In [4]:
##
## Build one indicator (dummy) column per category of the Species column.
## dtype=int keeps the 0/1 encoding; pandas >= 2.0 returns boolean
## True/False columns when no dtype is given.
##
pd.get_dummies(df["Species"], dtype=int)

     setosa  versicolor  virginica
0         1           0          0
1         1           0          0
2         1           0          0
3         1           0          0
4         1           0          0
..      ...         ...        ...
145       0           0          1
146       0           0          1
147       0           0          1
148       0           0          1
149       0           0          1

[150 rows x 3 columns]

## Inserción de filas

In [5]:
## Add one row at the end of the table.
## DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
## so the one-row frame is stacked under df with pd.concat instead.
## ignore_index=True renumbers the result, so the new row takes the next
## integer label after the existing rows. df itself is not modified.

u = pd.DataFrame(
    {
        "Sepal_Length": [1],
        "Sepal_Width": [1],
        "Petal_Length": [1],
        "Petal_Width": [1],
        "Species": ["setosa"],
    }
)

pd.concat([df, u], ignore_index=True).tail()

     Sepal_Length  Sepal_Width  Petal_Length  Petal_Width    Species
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica
150           1.0          1.0           1.0          1.0     setosa

# Casos duplicados y datos faltantes


In [7]:
## Draw 20 distinct row labels at random from 0..149.
## The draw comes from a locally seeded generator, so Restart & Run All
## always selects the same rows; the global NumPy random state is untouched.

u = np.random.default_rng(seed=12345).choice(
    150,
    size=20,
    replace=False)
u

array([ 15,  78, 128, 103, 122,  89, 106,  58,  67,  72, 110,   6,  84,
       149, 112, 125, 143,  11,  26,  14])

In [8]:
## Subsample of the original DataFrame "df": the rows whose labels are in u,
## taken as an explicit copy.
y = df.loc[u, :].copy()
y.head(n=5)

     Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
15            5.7          4.4           1.5          0.4      setosa
78            6.0          2.9           4.5          1.5  versicolor
128           6.4          2.8           5.6          2.1   virginica
103           6.3          2.9           5.6          1.8   virginica
122           7.7          2.8           6.7          2.0   virginica

In [9]:
## Relabel the rows of y as 0..len(y)-1, discarding the labels inherited
## from df. reset_index(drop=True) derives the new labels from y itself, so
## this cell does not hard-code the number of sampled rows.
y = y.reset_index(drop=True)
y

    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
0            5.7          4.4           1.5          0.4      setosa
1            6.0          2.9           4.5          1.5  versicolor
2            6.4          2.8           5.6          2.1   virginica
3            6.3          2.9           5.6          1.8   virginica
4            7.7          2.8           6.7          2.0   virginica
5            5.5          2.5           4.0          1.3  versicolor
6            4.9          2.5           4.5          1.7   virginica
7            6.6          2.9           4.6          1.3  versicolor
8            5.8          2.7           4.1          1.0  versicolor
9            6.3          2.5           4.9          1.5  versicolor
10           6.5          3.2           5.1          2.0   virginica
11           4.6          3.4           1.4          0.3      setosa
12           5.4          3.0           4.5          1.5  versicolor
13           5.9          3.0     

In [10]:
## Several records of the subsample "y" are made incomplete by replacing
## their "Sepal_Length" value with NaN; this cell picks which ones.
## 10 distinct rows out of 0..19 are drawn from a locally seeded generator,
## so the same rows are chosen on every Restart & Run All.

u = np.random.default_rng(seed=54321).choice(20, size=10, replace=False)
u

array([ 6, 18, 10,  0,  2,  1,  8, 14, 19,  3])

In [11]:
## Preview the rows of y located at the positions listed in u.
y.iloc[u, :]

    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
6            4.9          2.5           4.5          1.7   virginica
18           5.0          3.4           1.6          0.4      setosa
10           6.5          3.2           5.1          2.0   virginica
0            5.7          4.4           1.5          0.4      setosa
2            6.4          2.8           5.6          2.1   virginica
1            6.0          2.9           4.5          1.5  versicolor
8            5.8          2.7           4.1          1.0  versicolor
14           6.8          3.0           5.5          2.1   virginica
19           5.8          4.0           1.2          0.2      setosa
3            6.3          2.9           5.6          1.8   virginica

In [13]:
## Cases with missing data: set Sepal_Length to NaN on the rows labelled by u.
target_rows, target_column = u, "Sepal_Length"
y.loc[target_rows, target_column] = np.nan
y

    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
0            NaN          4.4           1.5          0.4      setosa
1            NaN          2.9           4.5          1.5  versicolor
2            NaN          2.8           5.6          2.1   virginica
3            NaN          2.9           5.6          1.8   virginica
4            7.7          2.8           6.7          2.0   virginica
5            5.5          2.5           4.0          1.3  versicolor
6            NaN          2.5           4.5          1.7   virginica
7            6.6          2.9           4.6          1.3  versicolor
8            NaN          2.7           4.1          1.0  versicolor
9            6.3          2.5           4.9          1.5  versicolor
10           NaN          3.2           5.1          2.0   virginica
11           4.6          3.4           1.4          0.3      setosa
12           5.4          3.0           4.5          1.5  versicolor
13           5.9          3.0     

In [14]:
## Stack the two DataFrames: the rows of y go below the rows of df.
## The rows coming from y are either exact copies of rows of df
## (duplicates) or copies with a missing Sepal_Length.
## ignore_index=True renumbers the result as 0..len(w)-1, which replaces a
## hard-coded range(170) that only fits a 150 + 20 row stack. With those
## sizes the rows from y are labelled 150 to 169 (cases 151 to 170).
w = pd.concat([df, y], ignore_index=True)
w.tail(25)

     Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
145           6.7          3.0           5.2          2.3   virginica
146           6.3          2.5           5.0          1.9   virginica
147           6.5          3.0           5.2          2.0   virginica
148           6.2          3.4           5.4          2.3   virginica
149           5.9          3.0           5.1          1.8   virginica
150           NaN          4.4           1.5          0.4      setosa
151           NaN          2.9           4.5          1.5  versicolor
152           NaN          2.8           5.6          2.1   virginica
153           NaN          2.9           5.6          1.8   virginica
154           7.7          2.8           6.7          2.0   virginica
155           5.5          2.5           4.0          1.3  versicolor
156           NaN          2.5           4.5          1.7   virginica
157           6.6          2.9           4.6          1.3  versicolor
158           NaN   

In [15]:
## Duplicated cases: keep="first" flags every row that repeats an earlier one.
## Note that case 142 is also reported as a duplicate.
is_repeat = w.duplicated(keep="first")
w.loc[is_repeat]

     Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
142           5.8          2.7           5.1          1.9   virginica
154           7.7          2.8           6.7          2.0   virginica
155           5.5          2.5           4.0          1.3  versicolor
157           6.6          2.9           4.6          1.3  versicolor
159           6.3          2.5           4.9          1.5  versicolor
161           4.6          3.4           1.4          0.3      setosa
162           5.4          3.0           4.5          1.5  versicolor
163           5.9          3.0           5.1          1.8   virginica
165           7.2          3.2           6.0          1.8   virginica
166           6.8          3.2           5.9          2.3   virginica
167           4.8          3.4           1.6          0.2      setosa

In [16]:
## Unique (non-duplicated) cases.
## Cases holding NaN values are kept as unique rows.
## Note that several of the rows stacked after the original table are removed.
deduplicated = w.drop_duplicates(keep="first")
deduplicated.tail(20)

     Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
139           6.9          3.1           5.4          2.1   virginica
140           6.7          3.1           5.6          2.4   virginica
141           6.9          3.1           5.1          2.3   virginica
143           6.8          3.2           5.9          2.3   virginica
144           6.7          3.3           5.7          2.5   virginica
145           6.7          3.0           5.2          2.3   virginica
146           6.3          2.5           5.0          1.9   virginica
147           6.5          3.0           5.2          2.0   virginica
148           6.2          3.4           5.4          2.3   virginica
149           5.9          3.0           5.1          1.8   virginica
150           NaN          4.4           1.5          0.4      setosa
151           NaN          2.9           4.5          1.5  versicolor
152           NaN          2.8           5.6          2.1   virginica
153           NaN   

In [17]:
## Null cases: True where Sepal_Length is missing (last 10 rows shown).
w["Sepal_Length"].isna().tail(n=10)

160     True
161    False
162    False
163    False
164     True
165    False
166    False
167    False
168     True
169     True
Name: Sepal_Length, dtype: bool

In [18]:
## Rows of w whose Sepal_Length is missing.
sepal_length_missing = w["Sepal_Length"].isna()
w.loc[sepal_length_missing]

     Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
150           NaN          4.4           1.5          0.4      setosa
151           NaN          2.9           4.5          1.5  versicolor
152           NaN          2.8           5.6          2.1   virginica
153           NaN          2.9           5.6          1.8   virginica
156           NaN          2.5           4.5          1.7   virginica
158           NaN          2.7           4.1          1.0  versicolor
160           NaN          3.2           5.1          2.0   virginica
164           NaN          3.0           5.5          2.1   virginica
168           NaN          3.4           1.6          0.4      setosa
169           NaN          4.0           1.2          0.2      setosa

In [19]:
## Complete cases: drop every row that has at least one missing value.
complete_cases = w.dropna(axis=0, how="any")
complete_cases.tail(20)

     Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
140           6.7          3.1           5.6          2.4   virginica
141           6.9          3.1           5.1          2.3   virginica
142           5.8          2.7           5.1          1.9   virginica
143           6.8          3.2           5.9          2.3   virginica
144           6.7          3.3           5.7          2.5   virginica
145           6.7          3.0           5.2          2.3   virginica
146           6.3          2.5           5.0          1.9   virginica
147           6.5          3.0           5.2          2.0   virginica
148           6.2          3.4           5.4          2.3   virginica
149           5.9          3.0           5.1          1.8   virginica
154           7.7          2.8           6.7          2.0   virginica
155           5.5          2.5           4.0          1.3  versicolor
157           6.6          2.9           4.6          1.3  versicolor
159           6.3   