# Importación de pandas y comprobación de la versión

In [213]:
# pip install ipykernel
import pandas as pd
import numpy as np
import seaborn as sns 

print(pd.__version__)

2.2.1


In [214]:
# Build a Series from a plain Python list. No index is passed, so pandas
# labels the elements with the default integers 0..4.
serie = pd.Series(
    data=['Pikachu', 'Charmander', 'Bulbasaur', 'Squirtle', 'Jigglypuff'],
)
# A single print with a newline separator emits the Series, then its type.
print(serie, type(serie), sep='\n')


0       Pikachu
1    Charmander
2     Bulbasaur
3      Squirtle
4    Jigglypuff
dtype: object
<class 'pandas.core.series.Series'>


In [215]:
# Pair the values 10, 20, ..., 50 with explicit letter labels instead of the
# default integer index.
etiquetas = list('abcde')
datos = np.arange(start=10, stop=60, step=10)

serie = pd.Series(data=datos, index=etiquetas)
print(serie, type(serie), sep='\n')

a    10
b    20
c    30
d    40
e    50
dtype: int32
<class 'pandas.core.series.Series'>


In [216]:
# Column-oriented input: every key is a column name and every list holds
# that column's five values, one per row.
diccionario_pokemon = dict(
    Nombre=['Pikachu', 'Charmander', 'Bulbasaur', 'Squirtle', 'Jigglypuff'],
    Tipo=['Eléctrico', 'Fuego', 'Planta', 'Agua', 'Normal'],
    Nivel=[25, 16, 12, 18, 20],
    Capturado=[True, False, True, True, False],
)

df_pokemon = pd.DataFrame(data=diccionario_pokemon)
print(df_pokemon)

       Nombre       Tipo  Nivel  Capturado
0     Pikachu  Eléctrico     25       True
1  Charmander      Fuego     16      False
2   Bulbasaur     Planta     12       True
3    Squirtle       Agua     18       True
4  Jigglypuff     Normal     20      False


In [217]:
# Compare the three container types used so far: Series, plain dict, DataFrame.
print(
    type(serie),
    type(diccionario_pokemon),
    type(df_pokemon),
    sep='\n',
)

<class 'pandas.core.series.Series'>
<class 'dict'>
<class 'pandas.core.frame.DataFrame'>


In [218]:
# info() prints a structural summary: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
df_pokemon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Nombre     5 non-null      object
 1   Tipo       5 non-null      object
 2   Nivel      5 non-null      int64 
 3   Capturado  5 non-null      bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 257.0+ bytes


# Series

## Índice por defecto

In [219]:
# No index argument is given, so the Series gets the default 0..n-1 labels.
pokemon = pd.Series(['Pikachu', 'Charmander', 'Bulbasaur', 'Squirtle', 'Jigglypuff'])
# A bare last expression makes the notebook display the value without print().
pokemon

0       Pikachu
1    Charmander
2     Bulbasaur
3      Squirtle
4    Jigglypuff
dtype: object

In [220]:
# Same Series as in the previous cell, this time built from a named list and
# passed through the explicit `data=` keyword.
lista_pokemon = ['Pikachu', 'Charmander', 'Bulbasaur', 'Squirtle', 'Jigglypuff']
pokemon = pd.Series(data=lista_pokemon)
pokemon

0       Pikachu
1    Charmander
2     Bulbasaur
3      Squirtle
4    Jigglypuff
dtype: object

### Añadir elementos

In [221]:
# Adding elements to a Series is done by concatenating it with another Series.
pokemon = pd.Series(data=['Pikachu', 'Charmander', 'Bulbasaur', 'Squirtle', 'Jigglypuff'])
nuevos_pokemon = pd.Series(data=['Snorlax', 'Charizard'])

# Plain concat: each piece keeps its own labels, so 0 and 1 appear twice.
print("Sin resetear los indices")
pokemon_2 = pd.concat(objs=[pokemon, nuevos_pokemon])
print(pokemon_2)

print()

# ignore_index=True discards the incoming labels and renumbers 0..n-1.
print("Indices reseteados")
pokemon_2 = pd.concat(objs=[pokemon, nuevos_pokemon], ignore_index=True)
print(pokemon_2)

Sin resetear los indices
0       Pikachu
1    Charmander
2     Bulbasaur
3      Squirtle
4    Jigglypuff
0       Snorlax
1     Charizard
dtype: object

Indices reseteados
0       Pikachu
1    Charmander
2     Bulbasaur
3      Squirtle
4    Jigglypuff
5       Snorlax
6     Charizard
dtype: object


### Eliminar elementos (por índice)

In [222]:
print("Sin resetear los indices", end="\n\n")
pokemon = pd.Series(data=['Pikachu', 'Charmander', 'Bulbasaur', 'Squirtle', 'Jigglypuff'])
print(pokemon)
# drop() returns a new Series; the remaining rows keep their labels 0, 3, 4.
pokemon_2 = pokemon.drop(index=[1, 2])
print(pokemon_2)

print()

print("Indices reseteados", end="\n\n")
pokemon = pd.Series(data=['Pikachu', 'Charmander', 'Bulbasaur', 'Squirtle', 'Jigglypuff'])
print(pokemon)
# Same removal, then renumber the remaining rows 0..n-1; drop=True throws the
# old labels away instead of keeping them as a column.
pokemon_2 = pokemon.drop(index=[1, 2])
pokemon_2 = pokemon_2.reset_index(drop=True)
print(pokemon_2)

Sin resetear los indices

0       Pikachu
1    Charmander
2     Bulbasaur
3      Squirtle
4    Jigglypuff
dtype: object
0       Pikachu
3      Squirtle
4    Jigglypuff
dtype: object

Indices reseteados

0       Pikachu
1    Charmander
2     Bulbasaur
3      Squirtle
4    Jigglypuff
dtype: object
0       Pikachu
1      Squirtle
2    Jigglypuff
dtype: object


### Eliminar elementos (Por dato)

In [223]:
print("Sin resetear los indices")

pokemon_han_muerto = ['Charmander', 'Bulbasaur']
pokemon = pd.Series(data=['Pikachu', 'Charmander',  'Bulbasaur', 'Squirtle', 'Jigglypuff'])
# isin() marks the values found in the removal list; ~ inverts the mask so
# only the values NOT in the list are kept (labels stay 0, 3, 4).
pokemon = pokemon.loc[~pokemon.isin(pokemon_han_muerto)]
print(pokemon)

print()

print("Indices reseteados")
pokemon_han_muerto = ['Charmander', 'Bulbasaur']
pokemon = pd.Series(data=['Pikachu', 'Charmander', 'Bulbasaur', 'Squirtle', 'Jigglypuff'])
# Same filter, chained with a renumbering of the remaining rows to 0..n-1.
pokemon = pokemon.loc[~pokemon.isin(pokemon_han_muerto)].reset_index(drop=True)
print(pokemon)

Sin resetear los indices
0       Pikachu
3      Squirtle
4    Jigglypuff
dtype: object

Indices reseteados
0       Pikachu
1      Squirtle
2    Jigglypuff
dtype: object


## Índice con etiquetas

In [224]:
# Building a Series from a dict: the integer keys become the index labels
# and the names become the values, in insertion order.
pokemon_diccionario = {
    25: 'Pikachu',
    4: 'Charmander',
    1: 'Bulbasaur',
    7: 'Squirtle',
    39: 'Jigglypuff',
}
pokemon = pd.Series(pokemon_diccionario)
print(pokemon, end='\n\n')
print(type(pokemon))

25       Pikachu
4     Charmander
1      Bulbasaur
7       Squirtle
39    Jigglypuff
dtype: object

<class 'pandas.core.series.Series'>


### Añadir elementos

In [225]:
print("Sin resetear los indices")

# The new entries carry their own integer labels (143 and 6).
nuevos_pokemon_diccionario = {143: 'Snorlax', 6: 'Charizard'}
nuevos_pokemon = pd.Series(nuevos_pokemon_diccionario)

# Plain concat keeps every label from both Series.
pokemon_2 = pd.concat(objs=[pokemon, nuevos_pokemon])
print(pokemon_2)

print()

print("Indices reseteados")

nuevos_pokemon_diccionario = {143: 'Snorlax', 6: 'Charizard'}
nuevos_pokemon = pd.Series(nuevos_pokemon_diccionario)

# ignore_index=True replaces the custom labels with a fresh 0..n-1 range.
pokemon_2 = pd.concat(objs=[pokemon, nuevos_pokemon], ignore_index=True)
print(pokemon_2)


Sin resetear los indices
25        Pikachu
4      Charmander
1       Bulbasaur
7        Squirtle
39     Jigglypuff
143       Snorlax
6       Charizard
dtype: object

Indices reseteados
0       Pikachu
1    Charmander
2     Bulbasaur
3      Squirtle
4    Jigglypuff
5       Snorlax
6     Charizard
dtype: object


### Eliminar elementos (por índice)

In [226]:
# drop() takes index LABELS (here 4 and 7), not positions.
pokemon_diccionario = {25: 'Pikachu', 4: 'Charmander', 1: 'Bulbasaur', 7: 'Squirtle', 39: 'Jigglypuff'}
pokemon = pd.Series(pokemon_diccionario).drop(index=[4, 7])
print(pokemon)

print()

print("Indices reseteados", end="\n\n")

# Rebuild the full Series, remove the same labels, then renumber the
# remaining rows 0..n-1 (the original labels are discarded).
pokemon_diccionario = {25: 'Pikachu', 4: 'Charmander', 1: 'Bulbasaur', 7: 'Squirtle', 39: 'Jigglypuff'}
pokemon = pd.Series(pokemon_diccionario)
pokemon_2 = pokemon.drop(index=[4, 7])
pokemon_2 = pokemon_2.reset_index(drop=True)
print(pokemon_2)

25       Pikachu
1      Bulbasaur
39    Jigglypuff
dtype: object

Indices reseteados

0       Pikachu
1     Bulbasaur
2    Jigglypuff
dtype: object


## Series con numpy

In [227]:
# Both the values and the labels come from NumPy ranges:
#   values -> 1.0, 3.0, ..., 19.0 (odd numbers as floats)
#   labels -> 1, 7, ..., 55       (step of 6)
serie = pd.Series(
    np.arange(1, 20, 2, dtype=float),
    index=np.arange(1, 60, 6, dtype=int),
)

serie

1      1.0
7      3.0
13     5.0
19     7.0
25     9.0
31    11.0
37    13.0
43    15.0
49    17.0
55    19.0
dtype: float64

## Convertir serie a lista

In [228]:
pokemon_diccionario = {25: 'Pikachu', 4: 'Charmander', 1: 'Bulbasaur', 7: 'Squirtle', 39: 'Jigglypuff'}
pokemon_serie = pd.Series(pokemon_diccionario)
print(pokemon_serie, end='\n\n')

# tolist() keeps only the values, in order; the labels 25, 4, ... are lost,
# so the resulting list is addressed by position.
pokemon_lista = pokemon_serie.tolist()

print(pokemon_lista, end='\n\n')
print(type(pokemon_lista), end='\n\n')
print(pokemon_lista[0])


25       Pikachu
4     Charmander
1      Bulbasaur
7       Squirtle
39    Jigglypuff
dtype: object

['Pikachu', 'Charmander', 'Bulbasaur', 'Squirtle', 'Jigglypuff']

<class 'list'>

Pikachu


## Convertir serie a diccionario

In [229]:
pokemon = {25: 'Pikachu', 4: 'Charmander', 1: 'Bulbasaur', 7: 'Squirtle', 39: 'Jigglypuff'}
pokemon_series = pd.Series(pokemon)
print(pokemon_series, end='\n\n')
print(type(pokemon_series), end='\n\n')

# to_dict() maps every index label to its value, so the round trip
# dict -> Series -> dict gives back the original mapping.
pokemon_dict = pokemon_series.to_dict()
print(pokemon_dict, end='\n\n')
print(type(pokemon_dict), end='\n\n')

# Lookup is by key (the former index label), not by position.
print(pokemon_dict[25])

25       Pikachu
4     Charmander
1      Bulbasaur
7       Squirtle
39    Jigglypuff
dtype: object

<class 'pandas.core.series.Series'>

{25: 'Pikachu', 4: 'Charmander', 1: 'Bulbasaur', 7: 'Squirtle', 39: 'Jigglypuff'}

<class 'dict'>

Pikachu


## Convertir serie a dataframe

In [230]:
# Fixed seed so the two random Series are the same on every run.
np.random.seed(42)
serie_1 = pd.Series(data=np.random.choice(a=["Perro", "Gato", "Caballo"], size=20))
serie_2 = pd.Series(data=np.random.randint(low=1, high=20, size=20))

# Put the Series side by side (axis=1); `keys` names the resulting columns
# directly, so no separate assignment to df.columns is needed.
df = pd.concat(
    [serie_1, serie_2],
    axis=1,
    keys=['Animal', 'Num aleatorio'],
)

print(type(df), df.shape, df.size, sep='\n')

df

<class 'pandas.core.frame.DataFrame'>
(20, 2)
40


Unnamed: 0,Animal,Num aleatorio
0,Caballo,1
1,Perro,12
2,Caballo,12
3,Caballo,17
4,Perro,10
5,Perro,16
6,Caballo,15
7,Gato,15
8,Caballo,19
9,Caballo,12


In [231]:
pokemon = {25: 'Pikachu', 4: 'Charmander', 1: 'Bulbasaur', 7: 'Squirtle', 39: 'Jigglypuff'}

pokemon_series = pd.Series(data=pokemon)

# Series.to_frame(name=...) is the dedicated Series -> one-column DataFrame
# conversion. It always keeps the values and the index labels. In contrast,
# pd.DataFrame(series, columns=['Nombre']) treats `columns` as a selection:
# if the Series carried a different name, the result would be an all-NaN
# column instead of the data.
dataframe_pokemon = pokemon_series.to_frame(name='Nombre')

print(type(dataframe_pokemon))
print(dataframe_pokemon.shape)
print(dataframe_pokemon.size)

dataframe_pokemon

<class 'pandas.core.frame.DataFrame'>
(5, 1)
5


Unnamed: 0,Nombre
25,Pikachu
4,Charmander
1,Bulbasaur
7,Squirtle
39,Jigglypuff


In [232]:
# Re-seed so this cell reproduces the same random Series as the concat example.
np.random.seed(42)
serie_1 = pd.Series(data=np.random.choice(a=["Perro", "Gato", "Caballo"], size=20))
serie_2 = pd.Series(data=np.random.randint(low=1, high=20, size=20))

# Alternative to concat: pass a {column name: Series} mapping to DataFrame.
df = pd.DataFrame(
    data=dict(zip(('Animal', 'Num aleatorio'), (serie_1, serie_2))),
)

print(type(df), df.shape, df.size, sep='\n')

df

<class 'pandas.core.frame.DataFrame'>
(20, 2)
40


Unnamed: 0,Animal,Num aleatorio
0,Caballo,1
1,Perro,12
2,Caballo,12
3,Caballo,17
4,Perro,10
5,Perro,16
6,Caballo,15
7,Gato,15
8,Caballo,19
9,Caballo,12


# DataFrame

In [301]:
import pandas as pd
import numpy as np
import seaborn as sns

In [302]:
# Load two of seaborn's example datasets as DataFrames.
# NOTE(review): load_dataset presumably fetches the data online on first use,
# so a fresh environment may need network access; confirm.
titanic_df = sns.load_dataset('titanic')
tips_df = sns.load_dataset('tips')

In [303]:
# Both example datasets come back as pandas DataFrames.
print(type(titanic_df), type(tips_df), sep='\n')

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


## head() & tail()

In [369]:
# head() with no argument shows the first 5 rows.
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [370]:
# head(n) shows the first n rows, here 8.
titanic_df.head(8)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [306]:
# tail() with no argument shows the last 5 rows.
tips_df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [307]:
# tail(n) shows the last n rows, here 3.
tips_df.tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


## info()

In [308]:
tips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [309]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [310]:
# Convert the text column 'sex' (dtype object) to the category dtype; the
# info() output below reports a smaller memory usage afterwards.
titanic_df['sex'] = titanic_df['sex'].astype('category')

titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    category
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(3), float64(2), int64(4), object(4)
memory usage: 74.7+ KB


In [311]:
# Several columns can be cast at once by selecting them with a list of names.
# 'sex' was already converted to category in the previous cell, so only
# 'alive' actually changes dtype here.
titanic_df[['sex','alive']] = titanic_df[['sex','alive']].astype('category')

titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    category
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    category
 14  alone        891 non-null    bool    
dtypes: bool(2), category(4), float64(2), int64(4), object(3)
memory usage: 68.7+ KB


In [312]:
titanic_df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


## describe()

In [313]:
# describe() with no arguments summarizes only the numeric columns
# (count, mean, std, min, quartiles, max); the category columns are left out.
tips_df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [314]:
titanic_df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [315]:
# .T is the shorthand for .transpose(): each variable becomes a row and each
# statistic a column, which is easier to read for many columns.
titanic_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [316]:
# `include` selects which dtypes are summarized. Category columns get
# count/unique/top/freq, numeric columns get the usual statistics, and the
# cells that do not apply are NaN.
titanic_df.describe(include=['category','int','float'])

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,deck,alive
count,891.0,891.0,891,714.0,891.0,891.0,891.0,891,203,891
unique,,,2,,,,,3,7,2
top,,,male,,,,,Third,C,no
freq,,,577,,,,,491,59,549
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,,,
std,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,
25%,0.0,2.0,,20.125,0.0,0.0,7.9104,,,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,,,
75%,1.0,3.0,,38.0,1.0,0.0,31.0,,,


In [317]:
# include='all' summarizes every column regardless of dtype; transposed so
# each variable is one row.
titanic_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
survived,891.0,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
pclass,891.0,,,,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
sex,891.0,2.0,male,577.0,,,,,,,
age,714.0,,,,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,,,,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,,,,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
embarked,889.0,3.0,S,644.0,,,,,,,
class,891.0,3.0,Third,491.0,,,,,,,
who,891.0,3.0,man,537.0,,,,,,,


## shape, index y columns

In [318]:
# .columns holds the column labels as a pandas Index object, not a list.
columnas = titanic_df.columns
print(columnas, type(columnas), sep='\n')

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')
<class 'pandas.core.indexes.base.Index'>


In [319]:
# Same inspection for the tips dataset; `columnas` is rebound to its labels.
columnas = tips_df.columns
print(columnas, type(columnas), sep='\n')

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')
<class 'pandas.core.indexes.base.Index'>


In [320]:
titanic_df.index

RangeIndex(start=0, stop=891, step=1)

In [321]:
tips_df.index

RangeIndex(start=0, stop=244, step=1)

In [322]:
titanic_df.shape

(891, 15)

In [323]:
tips_df.shape

(244, 7)

## isnull() y notnull()

In [324]:
tips_df.isnull()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
239,False,False,False,False,False,False,False
240,False,False,False,False,False,False,False
241,False,False,False,False,False,False,False
242,False,False,False,False,False,False,False


In [325]:
# isnull() returns a boolean frame (True where a value is missing); summing
# it counts the missing values per column.
tips_df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [326]:
titanic_df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [327]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    category
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    category
 14  alone        891 non-null    bool    
dtypes: bool(2), category(4), float64(2), int64(4), object(3)
memory usage: 68.7+ KB


In [328]:
# notnull() is the opposite mask: True where a value is present.
# head(3) keeps the displayed output short.
tips_df.notnull().head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True


In [329]:
tips_df.notnull().sum()

total_bill    244
tip           244
sex           244
smoker        244
day           244
time          244
size          244
dtype: int64

In [330]:
titanic_df.notnull().sum()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

## isna(), dropna() y fillna()

In [331]:
# "NA" stands for "not available". Per the pandas documentation, isnull() is
# an alias of isna(), so this gives the same mask as the isnull() cells above.

tips_df.isna().head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False


In [371]:
tips_df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [333]:
titanic_df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [334]:
# copy() creates an independent DataFrame, so the drops tried on
# titanic_copia below do not touch titanic_df.
titanic_copia = titanic_df.copy()

In [335]:
# axis=0 drops every ROW that contains at least one missing value.
# dropna() returns a new DataFrame; titanic_copia itself is left unchanged.
titanic_copia_limpia = titanic_copia.dropna(axis = 0)

In [336]:
# Missing-value counts before (titanic_copia) and after (titanic_copia_limpia)
# the row-wise dropna, separated by a blank line.
print(titanic_copia.isna().sum(), end='\n\n')
print(titanic_copia_limpia.isna().sum())

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64


In [337]:
# inplace=True modifies titanic_copia itself instead of returning a new
# DataFrame, which is why the result is not assigned to anything.
titanic_copia.dropna(axis = 0, inplace=True)
print(titanic_copia.isna().sum())

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64


In [338]:
titanic_copia_2 = titanic_df.copy()

In [339]:
# axis=1 drops every COLUMN that contains at least one missing value
# (age, embarked, deck and embark_town here); all 891 rows are kept.
titanic_copia_2.dropna(axis=1,inplace=True)

In [340]:
titanic_copia_2.isna().sum()

survived      0
pclass        0
sex           0
sibsp         0
parch         0
fare          0
class         0
who           0
adult_male    0
alive         0
alone         0
dtype: int64

In [341]:
titanic_copia_2.shape

(891, 11)

In [342]:
titanic_df = sns.load_dataset('titanic')

In [343]:
# NOTE: plain assignment does NOT copy. titanic_copia_3 is only a second name
# for the same DataFrame object as titanic_df, so any in-place change made
# through one name is visible through the other. Use titanic_df.copy() when
# an independent DataFrame is needed.
titanic_copia_3 = titanic_df

In [344]:
titanic_df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [345]:
titanic_copia_3.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [346]:
# titanic_copia_3 was bound with plain assignment (no .copy()), so this
# in-place drop also changes titanic_df: both names refer to one object.
titanic_copia_3.dropna(axis=0,inplace=True)

In [347]:
titanic_copia_3.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [348]:
titanic_copia_3.shape

(182, 15)

In [349]:
titanic_df.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [350]:
titanic_df.shape

(182, 15)

In [351]:
titanic_df = sns.load_dataset('titanic')

In [352]:
titanic_df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [353]:
# Impute instead of dropping: every missing age is replaced by the mean of
# the known ages (mean() skips NaN), so no rows are lost.
titanic_df['age'] = titanic_df['age'].fillna(titanic_df['age'].mean())

In [354]:
titanic_df.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [355]:
titanic_df.tail(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
881,0,3,male,33.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True
883,0,2,male,28.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
884,0,3,male,25.0,0,0,7.05,S,Third,man,True,,Southampton,no,True
885,0,3,female,39.0,0,5,29.125,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,29.699118,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [356]:
titanic_df['age'].value_counts()

age
29.699118    177
24.000000     30
22.000000     27
18.000000     26
28.000000     25
            ... 
36.500000      1
55.500000      1
0.920000       1
23.500000      1
74.000000      1
Name: count, Length: 89, dtype: int64

In [357]:
# Boolean-mask filter: keep only the rows whose age equals 0.92.
# NOTE(review): exact == on floats works here because 0.92 is a stored value,
# not a computed one; for computed floats prefer a tolerance-based comparison.
titanic_df[titanic_df['age'] == 0.92]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
305,1,1,male,0.92,1,2,151.55,S,First,child,False,C,Southampton,yes,False


In [358]:
# Round to the nearest whole year, then cast to integer. The cast needs a
# column without NaN; the missing ages were filled with the mean in an
# earlier cell.
titanic_df['age'] = titanic_df['age'].round().astype(int)

In [359]:
titanic_df.loc[305]

survived                 1
pclass                   1
sex                   male
age                      1
sibsp                    1
parch                    2
fare                151.55
embarked                 S
class                First
who                  child
adult_male           False
deck                     C
embark_town    Southampton
alive                  yes
alone                False
Name: 305, dtype: object

## duplicated() y drop_duplicates()


In [360]:
titanic_df = sns.load_dataset('titanic')

In [361]:
titanic_df.duplicated().head(2)

0    False
1    False
dtype: bool

In [362]:
# duplicated() flags each row that repeats an earlier row (by default the
# first occurrence is not flagged); sum() counts the flagged rows.
titanic_df.duplicated().sum()

107

In [363]:
# keep=False flags EVERY member of a duplicated group, including the first
# occurrence, so all the repeated rows can be inspected together.
duplicates = titanic_df[titanic_df.duplicated(keep=False)]
duplicates.head(20)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
26,0,3,male,,0,0,7.225,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
29,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
32,1,3,female,,0,0,7.75,Q,Third,woman,False,,Queenstown,yes,True
37,0,3,male,21.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
42,0,3,male,,0,0,7.8958,C,Third,man,True,,Cherbourg,no,True
45,0,3,male,,0,0,8.05,S,Third,man,True,,Southampton,no,True
46,0,3,male,,1,0,15.5,Q,Third,man,True,,Queenstown,no,False
47,1,3,female,,0,0,7.75,Q,Third,woman,False,,Queenstown,yes,True


In [364]:
tips_df.duplicated().sum()

1

In [365]:
duplicates = tips_df[tips_df.duplicated(keep=False)]
print(duplicates)

     total_bill  tip     sex smoker   day   time  size
198        13.0  2.0  Female    Yes  Thur  Lunch     2
202        13.0  2.0  Female    Yes  Thur  Lunch     2


In [366]:
# drop_duplicates() returns a new DataFrame in which the first occurrence of
# each repeated row is kept and the later ones are removed. The original
# index labels are kept, so a gap appears where a row was dropped.
tips_df_2 = tips_df.drop_duplicates()

In [367]:
# Explicit positional slice: rows at positions 198..203. The displayed labels
# skip 202, the duplicate that was dropped. .iloc states that the slice is
# by position; the bare tips_df_2[198:204] form gives the same rows but does
# not make clear whether labels or positions are meant.
tips_df_2.iloc[198:204]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
198,13.0,2.0,Female,Yes,Thur,Lunch,2
199,13.51,2.0,Male,Yes,Thur,Lunch,2
200,18.71,4.0,Male,Yes,Thur,Lunch,3
201,12.74,2.01,Female,Yes,Thur,Lunch,2
203,16.4,2.5,Female,Yes,Thur,Lunch,2
204,20.53,4.0,Male,Yes,Thur,Lunch,4


In [372]:

# keep='last' keeps the LAST occurrence of each repeated row and removes the
# earlier ones (for the tips data, label 198 is removed instead of 202).
tips_df_3 = tips_df.drop_duplicates(keep='last')
# Explicit positional slice via .iloc (positions 197..203); the bare
# tips_df_3[197:204] form gives the same rows but hides whether the bounds
# are labels or positions.
tips_df_3.iloc[197:204]