In [1]:
import pandas as pd
import numpy as np

In [11]:
people = {
    "first": ["Corey", 'Jane', 'John', 'Chris', np.nan, None, 'NA'], 
    "last": ["Schafer", 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@gmail.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

In [17]:
df = pd.DataFrame(people)
df.replace(['NA', 'Missing'], np.nan, inplace=True)

In [18]:
df #dataframe

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@gmail.com,
6,,,,


In [6]:
df.dropna()
#elimina las rows que tienen valores NaN o None

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,Missing,Missing


In [19]:
df.dropna(axis='index', how='all', subset=['last', 'email'])
#por default
#el parametro axis puede tener 'index' (elimina la row) o 'column' (elimina la col)
#el parametro how define el criterio de eliminacion: 'any' es default, significa que borrara la row cuando algun valor sea nulo
#puede tener tambien el argumento all, que la borrara solo cuando todos los valores sea nulos
#el parametro subset es para especificar que columna queremos que no sea nula

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@gmail.com,


In [20]:
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [22]:
df.fillna('-')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,-,36
4,-,-,-,-
5,-,-,Anonymous@gmail.com,-
6,-,-,-,-


In [26]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [24]:
df['age'] = df['age'].astype(float)

In [25]:
df['age']

0    33.0
1    55.0
2    63.0
3    36.0
4     NaN
5     NaN
6     NaN
Name: age, dtype: float64

In [30]:
df.mean(numeric_only=True)
df['age'].mean()

46.75

In [5]:
df['email'] #series

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [6]:
type(df['email'])

pandas.core.series.Series

Series = Rows from one column, one-dimensional array, list
---
DataFrame = Rows and columns, two-dimensional array, container for multiple series
---

In [7]:
df.email

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [8]:
df[['last', 'email']] #filtered dataframe

Unnamed: 0,last,email
0,Schafer,CoreyMSchafer@gmail.com
1,Doe,JaneDoe@email.com
2,Doe,JohnDoe@email.com


In [9]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [10]:
df.iloc[[0, 1], 2] #accessing a row - integer location

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

In [11]:
df.loc[[0, 1], ['email', 'last']]

Unnamed: 0,email,last
0,CoreyMSchafer@gmail.com,Schafer
1,JaneDoe@email.com,Doe


In [12]:
df.set_index('email', inplace=True) #para setear un index usamos inplace, sino no se cambia el DF original

In [13]:
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
CoreyMSchafer@gmail.com,Corey,Schafer
JaneDoe@email.com,Jane,Doe
JohnDoe@email.com,John,Doe


In [14]:
df.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')

In [15]:
df.loc['CoreyMSchafer@gmail.com', 'first']

'Corey'

In [16]:
df.reset_index(inplace=True)
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [32]:
filt = (df['last'] == 'Doe') & (df['first'] == 'John') #para usar operadores AND y OR usamos '&' y '|'

In [34]:
df[~filt] #python aca se hace el capo y en vez de usar '!' como todos usa '~'

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe


In [30]:
df.loc[filt, 'first'] #si pasamos una serie de booleans al loc como parametro, podemos filtrar nuestro dataframe

2    John
Name: first, dtype: object

In [42]:
df.columns = [x.lower() for x in df.columns]

In [43]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [47]:
df.rename(columns={'first': 'first_name', 'last': 'last_name'}, inplace=True)

In [50]:
df

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,James,Doe,JamesDoe@email.com


In [49]:
df.loc[2, ['first_name', 'email']] = ['James', 'JamesDoe@email.com']

In [7]:
df['email'] = df['email'].str.lower()
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Doe,johndoe@email.com


In [8]:
df['email'].apply(len)

0    23
1    17
2    17
Name: email, dtype: int64

In [9]:
def update_email(email):
    return email.upper()

In [10]:
df['email'].apply(update_email)

0    COREYMSCHAFER@GMAIL.COM
1          JANEDOE@EMAIL.COM
2          JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [11]:
df['email'].apply(lambda x: x.lower()) #apply funciona para series

0    coreymschafer@gmail.com
1          janedoe@email.com
2          johndoe@email.com
Name: email, dtype: object

In [12]:
df.apply(len) #devuelve la length de cada columna, numero de filas en cada columna

first    3
last     3
email    3
dtype: int64

In [14]:
df.apply(pd.Series.min)

first                      Corey
last                         Doe
email    coreymschafer@gmail.com
dtype: object

In [16]:
df.apply(lambda x: x.min())

first                      Corey
last                         Doe
email    coreymschafer@gmail.com
dtype: object

In [18]:
df.applymap(len) #applymap solo funciona para DataFrames

Unnamed: 0,first,last,email
0,5,7,23
1,4,3,17
2,4,3,17


In [19]:
df.applymap(str.lower)

Unnamed: 0,first,last,email
0,corey,schafer,coreymschafer@gmail.com
1,jane,doe,janedoe@email.com
2,john,doe,johndoe@email.com


In [23]:
df['first'].map({'Corey':'George'})

0    George
1       NaN
2       NaN
Name: first, dtype: object

In [25]:
df.replace('Corey', 'George')

Unnamed: 0,first,last,email
0,George,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Doe,johndoe@email.com


In [26]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschafer@gmail.com
1,Jane,Doe,janedoe@email.com
2,John,Doe,johndoe@email.com


In [27]:
df['full_name'] = df['first'] + ' ' + df['last']

In [28]:
df

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,coreymschafer@gmail.com,Corey Schafer
1,Jane,Doe,janedoe@email.com,Jane Doe
2,John,Doe,johndoe@email.com,John Doe


In [33]:
df.drop(columns=['first', 'last'], inplace=True)

In [34]:
df

Unnamed: 0,email,full_name
0,coreymschafer@gmail.com,Corey Schafer
1,janedoe@email.com,Jane Doe
2,johndoe@email.com,John Doe


In [38]:
df['full_name'].str.split(' ', expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [39]:
df[['first', 'last']] = df['full_name'].str.split(' ', expand=True)

In [40]:
df

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Doe,John,Doe


In [41]:
df.append({'first': 'Martin'}, ignore_index=True)

  df.append({'first': 'Martin'}, ignore_index=True)


Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@email.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Doe,John,Doe
3,,,Martin,


In [46]:
new_row = pd.Series({'first': 'Martin'})

In [55]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [1]:
people2 = {
    "first": ["Tony", 'Steve'], 
    "last": ["Stark", 'Rogers'], 
    "email": ["IronMan@avenge.com", 'Cap@avenge.com']
}

In [4]:
df2 = pd.DataFrame(people2)

In [5]:
df2

Unnamed: 0,first,last,email
0,Tony,Stark,IronMan@avenge.com
1,Steve,Rogers,Cap@avenge.com


In [8]:
df.append(df2, ignore_index=True)

  df.append(df2, ignore_index=True)


Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Tony,Stark,IronMan@avenge.com
4,Steve,Rogers,Cap@avenge.com


In [15]:
df = df.append(df2, ignore_index=True)

  df = df.append(df2, ignore_index=True)


In [16]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Tony,Stark,IronMan@avenge.com
4,Steve,Rogers,Cap@avenge.com


In [19]:
df.drop(index=4)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Tony,Stark,IronMan@avenge.com


In [20]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Tony,Stark,IronMan@avenge.com
4,Steve,Rogers,Cap@avenge.com


In [21]:
df[df['last'] == 'Doe'].index

Int64Index([1, 2], dtype='int64')

In [22]:
df.drop(index=df[df['last'] == 'Doe'].index)

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
3,Tony,Stark,IronMan@avenge.com
4,Steve,Rogers,Cap@avenge.com


In [27]:
df.sort_values(by=['last', 'first'], ascending=[False, True])

Unnamed: 0,first,last,email
3,Tony,Stark,IronMan@avenge.com
0,Corey,Schafer,CoreyMSchafer@gmail.com
4,Steve,Rogers,Cap@avenge.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [28]:
df.sort_index()

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Tony,Stark,IronMan@avenge.com
4,Steve,Rogers,Cap@avenge.com


In [29]:
df['last'].sort_values()

1        Doe
2        Doe
4     Rogers
0    Schafer
3      Stark
Name: last, dtype: object