In [13]:
import pandas as pd
import numpy as np

### Dropping items from Series

In [14]:
ser = pd.Series(['a','b','c','d'], index=[1, 2, 3,4])
ser
#assign specific index, 1,2,3,4 to a b c d 


1    a
2    b
3    c
4    d
dtype: object

In [15]:
ser2=ser.drop(2)
ser2
# drop(2) removes the entry whose index *label* is 2 (the value 'b'), not the element at position 2.
# It returns a new Series; `ser` itself is left unchanged.

1    a
3    c
4    d
dtype: object

In [16]:
ser


1    a
2    b
3    c
4    d
dtype: object

In [17]:
ser.drop(3,inplace=True)
ser
# Only when inplace=True is used -->
# the original data object gets modified: label 3 is removed from `ser` itself, permanently.

1    a
2    b
4    d
dtype: object

In [12]:
new_ser2=ser.drop([1, 2])
new_ser2

4    d
dtype: object

In [18]:
ser

1    a
2    b
4    d
dtype: object

### Dropping items from a  DataFrame

`axis`: {0 or 'index' to drop rows, 1 or 'columns' to drop columns}, default 0

In [19]:
import pandas as pd
data = {'name': ['George', 'Kevin', 'Michael', 'Jose'],
        'age': [35, 20, 26, 25],
        'height': [5.5, 4.9, 5.6, 5.4]}
df = pd.DataFrame(data,index=['a','x','y','z'])
df

Unnamed: 0,name,age,height
a,George,35,5.5
x,Kevin,20,4.9
y,Michael,26,5.6
z,Jose,25,5.4


In [20]:
df.drop(['name', 'age'],axis='columns') #we specify we want to drop age and name, so only height will be displayed

Unnamed: 0,height
a,5.5
x,4.9
y,5.6
z,5.4


In [21]:
df

Unnamed: 0,name,age,height
a,George,35,5.5
x,Kevin,20,4.9
y,Michael,26,5.6
z,Jose,25,5.4


In [22]:
df.drop(['name'],axis='columns',inplace=True)
df # because inplace=True is used, the 'name' column is removed from df itself for good

Unnamed: 0,age,height
a,35,5.5
x,20,4.9
y,26,5.6
z,25,5.4


In [23]:
df.drop(columns=['age'])

Unnamed: 0,height
a,5.5
x,4.9
y,5.6
z,5.4


In [24]:
df.drop(index=['a', 'x'],inplace=True)
df

Unnamed: 0,age,height
y,26,5.6
z,25,5.4


In [25]:
df.drop(['y']) # drop rows by default

Unnamed: 0,age,height
z,25,5.4


### [Apply Function](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html)

Axis:
*   0 or 'index' : apply function to each column.
*   1 or 'columns' : apply function to each row.

default 0

In [26]:
df=pd.DataFrame([[60000, 25],[70000, 25],[90000, 25]], columns=['Salary', 'Age'])
df

Unnamed: 0,Salary,Age
0,60000,25
1,70000,25
2,90000,25


In [27]:
df['Salary'].apply(lambda x:x+2000) # temporary change: apply returns a new Series, df itself is not modified

0    62000
1    72000
2    92000
Name: Salary, dtype: int64

In [28]:
df

Unnamed: 0,Salary,Age
0,60000,25
1,70000,25
2,90000,25


In [30]:
# 'Salary-New' is a new column that is created by this assignment
df['Salary-New']=df['Salary'].apply(lambda x:x+2000) # assigning the result of apply stores it in df; the 'Salary' column itself is not changed by this line
df
# NOTE(review): the saved output shows Salary already raised by 2000 (62000, ...), presumably from a
# since-removed cell In [29] that reassigned df['Salary'] -- confirm. On a fresh kernel Salary would stay 60000/70000/90000.

Unnamed: 0,Salary,Age,Salary-New
0,62000,25,64000
1,72000,25,74000
2,92000,25,94000


### Duplicate Labels

In [32]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [33]:
obj.index.is_unique

False

In [35]:
obj['a'] # returns both rows labelled 'a', i.e. the values 0 and 1

a    0
a    1
dtype: int64

### Unique Values, Value Counts

In [36]:
ser = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [37]:
ser.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [38]:
ser.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [39]:
data = {'name': ['Goerge', 'Kevin', 'Michael', 'Jose'],
        'age': [35, 35, 50, 50],
        'height': [5.5, 4.9, 5.6, 5.4],
        'class':['A','B','A','C']}
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,height,class
0,Goerge,35,5.5,A
1,Kevin,35,4.9,B
2,Michael,50,5.6,A
3,Jose,50,5.4,C


In [40]:
df['class'].value_counts()

class
A    2
B    1
C    1
Name: count, dtype: int64

# Handling Missing Data

In [41]:
import pandas as pd
import numpy as np

In pandas, a missing value is mainly represented by NaN.
`np.nan` is the canonical NumPy spelling; the aliases `np.NaN` and `np.NAN` were equivalent in NumPy 1.x but were removed in NumPy 2.0. Python's None is also considered a missing value. [Numpy - Constants](https://numpy.org/doc/stable/reference/constants.html)

In [45]:
import pandas as pd
import numpy as np
# Build a Series that mixes Python's None with several spellings of NaN;
# pandas treats all of them as missing and stores the column as float64.
# np.nan / float("nan") are used instead of the np.NaN and np.NAN aliases,
# which were removed in NumPy 2.0 and raise AttributeError there.
ser = pd.Series([1, 2, None, 500, np.nan, float("nan"), np.nan])
ser
# the np. namespace prefix is required to reach numpy's nan constant

0      1.0
1      2.0
2      NaN
3    500.0
4      NaN
5      NaN
6      NaN
dtype: float64

In [46]:
ser.isnull()

0    False
1    False
2     True
3    False
4     True
5     True
6     True
dtype: bool

In [47]:
ser[ser.notnull()]

0      1.0
1      2.0
3    500.0
dtype: float64

NaN is a floating-point value, so when None is converted to NaN the dtype of the column becomes float, even if all the other values are integers.
In a column of dtype object, None remains None.

In [48]:
s_none_int = pd.Series([None, 1, 2])
print(s_none_int)

0    NaN
1    1.0
2    2.0
dtype: float64


### Filtering Out Missing Data

In [49]:
import numpy as np
ser = pd.Series([1, np.nan, 3.5, np.nan, 7])
ser

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [50]:
ser.dropna(inplace=True) # inplace=True removes the NaN entries from ser itself; the default is inplace=False, which returns a new Series instead

In [51]:
ser

0    1.0
2    3.5
4    7.0
dtype: float64

In [52]:
ser.notnull()

0    True
2    True
4    True
dtype: bool

In [53]:
ser[ser.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [54]:
ser

0    1.0
2    3.5
4    7.0
dtype: float64



```
axis{0 or ‘index’, 1 or ‘columns’}, default 0
Determine if rows or columns which contain missing values are removed.

0, or ‘index’ : Drop rows which contain missing values.

1, or ‘columns’ : Drop columns which contain missing value.
```



In [55]:
df = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [56]:
# with axis=1 and how='all', only columns in which every value is NaN are dropped
cleaned_df = df.dropna(axis=1,how='all')

cleaned_df # nothing got dropped: every column has at least one non-NaN value

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0




```
axis:
{0 or ‘index’, 1 or ‘columns’}, default 0
Determine if rows or columns which contain missing values are removed.
0, or ‘index’ : Drop rows which contain missing values.
1, or ‘columns’ : Drop columns which contain missing value.

how:
{‘any’, ‘all’}, default ‘any’
Determine if row or column is removed from DataFrame, when we have at least one NA or all NA.
‘any’ : If any NA values are present, drop that row or column.
‘all’ : If all values are NA, drop that row or column.

inplace:
bool, default False
```



In [57]:
df = pd.DataFrame([[2, 5, 6], [4, np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 3, 7]])
df

Unnamed: 0,0,1,2
0,2.0,5.0,6.0
1,4.0,,
2,,,
3,,3.0,7.0


In [58]:
df.dropna(how='all')

Unnamed: 0,0,1,2
0,2.0,5.0,6.0
1,4.0,,
3,,3.0,7.0


In [59]:
df.dropna(how='any')

Unnamed: 0,0,1,2
0,2.0,5.0,6.0


In [60]:
df[4] = np.nan
df

Unnamed: 0,0,1,2,4
0,2.0,5.0,6.0,
1,4.0,,,
2,,,,
3,,3.0,7.0,


In [None]:
df.dropna(axis=1, how='all')

In [None]:
df.dropna(axis=1, how='any')

### Filling In Missing Data

In [None]:
df.fillna(0) # inplace=False by default --> returns a copy in which every NaN cell is filled with the given value

In [None]:
df

In [None]:
df.fillna({1: 0.5, 2: 0}) # fill by column (0.5 in col 1 and 0 in col 2)

In [None]:
df.fillna({0: df[0].mean()})

In [None]:
def myfunc(x):
    """Print ``x`` with its missing values replaced by 0, then a dashed separator.

    Intended as a callback for ``DataFrame.apply`` so the reader can see what
    each call receives. Only prints; ``x`` is not modified and nothing is
    returned.
    """
    separator = '\n----------------------'
    zero_filled = x.fillna(0)
    print(zero_filled, separator)

df.apply(myfunc,axis='columns')

#df.apply(myfunc,axis='rows')
df