In [1]:
import pandas as pd
import numpy as np

In [13]:
# Toy frame with NaNs scattered through every column; the last row is
# entirely missing. Written row by row so it reads like the displayed table.
df = pd.DataFrame(
    [
        [np.nan, 12,     7,      19],
        [13,     np.nan, 8,      4],
        [14,     14,     9,      6],
        [15,     15,     np.nan, np.nan],
        [16,     16,     12,     7],
        [np.nan, np.nan, np.nan, np.nan],
    ],
    columns=['A', 'B', 'C', 'X'],
)

In [14]:
# Display the frame so the NaN positions are visible before any cleaning.
df

Unnamed: 0,A,B,C,X
0,,12.0,7.0,19.0
1,13.0,,8.0,4.0
2,14.0,14.0,9.0,6.0
3,15.0,15.0,,
4,16.0,16.0,12.0,7.0
5,,,,


In [15]:
# Ways of handling missing values (NaN) in df
#
# Dropping rows:
#   df.dropna(how='all')  -> removes only rows in which every value is NaN
#   df.dropna(how='any')  -> removes rows holding at least one NaN; too
#                            aggressive here, most of the data would be lost
#
# Filling with a constant (possible, but it distorts the data):
#   df.fillna(value=0)    -> replaces every NaN with zero
#
# Used here: replace the NaNs of column 'X' with the mean of that column.
# Series.mean() skips NaN, so only the observed values contribute.
x_mean = df['X'].mean()
df['X'] = df['X'].fillna(x_mean)

print("\nDataFrame after filling missing values in 'X' with the mean:")
print(df)


DataFrame after filling missing values in 'X' with the mean:
      A     B     C     X
0   NaN  12.0   7.0  19.0
1  13.0   NaN   8.0   4.0
2  14.0  14.0   9.0   6.0
3  15.0  15.0   NaN   9.0
4  16.0  16.0  12.0   7.0
5   NaN   NaN   NaN   9.0


In [19]:
# Forward fill (ffill)
# Every missing value (NaN) is replaced with the most recent non-missing
# value above it in the same column. A NaN with no earlier value in its
# column (here: the first row of 'A') has nothing to copy and stays NaN.
#
# Example:
#   input  -> [1, NaN, 2, NaN, NaN, 3]
#   output -> [1, 1,   2, 2,   2,   3]
#
# DataFrame.ffill() is the replacement for fillna(method='ffill'); the
# `method` keyword of fillna is deprecated in recent pandas releases.
# The call returns a new frame, so df itself is left unchanged.
df.ffill()

Unnamed: 0,A,B,C,X
0,,12.0,7.0,19.0
1,13.0,12.0,8.0,4.0
2,14.0,14.0,9.0,6.0
3,15.0,15.0,9.0,9.0
4,16.0,16.0,12.0,7.0
5,16.0,16.0,12.0,9.0


In [21]:
from sklearn.impute import SimpleImputer

In [22]:
# Imputer that swaps each NaN for the mean of its own column.
imp = SimpleImputer(missing_values=np.nan, strategy="mean")

In [23]:
# Learn the per-column means from df, then apply them to the same frame.
# The result is a plain NumPy array (row/column labels are not carried over).
a = imp.fit(df).transform(df)

In [24]:
# Inspect the imputed values held in `a`.
a


array([[14.5 , 12.  ,  7.  , 19.  ],
       [13.  , 14.25,  8.  ,  4.  ],
       [14.  , 14.  ,  9.  ,  6.  ],
       [15.  , 15.  ,  9.  ,  9.  ],
       [16.  , 16.  , 12.  ,  7.  ],
       [14.5 , 14.25,  9.  ,  9.  ]])

In [26]:
# Wrap the imputed array back into a DataFrame. The labels are taken from the
# frame that was imputed instead of being hard-coded, so this keeps working if
# the columns or the index of df change.
# NOTE(review): if a column were entirely NaN the imputer may drop it, and the
# array width would then no longer match df.columns - confirm before reuse.
df = pd.DataFrame(a, columns=df.columns, index=df.index)

In [27]:
# Display df after it was rebuilt from the imputed array.
df

Unnamed: 0,A,B,C,X
0,14.5,12.0,7.0,19.0
1,13.0,14.25,8.0,4.0
2,14.0,14.0,9.0,6.0
3,15.0,15.0,9.0,9.0
4,16.0,16.0,12.0,7.0
5,14.5,14.25,9.0,9.0


In [None]:
# Exercise

In [28]:
import pandas as pd

In [29]:
import numpy as np

In [30]:
# 25 evenly spaced sample points covering one full period, 0 .. 2*pi
# (both endpoints included).
T = np.linspace(start=0, stop=2 * np.pi, num=25)

In [31]:
# Inspect the 25 sample points spanning [0, 2*pi].
T

array([0.        , 0.26179939, 0.52359878, 0.78539816, 1.04719755,
       1.30899694, 1.57079633, 1.83259571, 2.0943951 , 2.35619449,
       2.61799388, 2.87979327, 3.14159265, 3.40339204, 3.66519143,
       3.92699082, 4.1887902 , 4.45058959, 4.71238898, 4.97418837,
       5.23598776, 5.49778714, 5.75958653, 6.02138592, 6.28318531])

In [54]:
# Seeded generator so the noise column is identical on every
# "Restart Kernel -> Run All"; without a seed, Y changes on each execution.
rng = np.random.default_rng(42)

# X and X2 both hold sin(T); Y is standard-normal noise shifted by 0.5.
# The frame is indexed by the sample points T themselves.
df_nana = pd.DataFrame(
    {
        'X': np.sin(T),
        'X2': np.sin(T),
        'Y': 0.5 + rng.standard_normal(len(T)),
    },
    index=T,
)

In [55]:
# Display the freshly built frame (no missing values yet).
df_nana

Unnamed: 0,X,X2,Y
0.0,0.0,0.0,1.062917
0.261799,0.258819,0.258819,0.574862
0.523599,0.5,0.5,1.592486
0.785398,0.7071068,0.7071068,1.267594
1.047198,0.8660254,0.8660254,-1.022223
1.308997,0.9659258,0.9659258,0.816838
1.570796,1.0,1.0,0.073524
1.832596,0.9659258,0.9659258,0.263704
2.094395,0.8660254,0.8660254,1.16945
2.356194,0.7071068,0.7071068,3.902043


In [56]:
# Inject missing values into df_nana

# Rows 5..11 (by position) of the first column, 'X'.
df_nana.iloc[5:12, 0] = np.nan

# The whole row whose index label is pi. The index holds floats, so the row is
# selected with a tolerance-based mask: an exact `.loc[np.pi] = ...` depends on
# bit-for-bit float equality and would silently append a new row on a miss.
df_nana.loc[np.isclose(df_nana.index, np.pi), :] = np.nan

# Every second row (by position) of the last column, 'Y'.
df_nana.iloc[::2, -1] = np.nan

print("df with missing values: ")
print(df_nana)

df_nana

df with missing values: 
                     X            X2         Y
0.000000  0.000000e+00  0.000000e+00       NaN
0.261799  2.588190e-01  2.588190e-01  0.574862
0.523599  5.000000e-01  5.000000e-01       NaN
0.785398  7.071068e-01  7.071068e-01  1.267594
1.047198  8.660254e-01  8.660254e-01       NaN
1.308997           NaN  9.659258e-01  0.816838
1.570796           NaN  1.000000e+00       NaN
1.832596           NaN  9.659258e-01  0.263704
2.094395           NaN  8.660254e-01       NaN
2.356194           NaN  7.071068e-01  3.902043
2.617994           NaN  5.000000e-01       NaN
2.879793           NaN  2.588190e-01 -0.066306
3.141593           NaN           NaN       NaN
3.403392 -2.588190e-01 -2.588190e-01 -0.724922
3.665191 -5.000000e-01 -5.000000e-01       NaN
3.926991 -7.071068e-01 -7.071068e-01  2.491062
4.188790 -8.660254e-01 -8.660254e-01       NaN
4.450590 -9.659258e-01 -9.659258e-01  1.491344
4.712389 -1.000000e+00 -1.000000e+00       NaN
4.974188 -9.659258e-01 -9.659258e-0

Unnamed: 0,X,X2,Y
0.0,0.0,0.0,
0.261799,0.258819,0.258819,0.574862
0.523599,0.5,0.5,
0.785398,0.7071068,0.7071068,1.267594
1.047198,0.8660254,0.8660254,
1.308997,,0.9659258,0.816838
1.570796,,1.0,
1.832596,,0.9659258,0.263704
2.094395,,0.8660254,
2.356194,,0.7071068,3.902043


In [53]:
# Drop rows in which every column is missing. Rows that still hold at least
# one value are kept. The result is assigned back instead of using
# inplace=True, which pandas guidance discourages.
df_nana = df_nana.dropna(how='all', axis=0)

# Header for the frame displayed below.
print("\nDataFrame after dropping rows where all values are missing:")

# Only rows whose values are missing across all columns have been dropped.
df_nana


DataFrame after dropping rows where all values are missing:


Unnamed: 0,X,X2,Y
0.0,0.0,0.0,
0.261799,0.258819,0.258819,0.922011
0.523599,0.5,0.5,
0.785398,0.7071068,0.7071068,1.278091
1.047198,0.8660254,0.8660254,
1.308997,,0.9659258,2.472368
1.570796,,1.0,
1.832596,,0.9659258,1.068948
2.094395,,0.8660254,
2.356194,,0.7071068,1.322275
