# Examples of handling NA (missing) values with pandas

In [5]:
import pandas as pd
# Display the installed pandas version; the outputs below were produced with it.
pd.__version__

'0.24.1'

#### reading the data and converting the first column (`day`) from str to date

In [22]:
# Load the semicolon-separated weather file and parse the 'day' column as dates.
df = pd.read_csv(
    'weather_data.csv',
    sep=';',
    parse_dates=['day'],
)
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,8.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cludy
8,2017-01-11,40.0,12.0,Sunny


#### checking the change in column type

In [23]:
# Show the Python type of the first value in the 'day' column.
type(df['day'][0])

pandas._libs.tslibs.timestamps.Timestamp

#### changing index to day column

In [24]:
# Make the 'day' column the row index, modifying df in place.
df.set_index(keys='day', inplace=True)

In [25]:
# Display df after 'day' has become the index.
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,8.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### filling all NA values with 0

In [26]:
# Replace every NA in every column with 0 (df itself is left unchanged).
new_df = df.fillna(value=0)

In [27]:
# Display the zero-filled copy; note the text column 'event' also receives 0.
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,8.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### filling NA values with a different fill value for each column

In [28]:
# Per-column fill values: the numeric columns get 0, 'event' gets a
# placeholder label.
na_replacements = {
    'temperature': 0,
    'windspeed': 0,
    'event': 'no event',
}
new_df1 = df.fillna(value=na_replacements)
new_df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,8.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,no event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,no event
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### forward fill: filling NA with the last valid value above it in the same column

In [31]:
# Forward fill: each NA takes the last valid value above it in its column.
# DataFrame.ffill() is equivalent to fillna(method='ffill'); the `method`
# argument of fillna is deprecated in newer pandas releases, so the
# dedicated method is used instead.
new_df2 = df.ffill()
new_df2

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,8.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### backward fill: filling NA with the next valid value below it in the same column

In [33]:
# Backward fill: each NA takes the next valid value below it in its column.
# DataFrame.bfill() is equivalent to fillna(method='bfill'); the `method`
# argument of fillna is deprecated in newer pandas releases, so the
# dedicated method is used instead.
new_df3 = df.bfill()
new_df3

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,8.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,8.0,Rain
2017-01-08,34.0,8.0,Sunny
2017-01-09,34.0,8.0,Cludy
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### filling NA horizontally: backward fill along each row, taking the value from the next column

In [34]:
# Backward fill along each row: an NA takes the value of the next column to
# its right. Because the columns have different types, text values such as
# 'Snow' can end up in numeric columns.
# DataFrame.bfill(axis=...) replaces the deprecated fillna(method='bfill').
new_df4 = df.bfill(axis="columns")
new_df4

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,8,8,Sunny
2017-01-05,28,Snow,Snow
2017-01-06,7,7,
2017-01-07,32,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34,8,Cludy
2017-01-11,40,12,Sunny


#### forward fill, filling at most 1 consecutive NA after each valid value

In [35]:
# Forward fill, but propagate each valid value into at most one following NA;
# longer runs of NA stay partly unfilled.
# DataFrame.ffill(limit=...) replaces the deprecated fillna(method='ffill').
new_df5 = df.ffill(limit=1)
new_df5

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,8.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### filling NA with linearly interpolated values

In [36]:
# Linear interpolation (the default method): rows are treated as equally
# spaced, so the dates in the index are ignored.
new_df6 = df.interpolate(method='linear')
new_df6

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,8.0,Sunny
2017-01-05,28.0,7.5,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### filling NA with linearly interpolated values, weighted by the time gaps in the date index

In [37]:
# 'time' interpolation uses the spacing between the index dates, so a value
# closer in time to a known point lands closer to that point's value.
new_df7 = df.interpolate(
    method="time",
)
new_df7

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,8.0,Sunny
2017-01-05,28.0,7.5,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### dropping rows that have an NA value in any column

In [38]:
# Keep only fully populated rows: a row is dropped if any column is NA
# ('any' is the default behaviour of dropna).
new_df8 = df.dropna(how='any')
new_df8

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### dropping only the rows where all columns are NA

In [39]:
# Drop a row only when every one of its columns is NA.
new_df9 = df.dropna(
    how="all",
)
new_df9

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,8.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### dropping rows with NA, keeping rows with at least 1 column with a value (same result as how = "all")

In [42]:
# thresh is the minimum number of non-NA values a row needs in order to be kept.
min_non_na = 1
new_df10 = df.dropna(thresh=min_non_na)
new_df10

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,8.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### eliminating rows with NA, keeping rows with at least 2 columns with values

In [43]:
# Keep only the rows that have at least 2 non-NA values.
min_non_na = 2
new_df11 = df.dropna(thresh=min_non_na)
new_df11

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,8.0,Sunny
2017-01-05,28.0,,Snow
2017-01-07,32.0,,Rain
2017-01-10,34.0,8.0,Cludy
2017-01-11,40.0,12.0,Sunny


#### reindexing on a complete daily date range to add rows for the missing dates

In [46]:
# Build the complete daily range for the period. ISO 8601 dates (YYYY-MM-DD)
# are used so that day and month cannot be confused.
dt = pd.date_range("2017-01-01", "2017-01-11")
# date_range already returns a DatetimeIndex, so no further conversion is needed.
idx = dt
# Reindexing adds a row for every date missing from df; the new rows are all NA.
df = df.reindex(idx)
df

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,8.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cludy
