In [36]:
import pandas as pd
# Load the weather data; parse_dates converts the "day" column from
# strings into datetime values while the file is being read.
df = pd.read_csv(
    "../datasets/weather_data.csv",
    parse_dates=["day"],
)

In [35]:
# Make "day" the row index.
# set_index returns a NEW dataframe and leaves the original untouched, so the
# result has to be assigned back to df (passing inplace=True is the alternative).
# The guard keeps this cell re-runnable: once "day" is the index it is no longer
# a column, and calling set_index("day") a second time would raise a KeyError.
if "day" in df.columns:
    df = df.set_index("day")

In [26]:
# Display the frame to inspect where values are missing.
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [40]:
# A scalar argument replaces every NaN in the frame with that value.
# The result is a NEW dataframe; df itself is not modified.
new_df = df.fillna(0)

In [41]:
# Fill NaN values with a different replacement for each column:
# the dict maps a column name to the value used for that column's gaps.
fill_values = {
    "temperature": 0,
    "windspeed": 0,
    "event": "no_event",
}
new_df = df.fillna(value=fill_values)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,0.0,9.0,Sunny
2,2017-01-05,28.0,0.0,Snow
3,2017-01-06,0.0,7.0,no_event
4,2017-01-07,32.0,0.0,Rain
5,2017-01-08,0.0,0.0,Sunny
6,2017-01-09,0.0,0.0,no_event
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [39]:
# Forward fill: each NaN takes the value of the PREVIOUS data point in its column.
# df.ffill() is the replacement for df.fillna(method="ffill"), whose `method`
# keyword is deprecated in recent pandas releases.
new_df = df.ffill()

# Backward fill: each NaN takes the value of the NEXT data point.
# axis="columns" makes the fill run horizontally along each row instead of
# vertically down each column, so a NaN is filled from the column to its RIGHT
# (values move right-to-left, not up or down).
# Because neighbouring columns hold different kinds of data, text such as
# "Snow" can end up in a numeric column like windspeed.
another_df = df.bfill(axis="columns")
another_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01 00:00:00,32,6,Rain
1,2017-01-04 00:00:00,9,9,Sunny
2,2017-01-05 00:00:00,28,Snow,Snow
3,2017-01-06 00:00:00,7,7,
4,2017-01-07 00:00:00,32,Rain,Rain
5,2017-01-08 00:00:00,Sunny,Sunny,Sunny
6,2017-01-09 00:00:00,,,
7,2017-01-10 00:00:00,34,8,Cloudy
8,2017-01-11 00:00:00,40,12,Sunny


In [32]:
# limit=1 caps the forward fill at ONE consecutive NaN per gap; any further
# NaNs in the same run are left missing.
# df.ffill(limit=...) replaces the deprecated df.fillna(method="ffill", limit=...).
new_df = df.ffill(limit=1)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,32.0,9.0,Sunny
2,2017-01-05,28.0,9.0,Snow
3,2017-01-06,28.0,7.0,Snow
4,2017-01-07,32.0,7.0,Rain
5,2017-01-08,32.0,,Sunny
6,2017-01-09,,,Sunny
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [38]:
# Interpolation fills each NaN with an intermediate value between the previous
# and the next valid data point.
# "linear" is the default method (spelled out here for clarity); it treats the
# rows as equally spaced and does not take the gaps between dates into account.
new_df = df.interpolate(method="linear")
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,30.0,9.0,Sunny
2,2017-01-05,28.0,8.0,Snow
3,2017-01-06,30.0,7.0,
4,2017-01-07,32.0,7.25,Rain
5,2017-01-08,32.666667,7.5,Sunny
6,2017-01-09,33.333333,7.75,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [44]:
# dropna() with no arguments drops every row that contains at least one NaN.
# The result gets its own name so the next assignment does not overwrite it.
rows_without_na = df.dropna()

# how="all" drops a row only when ALL of its values are NaN.
new_df = df.dropna(how="all")
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [None]:
# thresh=N keeps a row only if it has at least N non-NaN values;
# with thresh=1, a row survives as long as it has at least one valid value.
new_df = df.dropna(axis="index", thresh=1)