In [1]:
import pandas as pd

# Load the weather CSV with default parsing (no date conversion is requested here)
# and show the resulting frame.
df = pd.read_csv(filepath_or_buffer="weather_data.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.1,8.1,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [4]:
# Re-read the file, asking pandas to convert the `day` column from strings
# to datetimes while loading.
df = pd.read_csv("weather_data.csv", parse_dates=["day"])
print(df)
# Confirm the conversion by printing the Python type of the first `day` value.
print("day type =>", type(df["day"].iloc[0]))

         day  temperature  windspeed   event
0 2017-01-01         32.0        6.0    Rain
1 2017-01-04          NaN        9.0   Sunny
2 2017-01-05         28.0        NaN    Snow
3 2017-01-06          NaN        7.0     NaN
4 2017-01-07         32.0        NaN    Rain
5 2017-01-08          NaN        NaN   Sunny
6 2017-01-09          NaN        NaN     NaN
7 2017-01-10         34.1        8.1  Cloudy
8 2017-01-11         40.0       12.0   Sunny
day type => <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [5]:
# Promote the `day` column to the row index.
# inplace=True modifies the existing frame directly; without it, set_index
# returns a new DataFrame and leaves `df` unchanged.
df.set_index(keys="day", inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [6]:
# First way to handle NaN values: replace every one of them with the constant 0,
# modifying `df` in place.
# Note: the 0 is written into every column, including the text column `event`.
df.fillna(value=0, inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [13]:
# Start over from the file so the NaN values are present again; the next step
# fills them with each column's mean instead of 0.
import pandas as pd

# Load with `day` parsed as dates, then make it the row index in one chained expression.
df = pd.read_csv("weather_data.csv", parse_dates=["day"]).set_index("day")
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [16]:
# Second way to handle NaN values: give each column its own replacement.
# Numeric columns get their column mean (mean() skips NaN by default);
# the text column `event` gets a placeholder label.
df.fillna(
    value={
        'temperature': df["temperature"].mean(),
        'windspeed': df["windspeed"].mean(),
        'event': 'No Event',
    },
    inplace=True,
)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.22,9.0,Sunny
2017-01-05,28.0,8.42,Snow
2017-01-06,33.22,7.0,No Event
2017-01-07,32.0,8.42,Rain
2017-01-08,33.22,8.42,Sunny
2017-01-09,33.22,8.42,No Event
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [20]:
# Third way to handle NaN values: forward fill.
import pandas as pd

df = pd.read_csv("weather_data.csv", parse_dates=["day"])
df.set_index("day", inplace=True)

# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method="ffill"), which is deprecated
# and emits a FutureWarning on recent pandas versions.
# Use df.bfill() to propagate the next valid value backwards instead.
df.ffill(inplace=True)
df

  df.fillna(method="ffill", inplace=True)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [23]:
import pandas as pd

df = pd.read_csv("weather_data.csv", parse_dates=["day"])
df.set_index("day", inplace=True)

# A more realistic fill: linear interpolation. Each run of NaN values is replaced
# by evenly spaced values between the known values on either side of it.
# Only the numeric columns are interpolated: calling interpolate() on a frame that
# still contains the text column `event` is deprecated and emits a FutureWarning.
# `event` is left as-is, so its NaN values remain.
# Note: the default method="linear" treats rows as equally spaced and ignores the
# gaps between dates in the index; method="time" would weight by the date gaps.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].interpolate()

df

  df.interpolate(inplace=True)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.275,Rain
2017-01-08,32.7,7.55,Sunny
2017-01-09,33.4,7.825,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [25]:
import pandas as pd

# Fresh copy of the data, with `day` parsed as dates and used as the row index.
df = pd.read_csv("weather_data.csv", parse_dates=["day"]).set_index("day")

# Last option: drop incomplete rows instead of filling them.
# Variants of the same call:
#   df.dropna(how="all", inplace=True)  -> drop a row only when every value in it is NaN
#   df.dropna(thresh=2, inplace=True)   -> keep only rows with at least 2 non-NaN values
# The call below spells out the defaults: drop every row that has at least one NaN.
df.dropna(axis=0, how="any", inplace=True)

df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny
