In [1]:
import pandas as pd 
# parse_dates converts the 'day' column to Timestamps, so dates are shown
# as 2017-01-01 instead of 1/1/2017
df = pd.read_csv('weather_data.csv', parse_dates=['day'])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [8]:
# confirm that parse_dates produced Timestamp objects rather than strings
type(df.loc[0, 'day'])

pandas._libs.tslibs.timestamps.Timestamp

In [9]:
# single-cell lookup: row label 2, column 'temperature'
df.loc[2, 'temperature']

28.0

### Fillna

**Fill all NaN with a specific value**

In [10]:
# original frame for reference: the empty cells are the NaNs to be filled
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [11]:
# replace every NaN, in every column, with the same placeholder value
new_df = df.fillna(value='XX')
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,XX,9.0,Sunny
2,2017-01-05,28.0,XX,Snow
3,2017-01-06,XX,7.0,XX
4,2017-01-07,32.0,XX,Rain
5,2017-01-08,XX,XX,Sunny
6,2017-01-09,XX,XX,XX
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [12]:
# per-column fill values: keys are column names, values are what replaces
# NaN in that column (columns not listed are left untouched)
fill_values = {
    'temperature': 0,
    'windspeed': df['windspeed'].mean(),
    'event': 'No event',
}

new_df = df.fillna(value=fill_values)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,0.0,9.0,Sunny
2,2017-01-05,28.0,8.4,Snow
3,2017-01-06,0.0,7.0,No event
4,2017-01-07,32.0,8.4,Rain
5,2017-01-08,0.0,8.4,Sunny
6,2017-01-09,0.0,8.4,No event
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [13]:
# df still has its NaNs: fillna returned new frames and did not modify df
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


**different methods of filling data**

In [14]:
# forward fill: each NaN takes the previous known value in its column.
# df.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases.
new_df = df.ffill()
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,32.0,9.0,Sunny
2,2017-01-05,28.0,9.0,Snow
3,2017-01-06,28.0,7.0,Snow
4,2017-01-07,32.0,7.0,Rain
5,2017-01-08,32.0,7.0,Sunny
6,2017-01-09,32.0,7.0,Sunny
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [16]:
# backward fill: each NaN takes the next known value in its column.
# df.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in recent pandas releases.
new_df = df.bfill()
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,28.0,9.0,Sunny
2,2017-01-05,28.0,7.0,Snow
3,2017-01-06,32.0,7.0,Rain
4,2017-01-07,32.0,8.0,Rain
5,2017-01-08,34.0,8.0,Sunny
6,2017-01-09,34.0,8.0,Cloudy
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


**use of axis**

In [18]:
# backward fill along each row instead of down each column: a NaN takes the
# value from the next column to its right, so the last column's NaNs stay NaN.
# Because the columns hold different kinds of data, values can spill across
# them (e.g. an event name ending up under windspeed).
# df.bfill(axis=...) replaces the deprecated fillna(method='bfill', axis=...).
new_df = df.bfill(axis='columns')
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,9.0,9.0,Sunny
2,2017-01-05,28.0,Snow,Snow
3,2017-01-06,7.0,7.0,
4,2017-01-07,32.0,Rain,Rain
5,2017-01-08,Sunny,Sunny,Sunny
6,2017-01-09,NaT,NaT,NaT
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


**limit parameter**

In [19]:
# original frame again, as the baseline for the limit example
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [20]:
# forward fill, but fill at most 2 consecutive NaNs per gap; a third NaN in
# a row is left as NaN (see windspeed on 2017-01-09).
# df.ffill(limit=...) replaces the deprecated fillna(method='ffill', limit=...).
new_df = df.ffill(limit=2)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,32.0,9.0,Sunny
2,2017-01-05,28.0,9.0,Snow
3,2017-01-06,28.0,7.0,Snow
4,2017-01-07,32.0,7.0,Rain
5,2017-01-08,32.0,7.0,Sunny
6,2017-01-09,32.0,,Sunny
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


### Interpolate

In [2]:
# linear interpolation (the default method): rows are treated as equally
# spaced, so a single missing value becomes the midpoint of its neighbours
new_df = df.interpolate(method='linear')
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,30.0,9.0,Sunny
2,2017-01-05,28.0,8.0,Snow
3,2017-01-06,30.0,7.0,
4,2017-01-07,32.0,7.25,Rain
5,2017-01-08,32.666667,7.5,Sunny
6,2017-01-09,33.333333,7.75,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [3]:
# index the frame by date so that time-based interpolation can use the real
# gaps between days; rebind df rather than mutating it with inplace=True
df = df.set_index('day')


In [4]:
# time-weighted interpolation: uses the DatetimeIndex, so a missing value is
# placed according to how far its date is from the neighbouring known dates
new_df = df.interpolate(method='time')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


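**Notice that the temperature on 2017-01-04 above is 29.0 instead of the 30.0 given by plain linear interpolation: `method='time'` weights by the actual gap between dates, and 2017-01-04 is three days after 2017-01-01 but only one day before 2017-01-05.**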

**There are many ways to interpolate, such as quadratic, piecewise_polynomial, cubic, etc. Search for 'pandas DataFrame.interpolate' to see the full documentation.**

### Dropna

In [5]:
# original frame (now indexed by day), NaNs still present, before dropping rows
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [6]:
# how='any' is the default: drop a row if it has at least one NaN,
# which leaves only the three fully populated rows
new_df = df.dropna(how='any')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [7]:
# drop only the rows in which every column is NaN (here just 2017-01-09)
new_df = df.dropna(axis='index', how='all')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [15]:
# thresh is the minimum number of non-NaN values a row needs in order to be
# kept; rows with fewer than 2 real values are dropped
new_df = df.dropna(axis='index', thresh=2)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-07,32.0,,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### Inserting Missing Dates

In [30]:
# Build the complete daily calendar for the period and reindex against it, so
# days that are absent from the data (e.g. 2017-01-02, 2017-01-03) show up as
# all-NaN rows. ISO dates (YYYY-MM-DD) avoid any month/day ambiguity.
dt = pd.date_range("2017-01-01", "2017-01-11")
idx = dt  # date_range already returns a DatetimeIndex; no conversion needed
df.reindex(idx)

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy


In [32]:
# drop a column by name; returns a new frame and leaves df untouched
newdf = df.drop(columns=["windspeed"])
newdf

Unnamed: 0_level_0,temperature,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01,32.0,Rain
2017-01-04,,Sunny
2017-01-05,28.0,Snow
2017-01-06,,
2017-01-07,32.0,Rain
2017-01-08,,Sunny
2017-01-09,,
2017-01-10,34.0,Cloudy
2017-01-11,40.0,Sunny
