# Data Preprocessing for Machine Learning
- Feature Engineering
- Data Cleaning

In [324]:
%run setup.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [325]:
from utils import (smallest_negative_timedelta)

In [326]:
%store -r df

In [327]:
df.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_lat,start_lng,end_lat,end_lng,member_casual,start_is_within_city,end_is_within_city,ride_is_within_city,ride_duration,start_station_id,end_station_id
0,classic_bike,2021-01-01 00:08:33,2021-01-01 00:33:53,38.88044,-77.025238,38.8946,-77.072304,member,True,False,False,1520,514,667.0
1,electric_bike,2021-01-01 00:13:43,2021-01-01 00:29:34,38.917191,-77.025887,38.959999,-77.019997,casual,True,True,True,951,7,
2,electric_bike,2021-01-01 00:14:32,2021-01-01 00:28:45,38.91214,-77.038567,38.920872,-77.031693,member,True,True,True,853,83,40.0
3,classic_bike,2021-01-01 00:15:45,2021-01-01 00:21:20,38.879478,-77.114563,38.879478,-77.114563,member,False,False,False,335,805,805.0
4,classic_bike,2021-01-01 00:17:46,2021-01-01 00:21:00,38.93132,-77.028252,38.931992,-77.032959,member,True,True,True,194,21,50.0


In [328]:
weather_data = pd.read_parquet(config['processed_data_paths']['hourly'])
weather_data = weather_data.set_index('datetime')

weather_data['temperature_apparent'] = thermofeel.kelvin_to_celsius(
    thermofeel.calculate_apparent_temperature(thermofeel.celsius_to_kelvin(weather_data.temperature_2m), 
                                              weather_data.windspeed_10m, 
                                              weather_data.relativehumidity_2m)
                                              )

weather_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26269 entries, 2021-01-01 00:00:00 to 2023-12-31 23:00:00
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   casual                26269 non-null  int64  
 1   member                26269 non-null  int64  
 2   total_rides           26269 non-null  int64  
 3   relativehumidity_2m   26269 non-null  int64  
 4   temperature_2m        26269 non-null  float64
 5   wmo_code              26269 non-null  int64  
 6   windspeed_10m         26269 non-null  float64
 7   temperature_apparent  26269 non-null  float64
dtypes: float64(3), int64(5)
memory usage: 1.8 MB


# Preprocess Data

In [329]:
# Ensure origin_id != dest_id for actual trips
rides = df[df['start_station_id'] != df['end_station_id']]

In [330]:
rides.columns

Index(['rideable_type', 'started_at', 'ended_at', 'start_lat', 'start_lng',
       'end_lat', 'end_lng', 'member_casual', 'start_is_within_city',
       'end_is_within_city', 'ride_is_within_city', 'ride_duration',
       'start_station_id', 'end_station_id'],
      dtype='object')

# Create hourly data over all stations

In [331]:
grouper_start = pd.Grouper(key='started_at', level=None, freq='h', axis=0, sort=False)
grouper_end = pd.Grouper(key='ended_at', level=None, freq='h', axis=0, sort=False)

In [332]:
# create hourly demand and supply for all station  
hourly_start = rides.groupby(by=grouper_start).agg(cnt_out=('start_station_id','size'), ride_duration_out=('ride_duration','mean'))
hourly_start.index = hourly_start.index.rename('interval')
hourly_start = hourly_start.reset_index()
hourly_start.cnt_out = hourly_start.cnt_out.astype('uint32[pyarrow]')
# hourly_rides_end = pd.merge(hourly_rides_end, stations[['lat_median', 'lng_median']], left_on=['end_station_id'], right_index=True, how='left')
# hourly_rides_end = hourly_rides_end.rename(columns={'ended_at':'hour', 'end_station_id':'station_id', 'lat_median':'lat', 'lng_median':'lng'})
# hourly_rides_end

hourly_end = rides.groupby(by=grouper_end).agg(cnt_in=('end_station_id','size'), ride_duration_in=('ride_duration','mean'))
hourly_end.index = hourly_end.index.rename('interval')
hourly_end = hourly_end.reset_index()
hourly_end.cnt_in = hourly_end.cnt_in.astype('uint32[pyarrow]')
# hourly_rides_start = pd.merge(hourly_rides_start, stations[['lat_median', 'lng_median']], left_on=['start_station_id'], right_index=True, how='left')
# hourly_rides_start = hourly_rides_start.rename(columns={'started_at':'hour', 'start_station_id':'station_id', 'lat_median':'lat', 'lng_median':'lng'})
# hourly_rides_start

hourly_all_stations = pd.merge(hourly_start, hourly_end, on='interval', how='outer')

In [333]:
hourly_all_stations = hourly_all_stations.set_index('interval')

In [334]:
hourly_all_stations.head()

Unnamed: 0_level_0,cnt_out,ride_duration_out,cnt_in,ride_duration_in
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01 00:00:00,29,1095.172414,21,774.809524
2021-01-01 01:00:00,33,870.636364,37,776.72973
2021-01-01 02:00:00,31,660.16129,27,526.0
2021-01-01 03:00:00,11,690.0,17,1511.823529
2021-01-01 04:00:00,10,1120.9,10,1081.3


# Create hourly data per station

In [335]:
# create hourly demand and supply for individual stations
hourly_start = rides.groupby(by=[grouper_start, 'start_station_id']).agg(cnt_out=('start_station_id','size'), ride_duration_out=('ride_duration','mean'))
hourly_start.index = hourly_start.index.rename(('interval', 'station_id'))
hourly_start = hourly_start.reset_index()
hourly_start.cnt_out = hourly_start.cnt_out.astype('uint32[pyarrow]')
# hourly_rides_end = pd.merge(hourly_rides_end, stations[['lat_median', 'lng_median']], left_on=['end_station_id'], right_index=True, how='left')
# hourly_rides_end = hourly_rides_end.rename(columns={'ended_at':'hour', 'end_station_id':'station_id', 'lat_median':'lat', 'lng_median':'lng'})
# hourly_rides_end

hourly_end = rides.groupby(by=[grouper_end, 'end_station_id']).agg(cnt_in=('end_station_id','size'), ride_duration_in=('ride_duration','mean'))
hourly_end.index = hourly_end.index.rename(('interval', 'station_id'))
hourly_end = hourly_end.reset_index()
hourly_end.cnt_in = hourly_end.cnt_in.astype('uint32[pyarrow]')
# hourly_rides_start = pd.merge(hourly_rides_start, stations[['lat_median', 'lng_median']], left_on=['start_station_id'], right_index=True, how='left')
# hourly_rides_start = hourly_rides_start.rename(columns={'started_at':'hour', 'start_station_id':'station_id', 'lat_median':'lat', 'lng_median':'lng'})
# hourly_rides_start

hourly = pd.merge(hourly_start, hourly_end, on=['interval', 'station_id'], how='outer')
# hourly = pd.merge(hourly_start, hourly_end, left_index=True, right_index=True, how='outer')
# hourly = hourly.reset_index()

In [336]:
# set index
hourly = hourly.set_index(['interval', 'station_id'])

# Feature Engineering

## Time Features

In [337]:
def smallest_negative_timedelta(date: pd.Series, date_set):
    # Ensure single_date is a pandas Timestamp for consistent operations
    # date = pd.to_datetime(date)

    # Calculate all timedeltas using broadcasting.
    # series_date_np (N,) becomes (N, 1) and date_set_np (M,) becomes (1, M).
    # The result `timedeltas` will be an (N, M) array.
    timedeltas = date.values[:, np.newaxis] - date_set[np.newaxis, :]
    timedeltas *= -1

    # Find negative timedeltas. Non-negative ones are replaced with NaT.
    # This comparison and selection is also vectorized.
    negative_timedeltas = np.where(timedeltas < pd.Timedelta(0), timedeltas, pd.NaT)

    # Find the maximum (closest to zero) of the negative timedeltas for each row (series_date element).
    # np.nanmax efficiently ignores NaT values when finding the maximum.
    result_values = np.nanmax(negative_timedeltas, axis=1)

    # Convert the resulting NumPy array back to a Pandas Series, preserving the original index.
    return pd.Series(result_values, index=date.index)

In [338]:
# Convert date_set to a NumPy array for efficient broadcasting
us_holidays = np.array(list(pd.Series(US_HOLIDAYS)), dtype='datetime64[ns]')

In [339]:
# Time Features
def add_time_features(df):

    if isinstance(df.index, pd.MultiIndex):
        # Access the first level of the MultiIndex
        interval_index = df.index.get_level_values(0)
    else:
        # Access the single level index
        interval_index = df.index.values

    interval_series = pd.Series(interval_index, name='interval', index=df.index)

    df['is_holiday']                = interval_series.dt.date.isin(US_HOLIDAYS.date)
    df['time_since_last_holiday']   = smallest_negative_timedelta(interval_series, us_holidays) / 1e9 / 60
    df['month']                     = interval_series.dt.month
    df['day_of_week']               = interval_series.dt.day_of_week
    df['time_of_day']               = interval_series.dt.hour
    df['is_weekend']                = interval_series.dt.day_of_week.isin({5,6})#([0,1,2,3,4])
    # return interval_series

add_time_features(hourly_all_stations)
add_time_features(hourly)

  result_values = np.nanmax(negative_timedeltas, axis=1)
  result_values = np.nanmax(negative_timedeltas, axis=1)


In [340]:
hourly

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_out,ride_duration_out,cnt_in,ride_duration_in,is_holiday,time_since_last_holiday,month,day_of_week,time_of_day,is_weekend
interval,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-01 00:00:00,1,1,743.0,,,True,,1,4,0,False
2021-01-01 00:00:00,4,,,1,474.0,True,,1,4,0,False
2021-01-01 00:00:00,16,,,1,585.0,True,,1,4,0,False
2021-01-01 00:00:00,19,1,603.0,1,517.0,True,,1,4,0,False
2021-01-01 00:00:00,21,1,194.0,,,True,,1,4,0,False
...,...,...,...,...,...,...,...,...,...,...,...
2024-01-01 02:00:00,144,,,1,7784.0,True,-120.0,1,0,2,False
2024-01-01 02:00:00,598,,,1,10543.0,True,-120.0,1,0,2,False
2024-01-01 06:00:00,282,,,1,32321.0,True,-360.0,1,0,6,False
2024-01-01 12:00:00,621,,,2,3490.5,True,-720.0,1,0,12,False


In [341]:
hourly_all_stations

Unnamed: 0_level_0,cnt_out,ride_duration_out,cnt_in,ride_duration_in,is_holiday,time_since_last_holiday,month,day_of_week,time_of_day,is_weekend
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-01-01 00:00:00,29,1095.172414,21,774.809524,True,,1,4,0,False
2021-01-01 01:00:00,33,870.636364,37,776.729730,True,-60.0,1,4,1,False
2021-01-01 02:00:00,31,660.161290,27,526.000000,True,-120.0,1,4,2,False
2021-01-01 03:00:00,11,690.000000,17,1511.823529,True,-180.0,1,4,3,False
2021-01-01 04:00:00,10,1120.900000,10,1081.300000,True,-240.0,1,4,4,False
...,...,...,...,...,...,...,...,...,...,...
2024-01-01 11:00:00,,,0,,True,-660.0,1,0,11,False
2024-01-01 12:00:00,,,2,3490.500000,True,-720.0,1,0,12,False
2024-01-01 13:00:00,,,0,,True,-780.0,1,0,13,False
2024-01-01 14:00:00,,,0,,True,-840.0,1,0,14,False


## Lag Features

In [342]:
# all stations
# demand: cnt_out
hourly_all_stations['cnt_out_lag1h'] = hourly_all_stations['cnt_out'].shift(freq='1h', periods=-1)
hourly_all_stations['cnt_out_lag1d'] = hourly_all_stations['cnt_out'].shift(freq='1h', periods=-24)
hourly_all_stations['cnt_out_lag1w'] = hourly_all_stations['cnt_out'].shift(freq='1h', periods=-(7*24))

# supply: cnt_in
hourly_all_stations['cnt_in_lag1h'] = hourly_all_stations['cnt_in'].shift(freq='1h', periods=-1)
hourly_all_stations['cnt_in_lag1d'] = hourly_all_stations['cnt_in'].shift(freq='1h', periods=-24)
hourly_all_stations['cnt_in_lag1w'] = hourly_all_stations['cnt_in'].shift(freq='1h', periods=-(7*24))

In [343]:
# hourly_all_stations[['cnt_out', 'cnt_out_lag1h', 'cnt_out_lag1d', 'cnt_out_lag1w']]

In [344]:
hourly_sample = hourly.sample(10000)
# hourly_sample = hourly_sample.drop(columns=['cnt_out_x', 'cnt_out_y', 'cnt_out_lag1h'])
hourly_sample

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_out,ride_duration_out,cnt_in,ride_duration_in,is_holiday,time_since_last_holiday,month,day_of_week,time_of_day,is_weekend
interval,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-01-03 17:00:00,247,,,1,401.000000,False,-2460.0,1,1,17,False
2023-04-12 18:00:00,153,3,1355.666667,2,368.000000,False,-74520.0,4,2,18,False
2022-08-13 19:00:00,195,4,2166.000000,6,1293.500000,False,-58740.0,8,5,19,True
2023-06-22 09:00:00,187,3,793.333333,2,968.500000,False,-4860.0,6,3,9,False
2022-09-25 17:00:00,63,2,671.000000,,,False,-29820.0,9,6,17,True
...,...,...,...,...,...,...,...,...,...,...,...
2021-10-23 23:00:00,17,6,664.000000,,,False,-18660.0,10,5,23,True
2023-06-10 08:00:00,671,,,1,255.000000,False,-17760.0,6,5,8,True
2022-03-07 15:00:00,200,1,2715.000000,,,False,-21060.0,3,0,15,False
2022-05-11 08:00:00,185,,,3,489.666667,False,-114240.0,5,2,8,False


In [345]:
# hourly_sample = hourly_sample.rename({'count_out_x':'count_out'}).drop('cnt_out_y')

In [346]:
hourly_sample.columns

Index(['cnt_out', 'ride_duration_out', 'cnt_in', 'ride_duration_in',
       'is_holiday', 'time_since_last_holiday', 'month', 'day_of_week',
       'time_of_day', 'is_weekend'],
      dtype='object')

In [360]:
def shift_station_timeseries(time_series_df: pd.DataFrame, col, station_id, periods, suffix):
    '''
    time_series: dataframe with 2-level-index (interval, station_id)
    '''
    idx = pd.IndexSlice
    shifted = time_series_df.loc[idx[:, station_id], col].reset_index(level='station_id').shift(freq='1h', periods=periods).set_index('station_id', append=True)
    shifted = shifted.rename(columns={col : col+suffix})
    return shifted
# idx = pd.IndexSlice
# test = hourly.loc[idx[:, 1], 'cnt_out']
# shifted = test.reset_index(level='station_id').shift(freq='1h', periods=-1).set_index('station_id', append=True).rename('cnt_out_lag1h')
# pd.merge(test, shifted, right_index=True, left_index=True, how='outer')
#hourly.loc[idx[:, 1], :]


In [367]:
station_ids = hourly.index.get_level_values('station_id').unique()
shifted_lag1h = pd.concat([shift_station_timeseries(hourly, 'cnt_out', station_id, -1   , '_lag1h') for station_id in station_ids])
shifted_lag1d = pd.concat([shift_station_timeseries(hourly, 'cnt_out', station_id, -24  , '_lag1d') for station_id in station_ids])
shifted_lag1w = pd.concat([shift_station_timeseries(hourly, 'cnt_out', station_id, -24*7, '_lag1w') for station_id in station_ids])

In [368]:
hourly = pd.merge(hourly, shifted_lag1h, right_index=True, left_index=True, how='outer')
hourly = pd.merge(hourly, shifted_lag1w, right_index=True, left_index=True, how='outer')
hourly = pd.merge(hourly, shifted_lag1d, right_index=True, left_index=True, how='outer')

In [369]:
hourly

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_out,ride_duration_out,cnt_in,ride_duration_in,is_holiday,time_since_last_holiday,month,day_of_week,time_of_day,is_weekend,cnt_out_lag1h,cnt_out_lag1w,cnt_out_lag1d
interval,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-12-25 00:00:00,1,,,,,,,,,,,,1,
2020-12-25 00:00:00,4,,,,,,,,,,,,,
2020-12-25 00:00:00,16,,,,,,,,,,,,,
2020-12-25 00:00:00,19,,,,,,,,,,,,1,
2020-12-25 00:00:00,21,,,,,,,,,,,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-01 06:00:00,282,,,1,32321.0,True,-360.0,1.0,0.0,6.0,False,,,
2024-01-01 11:00:00,621,,,,,,,,,,,,,
2024-01-01 12:00:00,621,,,2,3490.5,True,-720.0,1.0,0.0,12.0,False,,,
2024-01-01 14:00:00,89,,,,,,,,,,,,,


In [None]:
# TODO Review
# # hourly stations
# # hourly['cnt_out_lag1h'] = hourly.index.get_level_values('station_id').map(my_function)

# shift_1h = lambda g: g.reset_index(level='station_id', drop=True)['cnt_out'].shift(freq='1h', periods=-1)
# shift_1d = lambda g: g.reset_index(level='station_id', drop=True)['cnt_out'].shift(freq='1h', periods=-24)
# shift_1w = lambda g: g.reset_index(level='station_id', drop=True)['cnt_out'].shift(freq='1h', periods=-(24*7))

# test = hourly_sample.groupby(level='station_id').apply(shift_1h)

# # hourly['cnt_out_lag1h'] = hourly.groupby(level='interval', group_keys=False)['cnt_out'].transform(lambda x: x.shift(freq='1h'))


# # for name, grp in hourly['cnt_out'].groupby(level='station_id'):
# #     grp['cnt_out_lag1h'] = 0
# # this will not take the value, i guess because it's working on a damn copy


In [177]:
hourly

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_out,ride_duration_out,cnt_in,ride_duration_in,is_holiday,time_since_last_holiday,month,day_of_week,time_of_day,is_weekend
interval,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-01 00:00:00,1,1,743.0,,,True,,1,4,0,False
2021-01-01 00:00:00,4,,,1,474.0,True,,1,4,0,False
2021-01-01 00:00:00,16,,,1,585.0,True,,1,4,0,False
2021-01-01 00:00:00,19,1,603.0,1,517.0,True,,1,4,0,False
2021-01-01 00:00:00,21,1,194.0,,,True,,1,4,0,False
...,...,...,...,...,...,...,...,...,...,...,...
2024-01-01 02:00:00,144,,,1,7784.0,True,,1,0,2,False
2024-01-01 02:00:00,598,,,1,10543.0,True,,1,0,2,False
2024-01-01 06:00:00,282,,,1,32321.0,True,,1,0,6,False
2024-01-01 12:00:00,621,,,2,3490.5,True,,1,0,12,False


## Weather Data

In [17]:
# weather
def merge_hourly_weather_data(df, weather_data):
    return pd.merge(
        df,
        weather_data[['relativehumidity_2m', 'temperature_2m', 'wmo_code', 'windspeed_10m', 'temperature_apparent']],
        left_on='interval',
        right_index=True,
        how='left')

hourly = merge_hourly_weather_data(hourly, weather_data)
hourly_all_stations = merge_hourly_weather_data(hourly_all_stations, weather_data)
# hourly_all_stations.head()

# Cleaning hourly per station

In [18]:
hourly.isna().sum()

interval                         0
station_id                       0
cnt_out                    1473517
ride_duration_out          1473517
cnt_in                     1517828
ride_duration_in           1517828
is_holiday                       0
time_since_last_holiday         32
month                            0
day_of_week                      0
time_of_day                      0
is_weekend                       0
relativehumidity_2m             19
temperature_2m                  19
wmo_code                        19
windspeed_10m                   19
temperature_apparent            19
dtype: int64

In [19]:
hourly.cnt_in = hourly.cnt_in.fillna(0)
hourly.cnt_out = hourly.cnt_out.fillna(0)
hourly.time_since_last_holiday = hourly.time_since_last_holiday.fillna(0)

  hourly.time_since_last_holiday = hourly.time_since_last_holiday.fillna(0)


In [20]:
hourly[hourly.relativehumidity_2m.isna() | hourly.temperature_2m.isna()]

Unnamed: 0,interval,station_id,cnt_out,ride_duration_out,cnt_in,ride_duration_in,is_holiday,time_since_last_holiday,month,day_of_week,time_of_day,is_weekend,relativehumidity_2m,temperature_2m,wmo_code,windspeed_10m,temperature_apparent
115964,2021-02-19 02:00:00,57,0,,1,512.0,False,-5880.0,2,4,2,False,,,,,
1519016,2022-01-04 02:00:00,98,0,,1,8447.0,False,-5880.0,1,1,2,False,,,,,
5330564,2024-01-01 00:00:00,85,0,,1,960.0,True,-10080.0,1,0,0,False,,,,,
5330565,2024-01-01 00:00:00,89,0,,4,2103.5,True,-10080.0,1,0,0,False,,,,,
5330566,2024-01-01 00:00:00,91,0,,1,1712.0,True,-10080.0,1,0,0,False,,,,,
5330567,2024-01-01 00:00:00,109,0,,1,1321.0,True,-10080.0,1,0,0,False,,,,,
5330568,2024-01-01 00:00:00,152,0,,1,1673.0,True,-10080.0,1,0,0,False,,,,,
5330569,2024-01-01 00:00:00,215,0,,1,2563.0,True,-10080.0,1,0,0,False,,,,,
5330570,2024-01-01 00:00:00,410,0,,1,1063.0,True,-10080.0,1,0,0,False,,,,,
5330571,2024-01-01 00:00:00,440,0,,6,1965.666667,True,-10080.0,1,0,0,False,,,,,


In [21]:
# drop above entries, not relevant
hourly = hourly.dropna(subset=['relativehumidity_2m', 'temperature_2m', 'wmo_code', 'windspeed_10m', 'temperature_apparent'])

# Cleaning hourly all stations

In [22]:
hourly_all_stations.isna().sum()

interval                    0
cnt_out                    16
ride_duration_out          63
cnt_in                      0
ride_duration_in           54
is_holiday                  0
time_since_last_holiday     1
month                       0
day_of_week                 0
time_of_day                 0
is_weekend                  0
relativehumidity_2m        27
temperature_2m             27
wmo_code                   27
windspeed_10m              27
temperature_apparent       27
dtype: int64

In [23]:
hourly_all_stations.cnt_in = hourly_all_stations.cnt_in.fillna(0)
hourly_all_stations.cnt_out = hourly_all_stations.cnt_out.fillna(0)
hourly_all_stations.time_since_last_holiday = hourly_all_stations.time_since_last_holiday.fillna(0)

  hourly_all_stations.time_since_last_holiday = hourly_all_stations.time_since_last_holiday.fillna(0)


In [24]:
hourly_all_stations = hourly_all_stations.dropna(subset=['relativehumidity_2m', 'temperature_2m', 'wmo_code', 'windspeed_10m', 'temperature_apparent'])
# hourly_all_stations = hourly_all_stations.set_index('interval')

In [25]:
hourly_all_stations = hourly_all_stations.set_index(pd.to_datetime(hourly_all_stations.index).date)

In [26]:
hourly_all_stations = hourly_all_stations.reset_index()

# Export

In [27]:
%store hourly hourly_all_stations

Stored 'hourly' (DataFrame)
Stored 'hourly_all_stations' (DataFrame)
