# Luftdaten data : data cleaning, resampling - mini version
## Code builds a continuous-time tabular version of the Luftdaten data, such that the same time period is present for each sensor in the data, regardless of whether each sensor has data for all the time slots.

## Testing :
- using pd.resample
- constructing a time shift using pandas' own tools, rather than my own


#### Reference documents

Resampling time series data with Pandas ( Ben Alex Keen ) 
http://benalexkeen.com/resampling-time-series-data-with-pandas/

In [76]:
import pandas as pd
import numpy as np
import time

In [77]:
# parameters

# --- time window
# The window is anchored on its end; the start is derived from the period
# settings below instead of being typed in by hand.
end_time = "2019-01-01 11:58:42"

# length of one period of the basic data, in minutes
minutes_per_period = 5
time_frequency_for_periods__for_basic_data = "{}T".format( minutes_per_period )  # "5T"

# 24 hrs * 12 x 5 mins in each hour = 288 periods
# (previously 24*20, which contradicted both this comment and the 5-minute frequency)
num_of_time_periods___for_basic_data = 24 * ( 60 // minutes_per_period )

# start of the window = end_time minus ( number of periods x period length ),
# kept as a string in the same format as end_time
start_time = (
    pd.Timestamp( end_time )
    - pd.Timedelta( minutes=minutes_per_period * num_of_time_periods___for_basic_data )
).strftime( "%Y-%m-%d %H:%M:%S" )

# when generating time periods
sampling_frequency = "3T"


# --- data urls

nordic_midnight_24_hrs_data__url = "/Users/miska/Documents/open_something/luftdaten/luftdaten_code/luftdaten__make_tabular_data__from_db_data/ld_NYE_midnight_24hrs_nordics_all_data_01.csv"

# set the current data source
curr_url = nordic_midnight_24_hrs_data__url

In [78]:
# goal of the next cells: try to convert the timestamp in the data to epoch
# first step: load the CSV selected by curr_url into a DataFrame and check its size

in_data = pd.read_csv( curr_url )
in_data.shape

(127109, 7)

In [79]:
in_data.dtypes

sensor_id         int64
sensor_namee     object
lat             float64
lon             float64
timestamp        object
p1              float64
p2              float64
dtype: object

In [80]:
# parse the timestamp strings (object dtype after read_csv) into pandas datetimes
in_data['timestamp'] = pd.to_datetime( in_data['timestamp'] )

In [81]:
in_data.dtypes

sensor_id                int64
sensor_namee            object
lat                    float64
lon                    float64
timestamp       datetime64[ns]
p1                     float64
p2                     float64
dtype: object

In [82]:
in_data.head()

Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
0,7273,SDS011,60.002,17.846,2018-12-31 11:57:22,3.43,1.56
1,7275,SDS011,57.72,11.888,2018-12-31 11:58:44,482.77,33.82
2,7277,SDS011,59.266,15.23,2018-12-31 11:58:47,5.48,2.47
3,7406,SDS011,56.964,24.128,2018-12-31 11:56:41,11.05,6.62
4,7428,SDS011,59.868,17.624,2018-12-31 11:57:42,1.78,1.02


In [83]:
in_data.tail()

Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
127104,16723,SDS011,57.736,11.894,2019-01-01 11:58:57,16.47,3.4
127105,16815,SDS011,59.462,18.04,2019-01-01 11:59:36,2.67,1.97
127106,17235,SDS011,59.272,17.78,2019-01-01 11:59:41,4.69,1.82
127107,10588,SDS011,55.676,13.346,2019-01-01 11:57:12,14.08,3.68
127108,10647,SDS011,55.608,13.036,2019-01-01 11:58:42,23.42,4.75


In [84]:
# take an independent copy, so that changing its timestamp column leaves in_data untouched
in_data__with_epoch_times = in_data.copy()

In [85]:
# cast the datetime column to int64; the resulting values (see the output below)
# are nanoseconds since the Unix epoch
in_data__with_epoch_times['timestamp'] = in_data['timestamp'].astype( 'int64' )

In [86]:
in_data__with_epoch_times.head()

Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
0,7273,SDS011,60.002,17.846,1546257442000000000,3.43,1.56
1,7275,SDS011,57.72,11.888,1546257524000000000,482.77,33.82
2,7277,SDS011,59.266,15.23,1546257527000000000,5.48,2.47
3,7406,SDS011,56.964,24.128,1546257401000000000,11.05,6.62
4,7428,SDS011,59.868,17.624,1546257462000000000,1.78,1.02


In [87]:
in_data__with_epoch_times.tail()


Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
127104,16723,SDS011,57.736,11.894,1546343937000000000,16.47,3.4
127105,16815,SDS011,59.462,18.04,1546343976000000000,2.67,1.97
127106,17235,SDS011,59.272,17.78,1546343981000000000,4.69,1.82
127107,10588,SDS011,55.676,13.346,1546343832000000000,14.08,3.68
127108,10647,SDS011,55.608,13.036,1546343922000000000,23.42,4.75


In [88]:
# test access to timestamps
# pull the epoch timestamp column out on its own and check how many values it holds
in_data__with_epoch_times__ONLY_TIMESTAMPS = in_data__with_epoch_times['timestamp']
in_data__with_epoch_times__ONLY_TIMESTAMPS.shape

(127109,)

In [89]:
type( in_data__with_epoch_times__ONLY_TIMESTAMPS  )

pandas.core.series.Series

In [90]:
# testing how to access an element.
# - seems a regular [] access is fine
in_data__with_epoch_times__ONLY_TIMESTAMPS[0]

1546257442000000000

In [91]:
### various time operations

In [92]:
end_time

'2019-01-01 11:58:42'

## Doing own time-series construction 

### : extending time-series arrays by cutting-pasting time arrays. 

In [93]:
# NOTE: this binds a second name to the same DataFrame object as in_data (no copy is made)
test_time_series = in_data
test_time_series.shape

(127109, 7)

In [94]:
# first ten rows by position: the earliest readings in the file
data_from_start = test_time_series.iloc[:10]
data_from_start

Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
0,7273,SDS011,60.002,17.846,2018-12-31 11:57:22,3.43,1.56
1,7275,SDS011,57.72,11.888,2018-12-31 11:58:44,482.77,33.82
2,7277,SDS011,59.266,15.23,2018-12-31 11:58:47,5.48,2.47
3,7406,SDS011,56.964,24.128,2018-12-31 11:56:41,11.05,6.62
4,7428,SDS011,59.868,17.624,2018-12-31 11:57:42,1.78,1.02
5,7469,SDS011,56.944,24.142,2018-12-31 11:57:52,8.4,5.54
6,7597,SDS011,59.32,18.064,2018-12-31 11:58:51,3.68,2.0
7,8683,SDS011,59.744,18.206,2018-12-31 11:58:28,3.01,2.8
8,9411,SDS011,59.266,15.23,2018-12-31 11:57:18,3.44,2.18
9,9436,SDS011,59.334,18.034,2018-12-31 11:57:22,2.12,1.9


In [95]:
# ten rows by position from the middle of the file
data_from_middle = test_time_series.iloc[60000:60010]
data_from_middle.head()

Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
60000,12679,SDS011,59.384,17.874,2018-12-31 16:52:43,0.4,0.4
60001,12687,SDS011,59.388,17.798,2018-12-31 16:52:21,2.83,1.9
60002,12691,SDS011,57.636,18.304,2018-12-31 16:51:17,2.4,1.7
60003,12693,SDS011,58.19,12.72,2018-12-31 16:51:03,6.42,2.62
60004,14017,SDS011,59.376,18.01,2018-12-31 16:51:10,4.07,1.94


In [96]:
# last ten rows by position: the latest readings in the file
data_from_end = test_time_series.iloc[-10:]
data_from_end

Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
127099,13020,SDS011,57.722,11.948,2019-01-01 11:59:41,20.64,3.67
127100,16147,SDS011,59.364,18.018,2019-01-01 11:59:46,3.28,1.8
127101,16153,SDS011,55.648,13.208,2019-01-01 11:57:19,20.0,3.9
127102,16296,SDS011,56.144,13.394,2019-01-01 11:59:59,23.86,7.45
127103,16533,SDS011,55.722,13.202,2019-01-01 11:56:55,18.05,4.33
127104,16723,SDS011,57.736,11.894,2019-01-01 11:58:57,16.47,3.4
127105,16815,SDS011,59.462,18.04,2019-01-01 11:59:36,2.67,1.97
127106,17235,SDS011,59.272,17.78,2019-01-01 11:59:41,4.69,1.82
127107,10588,SDS011,55.676,13.346,2019-01-01 11:57:12,14.08,3.68
127108,10647,SDS011,55.608,13.036,2019-01-01 11:58:42,23.42,4.75


### let's try to make an array where we extend the time range by adding earlier and later times

In [97]:
# stack the earliest rows on top of the mid-file rows (original index labels are kept)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0
middle_array_with_earlier_time = pd.concat( [ data_from_start, data_from_middle ] )

In [98]:
middle_array_with_earlier_time

Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
0,7273,SDS011,60.002,17.846,2018-12-31 11:57:22,3.43,1.56
1,7275,SDS011,57.72,11.888,2018-12-31 11:58:44,482.77,33.82
2,7277,SDS011,59.266,15.23,2018-12-31 11:58:47,5.48,2.47
3,7406,SDS011,56.964,24.128,2018-12-31 11:56:41,11.05,6.62
4,7428,SDS011,59.868,17.624,2018-12-31 11:57:42,1.78,1.02
5,7469,SDS011,56.944,24.142,2018-12-31 11:57:52,8.4,5.54
6,7597,SDS011,59.32,18.064,2018-12-31 11:58:51,3.68,2.0
7,8683,SDS011,59.744,18.206,2018-12-31 11:58:28,3.01,2.8
8,9411,SDS011,59.266,15.23,2018-12-31 11:57:18,3.44,2.18
9,9436,SDS011,59.334,18.034,2018-12-31 11:57:22,2.12,1.9


In [99]:
# add the latest rows underneath, so the frame spans the start, middle and end of the data
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0
middle_array_with_earlier_and_later_added_times = pd.concat( [ middle_array_with_earlier_time, data_from_end ] )
middle_array_with_earlier_and_later_added_times

Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
0,7273,SDS011,60.002,17.846,2018-12-31 11:57:22,3.43,1.56
1,7275,SDS011,57.72,11.888,2018-12-31 11:58:44,482.77,33.82
2,7277,SDS011,59.266,15.23,2018-12-31 11:58:47,5.48,2.47
3,7406,SDS011,56.964,24.128,2018-12-31 11:56:41,11.05,6.62
4,7428,SDS011,59.868,17.624,2018-12-31 11:57:42,1.78,1.02
5,7469,SDS011,56.944,24.142,2018-12-31 11:57:52,8.4,5.54
6,7597,SDS011,59.32,18.064,2018-12-31 11:58:51,3.68,2.0
7,8683,SDS011,59.744,18.206,2018-12-31 11:58:28,3.01,2.8
8,9411,SDS011,59.266,15.23,2018-12-31 11:57:18,3.44,2.18
9,9436,SDS011,59.334,18.034,2018-12-31 11:57:22,2.12,1.9


In [100]:
# now resample the data, such that the time values in between are filled in

In [101]:
# make sure the timestamp column holds datetimes before it becomes the index
# (the rows come from in_data, whose timestamps were already parsed earlier, so this should be a no-op)
middle_array_with_earlier_and_later_added_times['timestamp'] = pd.to_datetime( middle_array_with_earlier_and_later_added_times['timestamp'] )

In [104]:
# move timestamp from a column into the index, since resample() needs a datetime-like index
# (after this the frame no longer has a 'timestamp' column)
middle_array_with_earlier_and_later_added_times = middle_array_with_earlier_and_later_added_times.set_index('timestamp')

In [105]:
middle_array_with_earlier_and_later_added_times.index

DatetimeIndex(['2018-12-31 11:57:22', '2018-12-31 11:58:44',
               '2018-12-31 11:58:47', '2018-12-31 11:56:41',
               '2018-12-31 11:57:42', '2018-12-31 11:57:52',
               '2018-12-31 11:58:51', '2018-12-31 11:58:28',
               '2018-12-31 11:57:18', '2018-12-31 11:57:22',
               '2018-12-31 16:52:43', '2018-12-31 16:52:21',
               '2018-12-31 16:51:17', '2018-12-31 16:51:03',
               '2018-12-31 16:51:10', '2018-12-31 16:50:49',
               '2018-12-31 16:51:35', '2018-12-31 16:51:51',
               '2018-12-31 16:50:46', '2018-12-31 16:51:31',
               '2019-01-01 11:59:41', '2019-01-01 11:59:46',
               '2019-01-01 11:57:19', '2019-01-01 11:59:59',
               '2019-01-01 11:56:55', '2019-01-01 11:58:57',
               '2019-01-01 11:59:36', '2019-01-01 11:59:41',
               '2019-01-01 11:57:12', '2019-01-01 11:58:42'],
              dtype='datetime64[ns]', name='timestamp', freq=None)

In [106]:
# Resample onto a regular 5-minute grid: average the readings in each bin,
# then back-fill bins without readings from the next bin that has data.
# Only numeric columns are kept before averaging: the text column sensor_namee
# has no mean (older pandas dropped it silently, pandas >= 2.0 raises TypeError).
# NOTE(review): the mean is taken across all sensors in a bin, so sensor_id, lat
# and lon are averaged too - resample per sensor if that is not intended.
middle_array_with_earlier_and_later_added_times__RESAMPLED_to_fill_blank_time_intervals = middle_array_with_earlier_and_later_added_times.select_dtypes( include=[ "number" ] ).resample("5Min").mean().bfill()

In [107]:
middle_array_with_earlier_and_later_added_times__RESAMPLED_to_fill_blank_time_intervals

Unnamed: 0_level_0,sensor_id,lat,lon,p1,p2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-12-31 11:55:00,7925.5,58.8428,18.0392,52.516,5.991
2018-12-31 12:00:00,13592.7,58.3782,15.7462,15.282,5.534
2018-12-31 12:05:00,13592.7,58.3782,15.7462,15.282,5.534
2018-12-31 12:10:00,13592.7,58.3782,15.7462,15.282,5.534
2018-12-31 12:15:00,13592.7,58.3782,15.7462,15.282,5.534
2018-12-31 12:20:00,13592.7,58.3782,15.7462,15.282,5.534
2018-12-31 12:25:00,13592.7,58.3782,15.7462,15.282,5.534
2018-12-31 12:30:00,13592.7,58.3782,15.7462,15.282,5.534
2018-12-31 12:35:00,13592.7,58.3782,15.7462,15.282,5.534
2018-12-31 12:40:00,13592.7,58.3782,15.7462,15.282,5.534


In [50]:
# SIDE test : let's try doing the resampling on one column only 

In [51]:
middle_array_with_earlier_and_later_added_times

Unnamed: 0,sensor_id,sensor_namee,lat,lon,timestamp,p1,p2
0,7273,SDS011,60.002,17.846,2018-12-31 11:57:22,3.43,1.56
1,7275,SDS011,57.72,11.888,2018-12-31 11:58:44,482.77,33.82
2,7277,SDS011,59.266,15.23,2018-12-31 11:58:47,5.48,2.47
3,7406,SDS011,56.964,24.128,2018-12-31 11:56:41,11.05,6.62
4,7428,SDS011,59.868,17.624,2018-12-31 11:57:42,1.78,1.02
5,7469,SDS011,56.944,24.142,2018-12-31 11:57:52,8.4,5.54
6,7597,SDS011,59.32,18.064,2018-12-31 11:58:51,3.68,2.0
7,8683,SDS011,59.744,18.206,2018-12-31 11:58:28,3.01,2.8
8,9411,SDS011,59.266,15.23,2018-12-31 11:57:18,3.44,2.18
9,9436,SDS011,59.334,18.034,2018-12-31 11:57:22,2.12,1.9


In [52]:
# Keep only the timestamp and the p1 reading.
# If an earlier cell already moved timestamp into the index, bring it back as a
# column first, so the selection works regardless of cell execution order.
_frame_with_timestamp_column = middle_array_with_earlier_and_later_added_times
if 'timestamp' not in _frame_with_timestamp_column.columns:
    _frame_with_timestamp_column = _frame_with_timestamp_column.reset_index()
# .copy() makes the result an independent frame, so assigning to its columns
# later does not trigger SettingWithCopyWarning
middle_array_with_earlier_and_later_added_times__P1_ONLY = _frame_with_timestamp_column[ [ 'timestamp', 'p1'] ].copy()
middle_array_with_earlier_and_later_added_times__P1_ONLY

Unnamed: 0,timestamp,p1
0,2018-12-31 11:57:22,3.43
1,2018-12-31 11:58:44,482.77
2,2018-12-31 11:58:47,5.48
3,2018-12-31 11:56:41,11.05
4,2018-12-31 11:57:42,1.78
5,2018-12-31 11:57:52,8.4
6,2018-12-31 11:58:51,3.68
7,2018-12-31 11:58:28,3.01
8,2018-12-31 11:57:18,3.44
9,2018-12-31 11:57:22,2.12


In [53]:
# make sure timestamp holds datetimes before indexing on it
# NOTE: pandas can emit SettingWithCopyWarning on this assignment when the frame is a
# column selection of another frame taken without .copy(); copying at selection time avoids it
middle_array_with_earlier_and_later_added_times__P1_ONLY['timestamp'] = pd.to_datetime( middle_array_with_earlier_and_later_added_times__P1_ONLY['timestamp'] )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
# preview only: set_index returns a new frame, which is displayed here but not assigned,
# so the frame itself keeps its original index
middle_array_with_earlier_and_later_added_times__P1_ONLY.set_index('timestamp')

Unnamed: 0_level_0,p1
timestamp,Unnamed: 1_level_1
2018-12-31 11:57:22,3.43
2018-12-31 11:58:44,482.77
2018-12-31 11:58:47,5.48
2018-12-31 11:56:41,11.05
2018-12-31 11:57:42,1.78
2018-12-31 11:57:52,8.4
2018-12-31 11:58:51,3.68
2018-12-31 11:58:28,3.01
2018-12-31 11:57:18,3.44
2018-12-31 11:57:22,2.12


In [55]:
# this attempt fails with the TypeError recorded below: the set_index result in the previous
# cell was never assigned, so the frame still has its integer index instead of a datetime index
middle_array_with_earlier_and_later_added_times__P1_ONLY.resample("5Min")

TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Int64Index'

In [56]:
# this time keep the result: timestamp becomes the index, which is what resample() requires
middle_array_with_earlier_and_later_added_times__P1_ONLY = middle_array_with_earlier_and_later_added_times__P1_ONLY.set_index("timestamp")

In [57]:
middle_array_with_earlier_and_later_added_times__P1_ONLY.shape

(30, 1)

In [62]:
# bin the p1 readings into 5-minute slots and average each slot,
# then fill slots without readings from the next slot that has one
p1_five_minute_bins = middle_array_with_earlier_and_later_added_times__P1_ONLY.resample("5Min")
p1_bin_means = p1_five_minute_bins.mean()
middle_array_with_earlier_and_later_added_times__P1_ONLY__RESAMPLED  = p1_bin_means.bfill()

In [63]:
middle_array_with_earlier_and_later_added_times__P1_ONLY__RESAMPLED.shape

(289, 1)

In [64]:
middle_array_with_earlier_and_later_added_times__P1_ONLY__RESAMPLED

Unnamed: 0_level_0,p1
timestamp,Unnamed: 1_level_1
2018-12-31 11:55:00,52.516
2018-12-31 12:00:00,15.282
2018-12-31 12:05:00,15.282
2018-12-31 12:10:00,15.282
2018-12-31 12:15:00,15.282
2018-12-31 12:20:00,15.282
2018-12-31 12:25:00,15.282
2018-12-31 12:30:00,15.282
2018-12-31 12:35:00,15.282
2018-12-31 12:40:00,15.282


### now let's RETRY working on the bigger dataframe

In [72]:
middle_array_with_earlier_and_later_added_times.dtypes

sensor_id         int64
sensor_namee     object
lat             float64
lon             float64
p1              float64
p2              float64
dtype: object

In [73]:
# fails with the KeyError recorded below: timestamp is no longer a column here (see the dtypes
# listing above) because an earlier set_index('timestamp') already moved it into the index
middle_array_with_earlier_and_later_added_times['timestamp'] = pd.to_datetime( middle_array_with_earlier_and_later_added_times['timestamp'] )

KeyError: 'timestamp'

In [74]:
# fails with the KeyError recorded below for the same reason: timestamp is already the index,
# so there is no 'timestamp' column left to index on
middle_array_with_earlier_and_later_added_times = middle_array_with_earlier_and_later_added_times.set_index('timestamp')

KeyError: 'timestamp'

In [75]:
# the frame is already indexed by timestamp, so resample() can be called on it directly
# NOTE: this only creates the Resampler object; an aggregation such as .mean() still has to be applied to get values
middle_array_with_earlier_and_later_added_times__RESAMPLED = middle_array_with_earlier_and_later_added_times.resample("5Min")