In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
import pickle

from utilities import fix_date

In [2]:
noaa_df = pickle.load( open( "weather_data.pkl", "rb" ) )

### Fix the dates to match other dataframe

In [4]:
df, dates = fix_date(noaa_df)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49814 entries, 0 to 60793
Columns: 125 entries, index to WindEquipmentChangeDate
dtypes: datetime64[ns](1), float64(67), int64(2), object(55)
memory usage: 47.9+ MB


In [11]:
len(dates)

49750

In [13]:
type(dates[10])

pandas._libs.tslibs.timestamps.Timestamp

In [16]:
type(df.DATE.loc[100])

pandas._libs.tslibs.timestamps.Timestamp

In [17]:
len(df)

49814

In [23]:
df.DATE

0       2015-07-01 00:53:00
1       2015-07-01 01:53:00
2       2015-07-01 02:53:00
3       2015-07-01 03:53:00
4       2015-07-01 04:53:00
                ...        
60789   2021-03-06 17:53:00
60790   2021-03-06 18:53:00
60791   2021-03-06 19:53:00
60792   2021-03-06 20:53:00
60793   2021-03-06 21:53:00
Name: DATE, Length: 49814, dtype: datetime64[ns]

### Test shortdate approach to identify days with more than 24 recordings

**Shortdate approach: slice date, count number of recordings per "shortdate," filter out those without exactly 24 recordings to identify dates with exactly one per hour**

In [29]:
str(df.DATE.loc[100])[:10]

'2015-07-05'

In [30]:
df['SHORTDATE'] = df.DATE.apply(lambda x: str(x)[:10])

In [31]:
df["RECORDING_COUNT"] = df.groupby("SHORTDATE")["SOURCE"].transform('count')

In [32]:
test_df = df[df.RECORDING_COUNT == 24]

In [33]:
test_df.shape

(47784, 127)

In [34]:
test_df.head()

Unnamed: 0,index,STATION,DATE,REPORT_TYPE,SOURCE,AWND,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,...,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,Sunrise,Sunset,TStorms,WindEquipmentChangeDate,SHORTDATE,RECORDING_COUNT
0,0,72793524234,2015-07-01 00:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24
1,1,72793524234,2015-07-01 01:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24
2,2,72793524234,2015-07-01 02:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24
3,3,72793524234,2015-07-01 03:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24
4,4,72793524234,2015-07-01 04:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24


### electricity demand df: test shortdate approach to identify days with more than 24 recordings

In [36]:
e_df = pickle.load( open( "electricity_data.pkl", "rb" ) )

In [37]:
e_df.shape

(49720, 1)

In [38]:
e_df.head()

Unnamed: 0_level_0,DEMAND
DATE,Unnamed: 1_level_1
2015-07-01 08:00:00+00:00,873.0
2015-07-01 09:00:00+00:00,833.0
2015-07-01 10:00:00+00:00,802.0
2015-07-01 11:00:00+00:00,796.0
2015-07-01 12:00:00+00:00,807.0


In [40]:
e_df.reset_index(inplace=True)

In [41]:
e_df['SHORTDATE'] = e_df.DATE.apply(lambda x: str(x)[:10])

In [42]:
e_df["RECORDING_COUNT"] = e_df.groupby("SHORTDATE")["DEMAND"].transform('count')

In [43]:
test_edf = e_df[e_df.RECORDING_COUNT == 24]

In [44]:
test_edf.shape

(49440, 4)

In [45]:
test_edf

Unnamed: 0,DATE,DEMAND,SHORTDATE,RECORDING_COUNT
16,2015-07-02 00:00:00+00:00,1282.0,2015-07-02,24
17,2015-07-02 01:00:00+00:00,1277.0,2015-07-02,24
18,2015-07-02 02:00:00+00:00,1226.0,2015-07-02,24
19,2015-07-02 03:00:00+00:00,1186.0,2015-07-02,24
20,2015-07-02 04:00:00+00:00,1157.0,2015-07-02,24
...,...,...,...,...
49703,2021-03-05 19:00:00+00:00,1276.0,2021-03-05,24
49704,2021-03-05 20:00:00+00:00,1291.0,2021-03-05,24
49705,2021-03-05 21:00:00+00:00,1279.0,2021-03-05,24
49706,2021-03-05 22:00:00+00:00,1246.0,2021-03-05,24


### More values in edf than wdf. Must match edf on unique shortdates from wdf

In [48]:
match = test_df.SHORTDATE.unique().tolist()

In [50]:
test_df.shape

(47784, 127)

In [51]:
test_df.head()

Unnamed: 0,index,STATION,DATE,REPORT_TYPE,SOURCE,AWND,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,...,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,Sunrise,Sunset,TStorms,WindEquipmentChangeDate,SHORTDATE,RECORDING_COUNT
0,0,72793524234,2015-07-01 00:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24
1,1,72793524234,2015-07-01 01:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24
2,2,72793524234,2015-07-01 02:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24
3,3,72793524234,2015-07-01 03:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24
4,4,72793524234,2015-07-01 04:53:00,FM-15,7,,,,,,...,,,,,,,,2007-05-17,2015-07-01,24


In [49]:
test_edf[test_edf.SHORTDATE.isin(match)]

Unnamed: 0,DATE,DEMAND,SHORTDATE,RECORDING_COUNT
16,2015-07-02 00:00:00+00:00,1282.0,2015-07-02,24
17,2015-07-02 01:00:00+00:00,1277.0,2015-07-02,24
18,2015-07-02 02:00:00+00:00,1226.0,2015-07-02,24
19,2015-07-02 03:00:00+00:00,1186.0,2015-07-02,24
20,2015-07-02 04:00:00+00:00,1157.0,2015-07-02,24
...,...,...,...,...
49703,2021-03-05 19:00:00+00:00,1276.0,2021-03-05,24
49704,2021-03-05 20:00:00+00:00,1291.0,2021-03-05,24
49705,2021-03-05 21:00:00+00:00,1279.0,2021-03-05,24
49706,2021-03-05 22:00:00+00:00,1246.0,2021-03-05,24


In [52]:
test_df.shape

(47784, 127)

In [53]:
test_wdf = test_df.copy()

In [54]:
test_edf.shape

(49440, 4)

In [55]:
test_edf.head()

Unnamed: 0,DATE,DEMAND,SHORTDATE,RECORDING_COUNT
16,2015-07-02 00:00:00+00:00,1282.0,2015-07-02,24
17,2015-07-02 01:00:00+00:00,1277.0,2015-07-02,24
18,2015-07-02 02:00:00+00:00,1226.0,2015-07-02,24
19,2015-07-02 03:00:00+00:00,1186.0,2015-07-02,24
20,2015-07-02 04:00:00+00:00,1157.0,2015-07-02,24


In [56]:
test_edf[test_edf.RECORDING_COUNT != 24]

Unnamed: 0,DATE,DEMAND,SHORTDATE,RECORDING_COUNT


In [60]:
test_edf.shape

(49440, 4)

In [57]:
test_wdf.shape

(47784, 127)

In [59]:
test_wdf[test_wdf.RECORDING_COUNT != 24]

Unnamed: 0,index,STATION,DATE,REPORT_TYPE,SOURCE,AWND,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,...,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,Sunrise,Sunset,TStorms,WindEquipmentChangeDate,SHORTDATE,RECORDING_COUNT


In [64]:
edf = test_edf[test_edf.SHORTDATE.isin(match)]

In [65]:
match = edf.SHORTDATE.unique().tolist()

In [66]:
wdf = test_wdf[test_wdf.SHORTDATE.isin(match)]

In [67]:
edf.shape

(47424, 4)

In [68]:
wdf.shape

(47424, 127)

### Now that both weather and electricity dataframes have exactly one recording per hour and 24 recordings per day, we can combine the dataframes into our final df

In [70]:
wdf['DEMAND'] = edf.DEMAND

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wdf['DEMAND'] = edf.DEMAND


In [74]:
wdf.drop(columns=['index', 'DEMAND'], inplace=True)

KeyError: "['index' 'DEMAND'] not found in axis"

In [77]:
wdf.reset_index(inplace=True, drop=True)

In [79]:
edf.reset_index(inplace=True, drop=True)

In [81]:
wdf.shape[0] == edf.shape[0]

True

In [82]:
wdf

Unnamed: 0,STATION,DATE,REPORT_TYPE,SOURCE,AWND,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,...,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,Sunrise,Sunset,TStorms,WindEquipmentChangeDate,SHORTDATE,RECORDING_COUNT
0,72793524234,2015-07-02 00:53:00,FM-15,7,,,,,,,...,,,,,,,,2007-05-17,2015-07-02,24
1,72793524234,2015-07-02 01:53:00,FM-15,7,,,,,,,...,,,,,,,,2007-05-17,2015-07-02,24
2,72793524234,2015-07-02 02:53:00,FM-15,7,,,,,,,...,,,,,,,,2007-05-17,2015-07-02,24
3,72793524234,2015-07-02 03:53:00,FM-15,7,,,,,,,...,,,,,,,,2007-05-17,2015-07-02,24
4,72793524234,2015-07-02 04:53:00,FM-15,7,,,,,,,...,,,,,,,,2007-05-17,2015-07-02,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47419,72793524234,2021-03-05 19:53:00,FM-15,7,,,,,,,...,,,,,,,,2007-05-17,2021-03-05,24
47420,72793524234,2021-03-05 20:53:00,FM-15,7,,,,,,,...,,,,,,,,2007-05-17,2021-03-05,24
47421,72793524234,2021-03-05 21:53:00,FM-15,7,,,,,,,...,,,,,,,,2007-05-17,2021-03-05,24
47422,72793524234,2021-03-05 22:53:00,FM-15,7,,,,,,,...,,,,,,,,2007-05-17,2021-03-05,24


In [84]:
wdf['DEMAND'] = edf.DEMAND.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wdf['DEMAND'] = edf.DEMAND.copy()


In [85]:
wdf

Unnamed: 0,STATION,DATE,REPORT_TYPE,SOURCE,AWND,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,...,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,Sunrise,Sunset,TStorms,WindEquipmentChangeDate,SHORTDATE,RECORDING_COUNT,DEMAND
0,72793524234,2015-07-02 00:53:00,FM-15,7,,,,,,,...,,,,,,,2007-05-17,2015-07-02,24,1282.0
1,72793524234,2015-07-02 01:53:00,FM-15,7,,,,,,,...,,,,,,,2007-05-17,2015-07-02,24,1277.0
2,72793524234,2015-07-02 02:53:00,FM-15,7,,,,,,,...,,,,,,,2007-05-17,2015-07-02,24,1226.0
3,72793524234,2015-07-02 03:53:00,FM-15,7,,,,,,,...,,,,,,,2007-05-17,2015-07-02,24,1186.0
4,72793524234,2015-07-02 04:53:00,FM-15,7,,,,,,,...,,,,,,,2007-05-17,2015-07-02,24,1157.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47419,72793524234,2021-03-05 19:53:00,FM-15,7,,,,,,,...,,,,,,,2007-05-17,2021-03-05,24,1276.0
47420,72793524234,2021-03-05 20:53:00,FM-15,7,,,,,,,...,,,,,,,2007-05-17,2021-03-05,24,1291.0
47421,72793524234,2021-03-05 21:53:00,FM-15,7,,,,,,,...,,,,,,,2007-05-17,2021-03-05,24,1279.0
47422,72793524234,2021-03-05 22:53:00,FM-15,7,,,,,,,...,,,,,,,2007-05-17,2021-03-05,24,1246.0


In [86]:
with open('seattle_data.pkl', 'wb') as picklefile:
    pickle.dump(wdf, picklefile)