In [2]:
# Import statements

import pandas as pd
import matplotlib.pyplot as plt

# Inspection and Strategy

## 1: Load and Inspect datasets

In [3]:
electricity_df = pd.read_csv("electricity_data_germany.csv", sep = ";", low_memory = False)
weather_df = pd.read_csv("weather_data_germany.csv", sep = ",", low_memory = False)

# low_memory = False in pd.read_csv controls how Pandas handles data types when reading a large file
# Pandas defaultly infers column data types in chunks to save memory, causing mixed data types
# this forces Pandas to read the entire file first before deciding on data types

In [24]:
print(electricity_df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 175167 entries, 2015-01-01 07:15:00+00:00 to 2019-12-30 22:45:00+00:00
Data columns (total 15 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   cet_cest_timestamp                    175167 non-null  object 
 1   DE_load_actual_entsoe_transparency    175167 non-null  object 
 2   DE_load_forecast_entsoe_transparency  175071 non-null  object 
 3   DE_solar_capacity                     175167 non-null  int64  
 4   DE_solar_generation_actual            174783 non-null  object 
 5   DE_solar_profile                      174783 non-null  float64
 6   DE_wind_capacity                      175167 non-null  int64  
 7   DE_wind_generation_actual             174869 non-null  object 
 8   DE_wind_profile                       174869 non-null  float64
 9   DE_wind_offshore_capacity             175167 non-null  int64  
 10  DE_wind_offshore_gener

Observations:
Missing values in DE_load_forecast_entsoe_transparency, DE_solar_generation_actual, and DE_wind_generation_actual

Can ignore CET timestamp, stick to UCT (similar to weather data which needs to be merged)
Forecast transparency is a row that needs to be removed - cannot be a feature! But can we kept for later (when comparing our predictions)

Starts in 2014, so make sure the overlap works

In [25]:
missing_values = electricity_df.isnull().sum()
print(missing_values[missing_values > 0])

DE_load_forecast_entsoe_transparency     96
DE_solar_generation_actual              384
DE_solar_profile                        384
DE_wind_generation_actual               298
DE_wind_profile                         298
DE_wind_offshore_generation_actual      298
DE_wind_offshore_profile                298
DE_wind_onshore_generation_actual       288
DE_wind_onshore_profile                 288
dtype: int64


In [26]:
missing_rows = electricity_df[electricity_df.isnull().any(axis = 1)]
print(missing_rows.index.tolist())

[Timestamp('2015-02-28 00:00:00+0000', tz='UTC'), Timestamp('2015-02-28 00:15:00+0000', tz='UTC'), Timestamp('2015-02-28 00:30:00+0000', tz='UTC'), Timestamp('2015-02-28 00:45:00+0000', tz='UTC'), Timestamp('2015-02-28 01:00:00+0000', tz='UTC'), Timestamp('2015-02-28 01:15:00+0000', tz='UTC'), Timestamp('2015-02-28 01:30:00+0000', tz='UTC'), Timestamp('2015-02-28 01:45:00+0000', tz='UTC'), Timestamp('2015-02-28 02:00:00+0000', tz='UTC'), Timestamp('2015-02-28 02:15:00+0000', tz='UTC'), Timestamp('2015-02-28 02:30:00+0000', tz='UTC'), Timestamp('2015-02-28 02:45:00+0000', tz='UTC'), Timestamp('2015-02-28 03:00:00+0000', tz='UTC'), Timestamp('2015-02-28 03:15:00+0000', tz='UTC'), Timestamp('2015-02-28 03:30:00+0000', tz='UTC'), Timestamp('2015-02-28 03:45:00+0000', tz='UTC'), Timestamp('2015-02-28 04:00:00+0000', tz='UTC'), Timestamp('2015-02-28 04:15:00+0000', tz='UTC'), Timestamp('2015-02-28 04:30:00+0000', tz='UTC'), Timestamp('2015-02-28 04:45:00+0000', tz='UTC'), Timestamp('2015-02-

No solar generation (actual and profile) reported for the 28th of February, 2015
No solar (2x), wind (2x on shore and off shore) reported on 2016-06-01 and 2016-10-27 and 2016-12-09
No wind reported on 2016-11-03 (couple of hours)

How is the target variable behaving here? how can we impute (careful of cycles!)

No forecasted load reported on 2018-09-24 (not so important)

Target variable is available throughout consistently (yay!)

In [7]:
print(weather_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52584 entries, 0 to 52583
Data columns (total 4 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   utc_timestamp                    52584 non-null  object 
 1   DE_temperature                   52584 non-null  float64
 2   DE_radiation_direct_horizontal   52584 non-null  float64
 3   DE_radiation_diffuse_horizontal  52584 non-null  float64
dtypes: float64(3), object(1)
memory usage: 1.6+ MB
None


Appears complete with no missing values, but there are 0s between 16h and 6h every 24h cycle in the dataset for the two radiation features. How can I still use it, and get it to recognize them as non-null values? Or should I assume there is no radiation during those times (unlikely)

In [8]:
print(electricity_df.head())

          utc_timestamp        cet_cest_timestamp  \
0  2015-01-01T07:15:00Z  2015-01-01T08:15:00+0100   
1  2015-01-01T07:30:00Z  2015-01-01T08:30:00+0100   
2  2015-01-01T07:45:00Z  2015-01-01T08:45:00+0100   
3  2015-01-01T08:00:00Z  2015-01-01T09:00:00+0100   
4  2015-01-01T08:15:00Z  2015-01-01T09:15:00+0100   

  DE_load_actual_entsoe_transparency DE_load_forecast_entsoe_transparency  \
0                            40998.2                             42497.11   
1                            41120.9                             42458.48   
2                           41476.39                             43085.68   
3                            42120.4                             44127.67   
4                           42624.45                              44906.5   

   DE_solar_capacity DE_solar_generation_actual  DE_solar_profile  \
0              37248                      14.18            0.0004   
1              37248                      49.02            0.0013   
2          

In [9]:
print(weather_df.head())

          utc_timestamp  DE_temperature  DE_radiation_direct_horizontal  \
0  2014-01-01T00:00:00Z          -0.568                             0.0   
1  2014-01-01T01:00:00Z          -0.509                             0.0   
2  2014-01-01T02:00:00Z          -0.436                             0.0   
3  2014-01-01T03:00:00Z          -0.257                             0.0   
4  2014-01-01T04:00:00Z          -0.082                             0.0   

   DE_radiation_diffuse_horizontal  
0                              0.0  
1                              0.0  
2                              0.0  
3                              0.0  
4                              0.0  


## 2: Check Timestamp Formats

In [13]:
print("Electricity Time Range")
print(electricity_df['utc_timestamp'].sort_values().head())
print(electricity_df['utc_timestamp'].sort_values().tail())

print("Weather Time Range")
print(weather_df["utc_timestamp"].sort_values().head())
print(weather_df["utc_timestamp"].sort_values().tail())

# sorting just in case, so we can see the earliest and latest entries

Electricity Time Range
0   2015-01-01 07:15:00+00:00
1   2015-01-01 07:30:00+00:00
2   2015-01-01 07:45:00+00:00
3   2015-01-01 08:00:00+00:00
4   2015-01-01 08:15:00+00:00
Name: utc_timestamp, dtype: datetime64[ns, UTC]
175162   2019-12-30 21:45:00+00:00
175163   2019-12-30 22:00:00+00:00
175164   2019-12-30 22:15:00+00:00
175165   2019-12-30 22:30:00+00:00
175166   2019-12-30 22:45:00+00:00
Name: utc_timestamp, dtype: datetime64[ns, UTC]
Weather Time Range
0   2014-01-01 00:00:00+00:00
1   2014-01-01 01:00:00+00:00
2   2014-01-01 02:00:00+00:00
3   2014-01-01 03:00:00+00:00
4   2014-01-01 04:00:00+00:00
Name: utc_timestamp, dtype: datetime64[ns, UTC]
52579   2019-12-31 19:00:00+00:00
52580   2019-12-31 20:00:00+00:00
52581   2019-12-31 21:00:00+00:00
52582   2019-12-31 22:00:00+00:00
52583   2019-12-31 23:00:00+00:00
Name: utc_timestamp, dtype: datetime64[ns, UTC]


Observations:

Datasets use ISO 8601 timestamps

Coverage: electricity Jan 1 2015 to Dec 30, 2019; weather Jan 1 2014 to Dec 31, 2019 (WEATHER DATA FULLY OVERLAPS!)

In [11]:
# Conversion to Datetime Format
# This will help later, because right now they are just stored as strings. With datetime format, some additional operations can be done for merging + upsampling

electricity_df['utc_timestamp'] = pd.to_datetime(electricity_df['utc_timestamp'], errors = "coerce") # coercing errors safely converts invalid formats to NaT (not a time) for future catching
weather_df['utc_timestamp'] = pd.to_datetime(weather_df['utc_timestamp'], errors = "coerce")

print("Checking types")
print(electricity_df['utc_timestamp'].dtype)
print(weather_df['utc_timestamp'].dtype)
print()

print("Checking Missing Timestamps")
print(electricity_df['utc_timestamp'].isna().sum())
print(weather_df['utc_timestamp'].isna().sum())

Checking types
datetime64[ns, UTC]
datetime64[ns, UTC]

Checking Missing Timestamps
0
0


Proper format and no failed conversion! Can now to resampling, merging and feature engineering

## 3: Missing Values

In [12]:
# Investigating Radiation Zeros

# Count and percentage of zeros in radiation columns
for col in ["DE_radiation_direct_horizontal", "DE_radiation_diffuse_horizontal"]:
    zero_count = (weather_df[col] == 0).sum()
    total_count = len(weather_df)
    percent = zero_count / total_count * 100
    print(f"{col}: {zero_count} zeros ({percent:.2f}%)")

print()
# When zeros occur:
weather_df['hour'] = weather_df['utc_timestamp'].dt.hour

for col in ['DE_radiation_direct_horizontal', 'DE_radiation_diffuse_horizontal']:
    print(f"\nZero Distribution by hour for {col}:")
    zero_hours = weather_df[weather_df[col] == 0]['hour'].value_counts().sort_index()
    print(zero_hours)

DE_radiation_direct_horizontal: 22850 zeros (43.45%)
DE_radiation_diffuse_horizontal: 22497 zeros (42.78%)


Zero Distribution by hour for DE_radiation_direct_horizontal:
hour
0     2191
1     2191
2     1920
3     1471
4     1108
5      742
6      253
16     459
17     874
18    1243
19    1634
20    2191
21    2191
22    2191
23    2191
Name: count, dtype: int64

Zero Distribution by hour for DE_radiation_diffuse_horizontal:
hour
0     2191
1     2191
2     1851
3     1442
4     1085
5      709
6      170
16     413
17     850
18    1225
19    1606
20    2191
21    2191
22    2191
23    2191
Name: count, dtype: int64


Basically this means:

About 43% of rows are zero in both radiation columns, which is about 12 hours of darkness per day (on average)

Radiation is consistently 0 from 20:00 to 6:00, and the transition periods have a tapering pattern (sunrise/sunset). Midday hours (7:00 to 17:00) almost never contain zeros

Therefore:

Don't treat 0s as missing

Add a binary is_daylight column (therefore when radiation is 0, is because it's night, not cloudy, and can help learn conditional patterns )

In [14]:
# Adding a binary column

weather_df['is_daylight'] = weather_df['hour'].between(6, 19).astype(int)

weather_df[100:105]


Unnamed: 0,utc_timestamp,DE_temperature,DE_radiation_direct_horizontal,DE_radiation_diffuse_horizontal,hour,is_daylight
100,2014-01-05 04:00:00+00:00,2.92,0.0,0.0,4,0
101,2014-01-05 05:00:00+00:00,2.954,0.0,0.0,5,0
102,2014-01-05 06:00:00+00:00,2.965,0.0,0.0,6,1
103,2014-01-05 07:00:00+00:00,2.983,0.078,3.9846,7,1
104,2014-01-05 08:00:00+00:00,3.355,0.8933,29.2558,8,1


IMPORTANT FOR FUTURE! WHEN NORMALIZING, ONLY NORMALIZE IF IS_DAYLIGHT IS 1, THIS WAY IT WON'T GET DISTRACTED BY TOO MANY ZEROS!

In [None]:
# Dealing with missing values strategy (for the future! can do it here)

# option 1: drop rows with missing values

#electricity_cleaned = electricity_df.dropna()
# Easy, no imputation, but could drop rare time windows

# option 2: forward fill, using most revent previous value, using time-series logic

#electricity_filled = electricity_df.fillna(method = 'ffill')
# time-consistent, less usesful for daily cycles (solar)

# option 3: feature-wise interpolation or median fill

#electricity_median = electricity_df.fillna(electricity_df.median(numeric_only = True))
#electricity_interp = electricity_df.interpolate(method = 'time')

Preprocessing logic below: change to python and run if using: (but rethink with the hint below, before executing)

Reminder: the missing values in question are solar & wind. These might need to be handled separately, because one is daily cycles (solar).

## 4: Outlier & Distribution Analysis

In [17]:
# Summary Statistics

display(electricity_df.describe())
display(weather_df.describe())

display(electricity_df.quantile([0.01, 0.99], numeric_only=True))
display(weather_df.quantile([0.01, 0.99], numeric_only=True))

Unnamed: 0,DE_solar_capacity,DE_solar_profile,DE_wind_capacity,DE_wind_profile,DE_wind_offshore_capacity,DE_wind_offshore_profile,DE_wind_onshore_capacity,DE_wind_onshore_profile
count,175167.0,174783.0,175167.0,174869.0,175167.0,174869.0,175167.0,174879.0
mean,42378.981566,0.101903,39974.878687,0.28884,3261.512779,2.238958,36713.377965,0.264938
std,4306.18478,0.156266,7261.521015,0.389282,1358.838591,4.135196,5960.534776,0.398901
min,37248.0,0.0,27913.0,0.0033,667.0,0.0,27246.0,0.003
25%,38810.0,0.0,33737.0,0.1146,2219.0,0.2063,31519.0,0.0974
50%,40941.0,0.0018,39808.0,0.221,3115.0,0.5387,36693.0,0.1918
75%,46092.0,0.1631,47730.0,0.3895,4486.0,0.8999,43243.0,0.3517
max,50508.0,0.6904,50452.0,10.862,5742.0,14.988,44710.0,11.023


Unnamed: 0,DE_temperature,DE_radiation_direct_horizontal,DE_radiation_diffuse_horizontal,hour,is_daylight
count,52584.0,52584.0,52584.0,52584.0,52584.0
mean,9.921879,82.32565,67.377184,11.5,0.583333
std,7.986186,158.118363,89.422665,6.922252,0.493011
min,-12.686,0.0,0.0,0.0,0.0
25%,3.43475,0.0,0.0,5.75,0.0
50%,9.4345,0.41825,6.2939,11.5,1.0
75%,15.987,78.961775,120.6257,17.25,1.0
max,35.479,841.681,393.5256,23.0,1.0


Unnamed: 0,DE_solar_capacity,DE_solar_profile,DE_wind_capacity,DE_wind_profile,DE_wind_offshore_capacity,DE_wind_offshore_profile,DE_wind_onshore_capacity,DE_wind_onshore_profile
0.01,37280.0,0.0,28016.0,0.0184,683.0,0.0072,27334.0,0.0149
0.99,50478.0,0.5767,50329.0,0.885332,5723.0,13.61332,44607.0,0.8739


Unnamed: 0,DE_temperature,DE_radiation_direct_horizontal,DE_radiation_diffuse_horizontal,hour,is_daylight
0.01,-5.55551,0.0,0.0,0.0,0.0
0.99,28.04651,668.034088,337.309482,23.0,1.0


No outliers in solar profile or capacity (itghtly clustered)

Wind features have very suspicious max values -- might want to flag or clip this, but remember that when doing so, you could lose some predictive power?

#electricity_df['DE_wind_profile'] = electricity_df['DE_wind_profile'].clip(upper=0.885)

Weather data has long tails, skewed but natural (log transform)

In [19]:
electricity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175167 entries, 0 to 175166
Data columns (total 16 columns):
 #   Column                                Non-Null Count   Dtype              
---  ------                                --------------   -----              
 0   utc_timestamp                         175167 non-null  datetime64[ns, UTC]
 1   cet_cest_timestamp                    175167 non-null  object             
 2   DE_load_actual_entsoe_transparency    175167 non-null  object             
 3   DE_load_forecast_entsoe_transparency  175071 non-null  object             
 4   DE_solar_capacity                     175167 non-null  int64              
 5   DE_solar_generation_actual            174783 non-null  object             
 6   DE_solar_profile                      174783 non-null  float64            
 7   DE_wind_capacity                      175167 non-null  int64              
 8   DE_wind_generation_actual             174869 non-null  object             
 9   DE_w

In [None]:
# Visualize: for outlier detection

## 5: Merging

In [28]:
# Fix for electricity_df
if electricity_df.index.name != 'utc_timestamp':
    if 'utc_timestamp' in electricity_df.columns:
        electricity_df = electricity_df.set_index('utc_timestamp')
    else:
        raise ValueError("utc_timestamp not found in columns or index of electricity_df")

electricity_df = electricity_df.sort_index()

# Fix for weather_df
if weather_df.index.name != 'utc_timestamp':
    if 'utc_timestamp' in weather_df.columns:
        weather_df = weather_df.set_index('utc_timestamp')
    else:
        raise ValueError("utc_timestamp not found in columns or index of weather_df")

weather_df = weather_df.sort_index()


In [40]:
# Trim weather data to the time range of electricity data
start_time = electricity_df.index.min()
end_time = electricity_df.index.max()
weather_df = weather_df.loc[(weather_df.index >= start_time) & (weather_df.index <= end_time)]

# Reindex weather to match 15-min electricity timestamps
weather_aligned = weather_df.reindex(electricity_df.index)

# Merge both
merged_df = electricity_df.join(weather_aligned, how='left')
merged_df = merged_df.sort_index()

# Columns to fill
cols_to_fill = ['DE_temperature', 'DE_radiation_direct_horizontal', 'DE_radiation_diffuse_horizontal', 'hour']

# Option 1: Interpolated version
merged_interp = merged_df.copy()
merged_interp[cols_to_fill] = merged_interp[cols_to_fill].interpolate(method='time')

# Option 2: Forward-fill version
merged_ffill = merged_df.copy()
merged_ffill[cols_to_fill] = merged_ffill[cols_to_fill].fillna(method='ffill')

# Optional: Handle leading NaNs with bfill
merged_ffill[cols_to_fill] = merged_ffill[cols_to_fill].fillna(method='bfill')
merged_interp[cols_to_fill] = merged_interp[cols_to_fill].fillna(method='bfill')



  merged_ffill[cols_to_fill] = merged_ffill[cols_to_fill].fillna(method='ffill')
  merged_ffill[cols_to_fill] = merged_ffill[cols_to_fill].fillna(method='bfill')
  merged_interp[cols_to_fill] = merged_interp[cols_to_fill].fillna(method='bfill')


In [41]:
merged_ffill.head()

Unnamed: 0_level_0,cet_cest_timestamp,DE_load_actual_entsoe_transparency,DE_load_forecast_entsoe_transparency,DE_solar_capacity,DE_solar_generation_actual,DE_solar_profile,DE_wind_capacity,DE_wind_generation_actual,DE_wind_profile,DE_wind_offshore_capacity,DE_wind_offshore_generation_actual,DE_wind_offshore_profile,DE_wind_onshore_capacity,DE_wind_onshore_generation_actual,DE_wind_onshore_profile,DE_temperature,DE_radiation_direct_horizontal,DE_radiation_diffuse_horizontal,hour
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-01 07:15:00+00:00,2015-01-01T08:15:00+0100,40998.2,42497.11,37248,14.18,0.0004,27913,10433.26,0.3738,667,523.96,0.7855,27246,9909.3,0.3637,-1.046,8.8773,51.9464,8.0
2015-01-01 07:30:00+00:00,2015-01-01T08:30:00+0100,41120.9,42458.48,37248,49.02,0.0013,27913,10052.55,0.3601,667,525.52,0.7879,27246,9527.03,0.3497,-1.046,8.8773,51.9464,8.0
2015-01-01 07:45:00+00:00,2015-01-01T08:45:00+0100,41476.39,43085.68,37248,149.14,0.004,27913,9962.65,0.3569,667,527.33,0.7906,27246,9435.32,0.3463,-1.046,8.8773,51.9464,8.0
2015-01-01 08:00:00+00:00,2015-01-01T09:00:00+0100,42120.4,44127.67,37248,340.85,0.0092,27913,9867.04,0.3535,667,527.28,0.7905,27246,9339.76,0.3428,-1.046,8.8773,51.9464,8.0
2015-01-01 08:15:00+00:00,2015-01-01T09:15:00+0100,42624.45,44906.5,37248,572.81,0.0154,27913,10067.22,0.3607,667,527.38,0.7907,27246,9539.84,0.3501,-1.046,8.8773,51.9464,8.0


In [42]:
# show rows 5-10 of merged_df
display(merged_ffill.iloc[0:10])

Unnamed: 0_level_0,cet_cest_timestamp,DE_load_actual_entsoe_transparency,DE_load_forecast_entsoe_transparency,DE_solar_capacity,DE_solar_generation_actual,DE_solar_profile,DE_wind_capacity,DE_wind_generation_actual,DE_wind_profile,DE_wind_offshore_capacity,DE_wind_offshore_generation_actual,DE_wind_offshore_profile,DE_wind_onshore_capacity,DE_wind_onshore_generation_actual,DE_wind_onshore_profile,DE_temperature,DE_radiation_direct_horizontal,DE_radiation_diffuse_horizontal,hour
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-01 07:15:00+00:00,2015-01-01T08:15:00+0100,40998.2,42497.11,37248,14.18,0.0004,27913,10433.26,0.3738,667,523.96,0.7855,27246,9909.3,0.3637,-1.046,8.8773,51.9464,8.0
2015-01-01 07:30:00+00:00,2015-01-01T08:30:00+0100,41120.9,42458.48,37248,49.02,0.0013,27913,10052.55,0.3601,667,525.52,0.7879,27246,9527.03,0.3497,-1.046,8.8773,51.9464,8.0
2015-01-01 07:45:00+00:00,2015-01-01T08:45:00+0100,41476.39,43085.68,37248,149.14,0.004,27913,9962.65,0.3569,667,527.33,0.7906,27246,9435.32,0.3463,-1.046,8.8773,51.9464,8.0
2015-01-01 08:00:00+00:00,2015-01-01T09:00:00+0100,42120.4,44127.67,37248,340.85,0.0092,27913,9867.04,0.3535,667,527.28,0.7905,27246,9339.76,0.3428,-1.046,8.8773,51.9464,8.0
2015-01-01 08:15:00+00:00,2015-01-01T09:15:00+0100,42624.45,44906.5,37248,572.81,0.0154,27913,10067.22,0.3607,667,527.38,0.7907,27246,9539.84,0.3501,-1.046,8.8773,51.9464,8.0
2015-01-01 08:30:00+00:00,2015-01-01T09:30:00+0100,43238.67,45320.58,37248,865.88,0.0232,27913,10133.99,0.3631,667,527.46,0.7908,27246,9606.53,0.3526,-1.046,8.8773,51.9464,8.0
2015-01-01 08:45:00+00:00,2015-01-01T09:45:00+0100,43869.42,45725.57,37248,1312.92,0.0352,27913,10047.66,0.36,667,527.4,0.7907,27246,9520.26,0.3494,-1.046,8.8773,51.9464,8.0
2015-01-01 09:00:00+00:00,2015-01-01T10:00:00+0100,44285.28,46159.55,37248,1623.0,0.0436,27913,10208.11,0.3657,667,527.49,0.7908,27246,9680.62,0.3553,0.072,34.1583,97.0929,9.0
2015-01-01 09:15:00+00:00,2015-01-01T10:15:00+0100,44809.3,46744.9,37248,1890.12,0.0507,27913,10399.95,0.3726,667,525.01,0.7871,27246,9874.94,0.3624,0.072,34.1583,97.0929,9.0
2015-01-01 09:30:00+00:00,2015-01-01T10:30:00+0100,45369.58,47555.38,37248,2228.79,0.0598,27913,10682.37,0.3827,667,522.12,0.7828,27246,10160.25,0.3729,0.072,34.1583,97.0929,9.0


In [None]:
# For merged CSV (visualization of file)

#merged_ffill.to_csv("merged_dataset.csv", index=True)
#ran earlier, now showed up!