## 1. Import Necessary Libraries

In [1]:
import pandas as pd
from functools import reduce

## 2. Load Datasets
Specify the file paths to your CSV files and load them into DataFrames.


In [2]:
# File paths, relative to the notebook's working directory (update as necessary).
# Forward slashes work on Windows, macOS and Linux alike and avoid the invalid
# '\c' escape sequence that a non-raw, backslash-separated literal contains.
manual_sensors_path = 'data/cleaned_data/cleaned_historic_sensors_data.csv'
weather_data_path = 'data/cleaned_data/cleaned_weather_data.csv'

# Load each CSV with its timestamp column parsed as datetimes and used as the
# index, so the frames satisfy prepare_data()'s DatetimeIndex requirement on a
# fresh kernel.
# NOTE(review): assumes both files store their timestamps in a column named
# 'Time' (the name prepare_data()'s docstring and the merged output use) — confirm.
manual_sensors = pd.read_csv(manual_sensors_path, index_col='Time', parse_dates=True)
weather_data = pd.read_csv(weather_data_path, index_col='Time', parse_dates=True)


## 3. Data Preprocessing
Verify that each DataFrame is indexed by a `DatetimeIndex`, put it on a regular hourly frequency, and fill the resulting gaps with forward fill followed by backward fill.

In [11]:


def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Put a time-indexed DataFrame on a regular hourly grid and fill the gaps.

    The frame is reindexed (``DataFrame.asfreq``) onto hourly steps that start
    at its earliest timestamp and stop at or before its latest one. Rows whose
    timestamp lies on that grid keep their values; grid points with no row
    become NaN and are filled with the previous valid value (forward fill),
    after which any NaNs still left at the start are filled with the next valid
    value (backward fill).

    No aggregation is performed: rows whose timestamp does not fall on the
    hourly grid are dropped, not averaged.

    Assumes that the DataFrame is already indexed by the 'Time' column.

    Parameters:
    - df: DataFrame with 'Time' as the index (must be a DatetimeIndex without
      duplicate timestamps). The input is not modified.

    Returns:
    - A new DataFrame at hourly frequency with missing values filled.

    Raises:
    - TypeError: if the index is not a DatetimeIndex.
    - ValueError: if the index contains duplicate timestamps.
    """
    # Ensure the index is a DateTimeIndex
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("Index must be a DateTimeIndex.")

    # asfreq() reindexes, which pandas refuses to do on duplicate labels; fail
    # here with a message that names the actual problem.
    if df.index.has_duplicates:
        raise ValueError(
            "Index contains duplicate timestamps; remove or aggregate them "
            "before calling prepare_data."
        )

    # Use an explicit offset object instead of the 'H' string alias, which
    # newer pandas releases deprecate (FutureWarning) in favour of 'h'.
    hourly = df.asfreq(pd.offsets.Hour())

    # Forward fill first so gaps take the last known value; the backward fill
    # then only affects NaNs that precede the first valid value.
    return hourly.ffill().bfill()

# Run both datasets through prepare_data (hourly frequency, gaps filled)
manual_sensors, weather_data = map(prepare_data, (manual_sensors, weather_data))


## 4. Concatenate DataFrames
Combine all datasets column-wise on the timestamp index, keeping only the timestamps that are present in every dataset (inner join).

In [15]:
# DataFrames to combine; each is expected to carry the hourly timestamp index
data_frames = [manual_sensors, weather_data]

# Align every frame column-wise on the index in a single call. join='inner'
# keeps only the timestamps present in all frames, which is the same result
# as folding pairwise inner concats over the list.
merged_data = pd.concat(data_frames, axis=1, join='inner')

# Optional: fill any remaining missing values — forward fill first, then
# backward fill for NaNs that precede the first valid value. Reassigning
# (instead of inplace=True) leaves the cell safe to re-run.
merged_data = merged_data.ffill().bfill()

# Save the combined dataset to a new CSV. Forward slashes keep the path valid
# on Windows, macOS and Linux; backslashes are only separators on Windows.
merged_data.to_csv('data/cleaned_data/merged_sensor_data.csv')


## 5. Review the Combined Data
Check the first few rows of the merged DataFrame to ensure it looks correct.

In [16]:
# Preview the top five rows of the merged dataset
merged_data.head(n=5)


Time,target,Temperature (°C),Relative Humidity (%),Precipitation (mm),Wind Speed (km/h),Sunshine Duration (min)
2023-01-01 00:00:00,65.0,6.4,82.0,0.0,9.0,0.0
2023-01-01 01:00:00,104.0,6.0,84.0,0.0,8.3,0.0
2023-01-01 02:00:00,5.0,6.1,83.0,0.0,7.6,0.0
2023-01-01 03:00:00,0.0,6.6,83.0,0.0,7.9,0.0
2023-01-01 04:00:00,1.0,6.4,83.0,0.0,7.6,0.0


### Check for missing values and duplicate timestamps in the merged dataset

In [17]:


# Per-column count of missing entries in the merged frame
missing_values = merged_data.isna().sum()

# Number of index entries that repeat an earlier timestamp
duplicate_index = merged_data.index.duplicated().sum()

# Report only the columns that actually contain missing values
print("Missing values per column:")
print(missing_values.loc[missing_values > 0])

# Report duplicates; when there are any, also list every row that shares a
# repeated timestamp (keep=False marks all occurrences, not just the repeats)
if duplicate_index > 0:
    print(f"\nThere are {duplicate_index} rows with duplicate index values.")
    print("\nDuplicate rows based on the index:")
    print(merged_data.loc[merged_data.index.duplicated(keep=False)])
else:
    print("\nNo duplicate index values found.")


Missing values per column:
Series([], dtype: int64)

No duplicate index values found.
