In [None]:
import numpy as np
import pandas as pd
# Read the 2012 daily temperature table; the 'date' column is parsed as datetimes
merged_df_2012 = pd.read_csv('data/merged_df_2012.csv', parse_dates=['date'])

# Reduce timestamps to plain datetime.date objects so they can be joined
# against the date objects produced for the minute-level table
merged_df_2012['date'] = merged_df_2012['date'].dt.date

# Force temp_min / temp_max to numeric values
for col in ('temp_min', 'temp_max'):
    if col not in merged_df_2012.columns:
        # Column is absent from the file: create it filled with NaN
        merged_df_2012[col] = np.nan
        continue

    # Keep only digits, '.' and '-' in the text form; anything that still
    # fails to parse afterwards becomes NaN
    raw_text = merged_df_2012[col].astype(str)
    cleaned_text = raw_text.str.replace(r'[^0-9\.\-]', '', regex=True)
    merged_df_2012[col] = pd.to_numeric(cleaned_text, errors='coerce')

# Sanity check: table size and the first few rows of the columns used later
print(f"Loaded merged_df_2012: {merged_df_2012.shape}")
print(merged_df_2012[['date', 'temp_min', 'temp_max']].head())


def estimate_temperature(temp_min, temp_max, hour, minute=0, t_peak=15):
    """
    Estimate temperature at a given hour and minute using a sinusoidal model.

    The curve has a 24-hour period, reaches ``temp_max`` exactly at
    ``t_peak`` and ``temp_min`` twelve hours before/after it. Only
    arithmetic and NumPy ufuncs are used, so scalars and array-likes
    (NumPy arrays, pandas Series) are both accepted; NaN inputs yield NaN.

    Parameters:
    - temp_min: daily minimum temperature
    - temp_max: daily maximum temperature
    - hour: hour of day (0-23)
    - minute: minute of hour (0-59)
    - t_peak: hour of peak temperature (default 15:00)

    Returns:
    - Estimated temperature at the given time
    """
    # Convert to decimal hours
    t = hour + minute / 60.0

    t_avg = (temp_min + temp_max) / 2
    amplitude = (temp_max - temp_min) / 2

    # Sinusoidal model: T(t) = T_avg + A * cos(π(t - t_peak)/12)
    # Cosine (not sine) is required so that the maximum falls at t = t_peak;
    # with sine the value at t_peak is T_avg and the peak is delayed by 6 hours.
    temperature = t_avg + amplitude * np.cos(np.pi * (t - t_peak) / 12)

    return temperature

# Build a minute-resolution timeline for calendar year 2012:
# 2012-01-01 00:00 through 2012-12-31 23:59, both endpoints included
start_date = pd.Timestamp('2012-01-01 00:00:00')
end_date = pd.Timestamp('2012-12-31 23:59:00')

# One timestamp per minute between the two endpoints
minute_index = pd.date_range(start_date, end_date, freq='1min')

print(f"Creating dataset with {len(minute_index)} rows")
print(f"Date range: {minute_index[0]} to {minute_index[-1]}")

# Start from the timestamps, then add the calendar date (join key for the
# daily table) and the clock components consumed by the temperature model
resampled_data = pd.DataFrame({'datetime': minute_index})
resampled_data['date'] = minute_index.date
resampled_data['hour'] = minute_index.hour
resampled_data['minute'] = minute_index.minute

# Merge with daily temperature data. The daily table is reduced to one row
# per date first so the left join cannot multiply minute rows; minutes whose
# date has no daily record keep NaN in temp_min / temp_max.
merged_with_temps = resampled_data.merge(
    merged_df_2012[['date', 'temp_min', 'temp_max']].drop_duplicates(subset=['date']),
    on='date',
    how='left'
)

# Apply the sinusoidal temperature model to whole columns at once.
# estimate_temperature uses only arithmetic and NumPy ufuncs, so it accepts
# Series directly; this replaces one Python-level call per minute row
# (several hundred thousand calls) with a single vectorized evaluation.
merged_with_temps['estimated_temp'] = estimate_temperature(
    merged_with_temps['temp_min'],
    merged_with_temps['temp_max'],
    merged_with_temps['hour'],
    merged_with_temps['minute'],
)

# Keep only the timestamp and the modelled temperature
final_data = merged_with_temps.loc[:, ['datetime', 'estimated_temp']].copy()

# Report the size of the result next to the expected minute count for 2012
print(f"\nFinal dataset shape: {final_data.shape}")
print("Expected rows: 527,040")
print(f"Actual rows: {len(final_data)}")

# Preview both ends of the table
print("\nFirst few rows:")
print(final_data.head())
print("\nLast few rows:")
print(final_data.tail())

# Summary statistics of the estimated temperatures
print("\nTemperature statistics:")
print(final_data['estimated_temp'].describe())

# Save the resampled data as CSV, and also as Excel for easy viewing
output_file = 'data/resampled_minute_data.csv'
output_excel = 'data/resampled_minute_data.xlsx'
for destination, write_table in (
    (output_file, final_data.to_csv),
    (output_excel, final_data.to_excel),
):
    write_table(destination, index=False)
    print(f"Saved resampled data to {destination}")

Loaded merged_df_2012: (364, 23)
         date  temp_min  temp_max
0  2012-01-01       3.0       3.4
1  2012-01-02      -0.1       3.4
2  2012-01-03       0.5       3.2
3  2012-01-04       0.5       3.1
4  2012-01-05      -1.4       1.9
Creating dataset with 528480 rows
Date range: 2011-12-31 00:00:00 to 2012-12-31 23:59:00

Final dataset shape: (528480, 2)
Expected rows: 527,040
Actual rows: 528480

First few rows:
             datetime  estimated_temp
0 2011-12-31 00:00:00             NaN
1 2011-12-31 00:01:00             NaN
2 2011-12-31 00:02:00             NaN
3 2011-12-31 00:03:00             NaN
4 2011-12-31 00:04:00             NaN

Last few rows:
                  datetime  estimated_temp
528475 2012-12-31 23:55:00        7.142156
528476 2012-12-31 23:56:00        7.131722
528477 2012-12-31 23:57:00        7.121242
528478 2012-12-31 23:58:00        7.110714
528479 2012-12-31 23:59:00        7.100140

Temperature statistics:
count    524160.000000
mean          7.083379
std    