In [4]:
import numpy as np
import pandas as pd

In [5]:
# Read the cleaned per-year weather exports and the plant location table.
# Every weather file is read with the 'STATION' column forced to pandas' string dtype.
station_dtype = {'STATION': 'string'}

weather_data_2019 = pd.read_csv("../Cleaning/Weather_MI_2019_cleaned.csv", dtype=station_dtype)
weather_data_2020 = pd.read_csv("../Cleaning/Weather_MI_2020_cleaned.csv", dtype=station_dtype)
weather_data_2021 = pd.read_csv("../Cleaning/Weather_MI_2021_cleaned.csv", dtype=station_dtype)
weather_data_2022 = pd.read_csv("../Cleaning/Weather_MI_2022_cleaned.csv", dtype=station_dtype)
weather_data_2023 = pd.read_csv("../Cleaning/Weather_MI_2023_cleaned.csv", dtype=station_dtype)

# Stack the yearly frames in chronological order; ignore_index=True discards the
# per-file row labels and gives the combined frame a fresh 0..n-1 index.
yearly_weather_frames = [
    weather_data_2019,
    weather_data_2020,
    weather_data_2021,
    weather_data_2022,
    weather_data_2023,
]
weather_data = pd.concat(yearly_weather_frames, ignore_index=True)

# Plant table is read with pandas' default dtype inference.
power_plants_data = pd.read_csv("../Cleaning/Plant_Location_MI.csv")

In [6]:
# Convert the 'DATE' column to datetime format in weather_data.
# The column is parsed once: unparseable values become NaT (errors='coerce') and
# those rows are dropped.
parsed_dates = pd.to_datetime(weather_data['DATE'], errors='coerce')
valid_date_mask = parsed_dates.notna()
# .copy() makes the filtered frame independent of the original, so the column
# assignments below write to a real frame rather than to a slice
# (this avoids SettingWithCopyWarning / silently lost writes).
weather_data = weather_data.loc[valid_date_mask].copy()
weather_data['DATE'] = parsed_dates[valid_date_mask]

# 'period' is parsed as year-month, which yields the first day of each month.
power_plants_data['period'] = pd.to_datetime(power_plants_data['period'], format='%Y-%m')
# Keep records from 2019 onward whose primeMover is "ALL", then drop every row
# that has a missing value in any column. The trailing .copy() serves the same
# purpose as above for the rounding assignments that follow.
recent_period = power_plants_data['period'] >= pd.to_datetime('2019-01-01')
all_prime_movers = power_plants_data['primeMover'] == "ALL"
power_plants_data = power_plants_data[recent_period & all_prime_movers].dropna().copy()

# Round latitude and longitude columns for better matching
weather_data['LATITUDE'] = weather_data['LATITUDE'].round(4)
weather_data['LONGITUDE'] = weather_data['LONGITUDE'].round(4)
power_plants_data['Latitude'] = power_plants_data['Latitude'].round(4)
power_plants_data['Longitude'] = power_plants_data['Longitude'].round(4)

power_plants_data.head(5)

Unnamed: 0,period,plantCode,plantName,fuel2002,fuelTypeDescription,state,stateDescription,primeMover,total-consumption,total-consumption-units,...,NG_MW,Nuclear_MW,Crude_MW,Solar_MW,Wind_MW,Other_MW,Source,Period,Longitude,Latitude
0,2020-01-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,84432,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.6192,43.2664
1,2020-12-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,11343,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.6192,43.2664
2,2023-08-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,109965,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.6192,43.2664
3,2023-04-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,12548,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.6192,43.2664
4,2023-02-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,96587,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.6192,43.2664


In [8]:
# Reduce the weather records to one row per (station coordinates, month).
# Only the coordinate, date and measurement columns listed here are carried forward.
columns_to_aggregate = [
    'LATITUDE', 'LONGITUDE', 'DATE',
    'TEMP', 'WDSP', 'MXSPD', 'GUST',
    'MAX', 'MIN', 'PRCP', 'SNDP',
]
weather_subset = weather_data[columns_to_aggregate]

# freq='MS' buckets each DATE to the start of its month, which lines the weather
# rows up with the month-start 'period' values of the plant data.
month_start_bucket = pd.Grouper(key='DATE', freq='MS')
monthly_station_groups = weather_subset.groupby(['LATITUDE', 'LONGITUDE', month_start_bucket])

# agg(['mean']) produces two-level column labels (measurement, 'mean');
# after reset_index the second level is dropped to get flat column names back.
monthly_means = monthly_station_groups.agg(['mean'])
aggregated_weather_data = monthly_means.reset_index().droplevel(1, axis=1)

aggregated_weather_data.head(5)


Unnamed: 0,LATITUDE,LONGITUDE,DATE,TEMP,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP
0,41.2747,-85.84,2022-01-01,22.132258,6.619355,13.222581,16.164516,29.967742,14.754839,0.0,0.0
1,41.2747,-85.84,2022-02-01,27.142857,7.207143,14.192857,18.492857,35.6,19.85,0.0,0.0
2,41.2747,-85.84,2022-03-01,39.654839,7.264516,15.677419,21.209677,51.16129,31.303226,0.0,0.0
3,41.2747,-85.84,2022-04-01,45.826667,7.633333,15.743333,22.05,55.76,38.3,0.0,0.0
4,41.2747,-85.84,2022-05-01,62.616129,5.912903,12.867742,18.916129,73.225806,53.890323,0.0,0.0


In [9]:
# Function to find the closest weather station for each power plant
def find_nearest_weather_station(row, weather_data):
    """Return the weather record closest to a power plant for the plant's period.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'period', 'Latitude' and 'Longitude'. Coordinates are
        assumed to be decimal degrees.
    weather_data : pandas.DataFrame
        Must contain 'DATE', 'LATITUDE' and 'LONGITUDE' columns (coordinates
        assumed to be decimal degrees). Only rows whose 'DATE' equals
        ``row['period']`` are considered.

    Returns
    -------
    pandas.Series or None
        The row of ``weather_data`` with the smallest great-circle distance to
        the plant, or ``None`` when no weather row exists for that period.
        On ties the first matching row (in ``weather_data`` order) is returned.

    Notes
    -----
    Distance is ranked with the haversine formula rather than a plain Euclidean
    distance on degrees: a degree of longitude covers less ground than a degree
    of latitude away from the equator, so comparing raw degree differences can
    select a station that is actually farther away.
    """
    # Restrict candidates to stations reporting for the same period as the plant.
    same_period_stations = weather_data[weather_data['DATE'] == row['period']]
    if same_period_stations.empty:
        return None

    plant_lat = np.radians(row['Latitude'])
    plant_lon = np.radians(row['Longitude'])
    station_lat = np.radians(same_period_stations['LATITUDE'])
    station_lon = np.radians(same_period_stations['LONGITUDE'])

    # Haversine term. It grows monotonically with great-circle distance, so the
    # row minimising it is the nearest station; converting to kilometres is not
    # needed just to pick the minimum.
    haversine_term = (
        np.sin((station_lat - plant_lat) / 2) ** 2
        + np.cos(plant_lat) * np.cos(station_lat) * np.sin((station_lon - plant_lon) / 2) ** 2
    )

    # idxmin returns the index label of the closest station; .loc fetches its full record.
    return same_period_stations.loc[haversine_term.idxmin()]

In [10]:
# Look up the nearest weather record for every plant row.
# With axis=1 each row is passed as the first argument; the aggregated monthly
# weather table is forwarded as the second argument through `args`.
closest_weather_stations = power_plants_data.apply(
    find_nearest_weather_station,
    axis=1,
    args=(aggregated_weather_data,),
)

In [11]:
# Join each plant row to its nearest-station weather record on the shared row index.
# Column names present in both frames get the '_plant' / '_weather' suffixes.
merged_data = power_plants_data.merge(
    closest_weather_stations,
    left_index=True,
    right_index=True,
    suffixes=('_plant', '_weather'),
)
merged_data.head(5)

Unnamed: 0,period,plantCode,plantName,fuel2002,fuelTypeDescription,state,stateDescription,primeMover,total-consumption,total-consumption-units,...,LONGITUDE,DATE,TEMP,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP
0,2020-01-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,84432,MMBtu per Mcf,...,-82.5333,2020-01-01,30.729032,8.580645,15.383871,0.0,34.825806,27.370968,0.0,0.0
1,2020-12-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,11343,MMBtu per Mcf,...,-82.5333,2020-12-01,32.380645,9.235484,14.874194,0.0,36.445161,29.370968,0.0,0.0
2,2023-08-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,109965,MMBtu per Mcf,...,-82.42,2023-08-01,67.809677,8.648387,15.622581,0.0,74.925806,61.735484,0.0,0.0
3,2023-04-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,12548,MMBtu per Mcf,...,-82.42,2023-04-01,47.196667,8.24,14.86,0.0,56.623333,39.55,0.0,0.0
4,2023-02-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,96587,MMBtu per Mcf,...,-82.42,2023-02-01,31.228571,8.625,16.435714,0.0,41.103571,25.392857,0.0,0.0


In [12]:
# Write the merged plant/weather table to the working directory; index=False
# leaves the DataFrame's row index out of the file.
merged_output_path = "Merged_Data.csv"
merged_data.to_csv(merged_output_path, index=False)

In [13]:
# Sanity checks - power plants data and merged data should have the same number of rows
print(len(weather_data))
print(len(power_plants_data))
print(len(merged_data))

# Enforce the invariant instead of relying on a visual comparison of the printed
# counts: a mismatch means plant rows were lost or duplicated by the merge.
assert len(merged_data) == len(power_plants_data), (
    f"merged_data has {len(merged_data)} rows but "
    f"power_plants_data has {len(power_plants_data)} rows"
)

199595
4375
4375
