In [1]:
import pandas as pd

In [2]:
# Load the CSV files for weather and power consumption into pandas dataframes
weather_data = pd.read_csv("Weather_data.csv", dtype={'STATION': 'string'})
power_plants_data = pd.read_csv("MI_Plant_Location.csv")

In [3]:
# Convert the 'DATE' column to datetime format in weather_data
weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])
power_plants_data['period'] = pd.to_datetime(power_plants_data['period'], format='%Y-%m')

power_plants_data=power_plants_data.dropna()

# Round latitude and longitude columns for better matching
weather_data['LATITUDE'] = weather_data['LATITUDE'].round(4)
weather_data['LONGITUDE'] = weather_data['LONGITUDE'].round(4)
power_plants_data['Latitude'] = power_plants_data['Latitude'].round(4)
power_plants_data['Longitude'] = power_plants_data['Longitude'].round(4)

power_plants_data.head(5)

Unnamed: 0,period,plantCode,plantName,fuel2002,fuelTypeDescription,state,stateDescription,primeMover,total-consumption,total-consumption-units,...,NG_MW,Nuclear_MW,Crude_MW,Solar_MW,Wind_MW,Other_MW,Source,Period,Longitude,Latitude
240,2004-08-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,ALL,0.0,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891
241,2004-08-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,IC,0.0,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891
242,2002-11-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,,589.4,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891
243,2002-11-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,ALL,589.4,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891
244,2004-07-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,ALL,0.0,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891


In [4]:
# Aggregate the weather data to make it more manageable
columns_to_aggregate = ['LATITUDE','LONGITUDE','DATE', 'TEMP', 'DEWP', 'SLP',
                         'STP', 'VISIB', 'WDSP', 
                         'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP',
                         'SNDP', ]
weather_subset = weather_data[columns_to_aggregate]

# Group by the monthly period (to match the natural gas data) and aggregate weather data for the selected columns
aggregated_weather_data = weather_subset.groupby(['LATITUDE', 'LONGITUDE', pd.Grouper(key='DATE', freq='MS')]).agg(['mean']).reset_index().droplevel(1, axis=1)

aggregated_weather_data.head(5)


Unnamed: 0,LATITUDE,LONGITUDE,DATE,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP
0,41.2747,-85.84,2022-01-01,22.132258,14.416129,9999.9,923.93871,8.912903,6.619355,13.222581,241.948387,29.967742,14.754839,74.186129,999.9
1,41.2747,-85.84,2022-02-01,27.142857,19.717857,9999.9,953.521429,8.582143,7.207143,14.192857,161.335714,35.6,19.85,67.850357,999.9
2,41.2747,-85.84,2022-03-01,39.654839,29.858065,9999.9,984.245161,9.13871,7.264516,15.677419,117.974194,51.16129,31.303226,48.382258,999.9
3,41.2747,-85.84,2022-04-01,45.826667,35.43,9999.9,984.483333,9.48,7.633333,15.743333,22.05,55.76,38.3,79.992,999.9
4,41.2747,-85.84,2022-05-01,62.616129,50.967742,9999.9,983.735484,9.470968,5.912903,12.867742,83.425806,73.225806,53.890323,48.382258,999.9


In [5]:
# Function to find the closest weather station for each power plant
def find_nearest_weather_station(row, weather_data):
    # Filter weather stations based on the same period
    filtered_weather_data = weather_data[weather_data['DATE'] == row['period']]
    if filtered_weather_data.empty:
        return None
    
    # Calculate the Euclidean distance between the power plant and each filtered weather station
    distances = ((filtered_weather_data['LATITUDE'] - row['Latitude'])**2 + (filtered_weather_data['LONGITUDE'] - row['Longitude'])**2)**0.5
    # Find the index of the weather station with the minimum distance
    nearest_station_index = distances.idxmin()
    # Get the data for the nearest weather station
    nearest_station = filtered_weather_data.loc[nearest_station_index]
    
    return nearest_station

In [6]:
# Apply the custom function to each power plant row
closest_weather_stations = power_plants_data.apply(lambda row: find_nearest_weather_station(row, aggregated_weather_data), axis=1).to_frame()



In [7]:
print(type(power_plants_data))
print(type(closest_weather_stations))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [10]:
merged_data = pd.merge(power_plants_data, closest_weather_stations, left_index=True, right_index=True, suffixes=('_plant', '_weather'))
merged_data.head(5)

Unnamed: 0,period,plantCode,plantName,fuel2002,fuelTypeDescription,state,stateDescription,primeMover,total-consumption,total-consumption-units,...,Nuclear_MW,Crude_MW,Solar_MW,Wind_MW,Other_MW,Source,Period,Longitude,Latitude,0
240,2004-08-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,ALL,0.0,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891,
241,2004-08-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,IC,0.0,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891,
242,2002-11-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,,589.4,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891,
243,2002-11-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,ALL,589.4,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891,
244,2004-07-01,54903,Adrian Energy Associates LLC,NG,Natural Gas,MI,Michigan,ALL,0.0,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-83.9929,41.891,


In [11]:
merged_data.to_csv("Merged_Data.csv", index=False)

In [12]:
# Sanity checks - power plants data and merged data should have the same number of rows
print(len(weather_data))
print(len(power_plants_data))
print(len(merged_data))

119639
40494
40494
