In [5]:
import pandas as pd

In [6]:
# Load the CSV files for weather and power consumption into pandas dataframes
weather_data = pd.read_csv("Weather_data.csv", dtype={'STATION': 'string'})
power_plants_data = pd.read_csv("Power_Plants_in_the_U.S.csv")

In [36]:
# Convert the 'DATE' column to datetime format in weather_data
weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])
power_plants_data['Period'] = pd.to_datetime(power_plants_data['Period'], format='%Y%m')

# Round latitude and longitude columns for better matching
weather_data['LATITUDE'] = weather_data['LATITUDE'].round(4)
weather_data['LONGITUDE'] = weather_data['LONGITUDE'].round(4)
power_plants_data['Latitude'] = power_plants_data['Latitude'].round(4)
power_plants_data['Longitude'] = power_plants_data['Longitude'].round(4)

In [46]:
# Aggregate the weather data to make it more manageable
columns_to_aggregate = ['LATITUDE','LONGITUDE','DATE', 'TEMP', 'DEWP', 'SLP',
                         'STP', 'VISIB', 'WDSP', 
                         'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP',
                         'SNDP', ]
weather_subset = weather_data[columns_to_aggregate]

# Group by the monthly period (to match the natural gas data) and aggregate weather data for the selected columns
aggregated_weather_data = weather_subset.groupby(['LATITUDE', 'LONGITUDE', pd.Grouper(key='DATE', freq='MS')]).agg(['mean']).reset_index().droplevel(1, axis=1)

aggregated_weather_data.head(5)


Unnamed: 0,LATITUDE,LONGITUDE,DATE,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP
0,41.2747,-85.84,2022-01-01,22.132258,14.416129,9999.9,923.93871,8.912903,6.619355,13.222581,241.948387,29.967742,14.754839,74.186129,999.9
1,41.2747,-85.84,2022-02-01,27.142857,19.717857,9999.9,953.521429,8.582143,7.207143,14.192857,161.335714,35.6,19.85,67.850357,999.9
2,41.2747,-85.84,2022-03-01,39.654839,29.858065,9999.9,984.245161,9.13871,7.264516,15.677419,117.974194,51.16129,31.303226,48.382258,999.9
3,41.2747,-85.84,2022-04-01,45.826667,35.43,9999.9,984.483333,9.48,7.633333,15.743333,22.05,55.76,38.3,79.992,999.9
4,41.2747,-85.84,2022-05-01,62.616129,50.967742,9999.9,983.735484,9.470968,5.912903,12.867742,83.425806,73.225806,53.890323,48.382258,999.9


In [47]:
# Function to find the closest weather station for each power plant
def find_nearest_weather_station(row, weather_data):
    # Filter weather stations based on the same period
    filtered_weather_data = weather_data[weather_data['DATE'] == row['Period']]
    if filtered_weather_data.empty:
        return None
    
    # Calculate the Euclidean distance between the power plant and each filtered weather station
    distances = ((filtered_weather_data['LATITUDE'] - row['Latitude'])**2 + (filtered_weather_data['LONGITUDE'] - row['Longitude'])**2)**0.5
    # Find the index of the weather station with the minimum distance
    nearest_station_index = distances.idxmin()
    # Get the data for the nearest weather station
    nearest_station = filtered_weather_data.loc[nearest_station_index]
    
    return nearest_station

In [48]:
# Apply the custom function to each power plant row
closest_weather_stations = power_plants_data.apply(lambda row: find_nearest_weather_station(row, aggregated_weather_data), axis=1)
closest_weather_stations.head(5)


Unnamed: 0,LATITUDE,LONGITUDE,DATE,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP
0,46.5333,-90.1333,2023-05-01,55.803226,40.635484,1018.316129,973.454839,9.616129,5.780645,11.748387,398.43871,68.858065,42.306452,0.032903,999.9
1,45.1167,-87.6333,2023-05-01,54.841935,41.416129,1020.270968,769.516129,9.622581,6.345161,12.867742,431.067742,67.564516,41.829032,32.254839,999.9
2,45.1167,-87.6333,2023-05-01,54.841935,41.416129,1020.270968,769.516129,9.622581,6.345161,12.867742,431.067742,67.564516,41.829032,32.254839,999.9
3,45.1167,-87.6333,2023-05-01,54.841935,41.416129,1020.270968,769.516129,9.622581,6.345161,12.867742,431.067742,67.564516,41.829032,32.254839,999.9
4,46.5333,-90.1333,2023-05-01,55.803226,40.635484,1018.316129,973.454839,9.616129,5.780645,11.748387,398.43871,68.858065,42.306452,0.032903,999.9


In [49]:
merged_data = pd.merge(power_plants_data, closest_weather_stations, left_index=True, right_index=True, suffixes=('_plant', '_weather'))
merged_data.drop(columns=['LATITUDE', 'LONGITUDE'], inplace=True)
merged_data.head(5)

Unnamed: 0,X,Y,FID,OBJECTID,Plant_Code,Plant_Name,Utility_ID,Utility_Na,sector_nam,Street_Add,...,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP
0,-89.685,45.1786,1,13170,4068,Merrill,20860,Wisconsin Public Service Corp,Electric Utility,306 S Park St,...,1018.316129,973.454839,9.616129,5.780645,11.748387,398.43871,68.858065,42.306452,0.032903,999.9
1,-87.7586,45.1136,2,13171,4071,Potato Rapids,20860,Wisconsin Public Service Corp,Electric Utility,N4295 Potato Rapids Rd,...,1020.270968,769.516129,9.622581,6.345161,12.867742,431.067742,67.564516,41.829032,32.254839,999.9
2,-88.0086,44.54,3,13172,4072,Pulliam,20860,Wisconsin Public Service Corp,Electric Utility,1530 N Bylsby Av,...,1020.270968,769.516129,9.622581,6.345161,12.867742,431.067742,67.564516,41.829032,32.254839,999.9
3,-88.0678,45.2333,4,13173,4074,Sandstone Rapids,20860,Wisconsin Public Service Corp,Electric Utility,N7633 Sandstone Lane,...,1020.270968,769.516129,9.622581,6.345161,12.867742,431.067742,67.564516,41.829032,32.254839,999.9
4,-89.7306,45.4411,5,13174,4075,Tomahawk,20860,Wisconsin Public Service Corp,Electric Utility,W6080 Pride Pond Rd,...,1018.316129,973.454839,9.616129,5.780645,11.748387,398.43871,68.858065,42.306452,0.032903,999.9


In [50]:
merged_data.to_csv("Merged_Data.csv", index=False)

In [51]:
# Sanity checks - power plants data and merged data should have the same number of rows
print(len(weather_data))
print(len(power_plants_data))
print(len(merged_data))

119639
12008
12008
