In [50]:
import pandas as pd

In [51]:
# Load the CSV files into pandas dataframes
weather_data = pd.read_csv("Weather_data.csv", dtype={'STATION': 'string'})
power_plants_data = pd.read_csv("Power_Plants_in_the_U.S.csv")

# Convert the 'DATE' column to datetime format in weather_data
weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])

# Round latitude and longitude columns for better matching
weather_data['LATITUDE'] = weather_data['LATITUDE'].round(4)
weather_data['LONGITUDE'] = weather_data['LONGITUDE'].round(4)
power_plants_data['Latitude'] = power_plants_data['Latitude'].round(4)
power_plants_data['Longitude'] = power_plants_data['Longitude'].round(4)

In [52]:
# Aggregate the weather data to make it more manageable
columns_to_aggregate = ['LATITUDE','LONGITUDE','TEMP', 'DEWP', 'SLP',
                         'STP', 'VISIB', 'WDSP', 
                         'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP',
                         'SNDP', ]
weather_subset = weather_data[columns_to_aggregate]

# Group by the monthly period (to match the natural gas data) and aggregate weather data for the selected columns
aggregated_weather_data = weather_subset.groupby(['LATITUDE', 'LONGITUDE', weather_data['DATE'].dt.to_period("M")]).agg('mean').reset_index()

aggregated_weather_data.head(5)


Unnamed: 0,LATITUDE,LONGITUDE,DATE,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP
0,41.2747,-85.84,2022-01,22.132258,14.416129,9999.9,923.93871,8.912903,6.619355,13.222581,241.948387,29.967742,14.754839,74.186129,999.9
1,41.2747,-85.84,2022-02,27.142857,19.717857,9999.9,953.521429,8.582143,7.207143,14.192857,161.335714,35.6,19.85,67.850357,999.9
2,41.2747,-85.84,2022-03,39.654839,29.858065,9999.9,984.245161,9.13871,7.264516,15.677419,117.974194,51.16129,31.303226,48.382258,999.9
3,41.2747,-85.84,2022-04,45.826667,35.43,9999.9,984.483333,9.48,7.633333,15.743333,22.05,55.76,38.3,79.992,999.9
4,41.2747,-85.84,2022-05,62.616129,50.967742,9999.9,983.735484,9.470968,5.912903,12.867742,83.425806,73.225806,53.890323,48.382258,999.9


In [53]:
# Function to find the closest weather station for each power plant
def find_nearest_weather_station(row, weather_data):
    # Calculate the Euclidean distance between the power plant and each weather station
    distances = ((weather_data['LATITUDE'] - row['Latitude'])**2 + (weather_data['LONGITUDE'] - row['Longitude'])**2)**0.5
    # Find the index of the weather station with the minimum distance
    nearest_station_index = distances.idxmin()
    # Get the data for the nearest weather station
    nearest_station = weather_data.loc[nearest_station_index]
    return nearest_station

In [54]:
# Apply the custom function to each power plant row
closest_weather_stations = power_plants_data.apply(lambda row: find_nearest_weather_station(row, weather_data), axis=1)
closest_weather_stations.head(5)


Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,TEMP_ATTRIBUTES,DEWP,DEWP_ATTRIBUTES,...,MXSPD,GUST,MAX,MAX_ATTRIBUTES,MIN,MIN_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNDP,FRSHTT
0,72034354852,2022-01-01,44.3333,-89.0197,251.8,"WAUPACA MUNICIPAL AIRPORT, WI US",20.9,24,12.5,24,...,12.0,15.9,31.5,,13.8,,0.0,I,999.9,0
1,72648794896,2021-01-01,45.1167,-87.6333,190.5,"MENOMINEE, MI US",17.9,24,15.5,24,...,8.0,999.9,30.0,,10.9,,0.0,I,999.9,100000
2,72648794896,2021-01-01,45.1167,-87.6333,190.5,"MENOMINEE, MI US",17.9,24,15.5,24,...,8.0,999.9,30.0,,10.9,,0.0,I,999.9,100000
3,72648794896,2021-01-01,45.1167,-87.6333,190.5,"MENOMINEE, MI US",17.9,24,15.5,24,...,8.0,999.9,30.0,,10.9,,0.0,I,999.9,100000
4,72744594926,2021-01-01,46.5333,-90.1333,374.9,"IRONWOOD, MI US",16.7,24,11.4,24,...,12.0,15.0,35.1,,6.1,,0.0,I,999.9,0


In [56]:
merged_data = pd.merge(power_plants_data, closest_weather_stations, left_index=True, right_index=True, suffixes=('_plant', '_weather'))
merged_data.drop(columns=['LATITUDE', 'LONGITUDE'], inplace=True)
merged_data.head(5)

Unnamed: 0,X,Y,FID,OBJECTID,Plant_Code,Plant_Name,Utility_ID,Utility_Na,sector_nam,Street_Add,...,MXSPD,GUST,MAX,MAX_ATTRIBUTES,MIN,MIN_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNDP,FRSHTT
0,-89.685,45.1786,1,13170,4068,Merrill,20860,Wisconsin Public Service Corp,Electric Utility,306 S Park St,...,12.0,15.9,31.5,,13.8,,0.0,I,999.9,0
1,-87.7586,45.1136,2,13171,4071,Potato Rapids,20860,Wisconsin Public Service Corp,Electric Utility,N4295 Potato Rapids Rd,...,8.0,999.9,30.0,,10.9,,0.0,I,999.9,100000
2,-88.0086,44.54,3,13172,4072,Pulliam,20860,Wisconsin Public Service Corp,Electric Utility,1530 N Bylsby Av,...,8.0,999.9,30.0,,10.9,,0.0,I,999.9,100000
3,-88.0678,45.2333,4,13173,4074,Sandstone Rapids,20860,Wisconsin Public Service Corp,Electric Utility,N7633 Sandstone Lane,...,8.0,999.9,30.0,,10.9,,0.0,I,999.9,100000
4,-89.7306,45.4411,5,13174,4075,Tomahawk,20860,Wisconsin Public Service Corp,Electric Utility,W6080 Pride Pond Rd,...,12.0,15.0,35.1,,6.1,,0.0,I,999.9,0


In [57]:
merged_data.to_csv("Merged_Data.csv", index=False)