In [1]:
import os

import numpy as np
import pandas as pd


In [2]:
# Get all Weather data first
# List all files in the directory. os.listdir returns entries in arbitrary
# order, so sort them to make the row order of the combined frame reproducible.
folder_path = "../Cleaning"
files = sorted(os.listdir(folder_path))

# Filter files that end with "cleaned.csv"
cleaned_files = [file for file in files if file.endswith("cleaned.csv")]

# Read every file first and concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all accumulated rows on each iteration.
# STATION is read with the pandas 'string' dtype rather than an inferred one.
weather_frames = [
    pd.read_csv(os.path.join(folder_path, file), dtype={'STATION': 'string'})
    for file in cleaned_files
]

# pd.concat raises on an empty list, so keep an empty frame when nothing matched
if weather_frames:
    weather_data = pd.concat(weather_frames, ignore_index=True)
else:
    weather_data = pd.DataFrame()

print(weather_data.columns)
print(weather_data.head(5))

Index(['STATION', 'DATE', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', 'TEMP',
       'WDSP', 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP', 'TEMPEXT_BASE40',
       'TEMPEXT_BASE45', 'TEMPEXT_BASE50'],
      dtype='object')
       STATION        DATE  LATITUDE  LONGITUDE  ELEVATION  \
0  72037392824  2019-01-01  28.00028  -82.16417       46.9   
1  72037392824  2019-01-02  28.00028  -82.16417       46.9   
2  72037392824  2019-01-03  28.00028  -82.16417       46.9   
3  72037392824  2019-01-04  28.00028  -82.16417       46.9   
4  72037392824  2019-01-05  28.00028  -82.16417       46.9   

                                  NAME  TEMP  WDSP  MXSPD  GUST   MAX   MIN  \
0  PLANT CITY MUNICIPAL AIRPORT, FL US  71.8   1.6    8.9   0.0  84.2  62.6   
1  PLANT CITY MUNICIPAL AIRPORT, FL US  73.1   2.4    9.9   0.0  84.2  66.2   
2  PLANT CITY MUNICIPAL AIRPORT, FL US  73.0   4.1    9.9   0.0  82.4  64.4   
3  PLANT CITY MUNICIPAL AIRPORT, FL US  73.2   5.4   15.9  26.0  84.2  66.2   
4  PLA

In [3]:
# Load the per-state plant location files and stack them into a single frame
plant_location_files = [
    "../Cleaning/Plant_Location_MI.csv",
    "../Cleaning/Plant_Location_WA.csv",
    "../Cleaning/Plant_Location_TX.csv",
    "../Cleaning/Plant_Location_FL.csv",
]
pp1, pp2, pp3, pp4 = [pd.read_csv(path) for path in plant_location_files]

power_plants_data = pd.concat([pp1, pp2, pp3, pp4], ignore_index=True)
power_plants_data.head(5)

Unnamed: 0,period,plantCode,plantName,fuel2002,fuelTypeDescription,state,stateDescription,primeMover,total-consumption,total-consumption-units,...,NG_MW,Nuclear_MW,Crude_MW,Solar_MW,Wind_MW,Other_MW,Source,Period,Longitude,Latitude
0,2020-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,84432,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.619167,43.266389
1,2020-12,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,11343,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.619167,43.266389
2,2023-08,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,109965,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.619167,43.266389
3,2023-04,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,12548,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.619167,43.266389
4,2023-02,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,96587,MMBtu per Mcf,...,0.0,0.0,0.0,0.0,0.0,0.0,"EIA-860, EIA-860M and EIA-923",202305.0,-82.619167,43.266389


In [4]:
# Attach the columns from Plant_Clusters.csv to every plant row, matched on
# plantCode; the left join keeps plants that have no entry in that file.
unsup = pd.read_csv("../Analysis/Plant_Clusters.csv")
power_plants_data = pd.merge(
    power_plants_data,
    unsup,
    how='left',
    on='plantCode',
)

In [5]:
# Convert the 'DATE' column to datetime format in weather_data.
# Parse once with errors='coerce' (unparseable values become NaT), drop those
# rows, and reuse the parsed values instead of parsing the column a second time.
parsed_dates = pd.to_datetime(weather_data['DATE'], errors='coerce')
valid_date_mask = parsed_dates.notna()
# .copy() makes the filtered frame independent of the original, so the column
# assignments below cannot trigger SettingWithCopyWarning.
weather_data = weather_data.loc[valid_date_mask].copy()
weather_data['DATE'] = parsed_dates.loc[valid_date_mask]

# Parse the 'YYYY-MM' period, then keep only rows from 2019 onwards whose
# primeMover is "ALL" and that have no missing values in any column.
power_plants_data = power_plants_data.assign(
    period=pd.to_datetime(power_plants_data['period'], format='%Y-%m')
)
from_2019_onwards = power_plants_data['period'] >= pd.to_datetime('2019-01-01')
prime_mover_is_all = power_plants_data['primeMover'] == "ALL"
power_plants_data = (
    power_plants_data.loc[from_2019_onwards & prime_mover_is_all]
    .dropna()
    .copy()
)

# Round latitude and longitude columns for better matching
weather_data['LATITUDE'] = weather_data['LATITUDE'].round(4)
weather_data['LONGITUDE'] = weather_data['LONGITUDE'].round(4)
power_plants_data['Latitude'] = power_plants_data['Latitude'].round(4)
power_plants_data['Longitude'] = power_plants_data['Longitude'].round(4)

power_plants_data.head(5)

Unnamed: 0,period,plantCode,plantName,fuel2002,fuelTypeDescription,state,stateDescription,primeMover,total-consumption,total-consumption-units,...,KM8_6,KM8_7,H8_0,H8_1,H8_2,H8_3,H8_4,H8_5,H8_6,H8_7
0,2020-01-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,84432,MMBtu per Mcf,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-12-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,11343,MMBtu per Mcf,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-08-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,109965,MMBtu per Mcf,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-04-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,12548,MMBtu per Mcf,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-02-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,96587,MMBtu per Mcf,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Reduce the daily weather records to one averaged row per location and month
location_keys = ['LATITUDE', 'LONGITUDE']
measurement_columns = [
    'ELEVATION', 'TEMP', 'WDSP', 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP',
    'TEMPEXT_BASE40', 'TEMPEXT_BASE45', 'TEMPEXT_BASE50',
]
columns_to_aggregate = location_keys + ['DATE'] + measurement_columns
weather_subset = weather_data[columns_to_aggregate]

# Bucket DATE by month start (to match the monthly natural gas data) and take
# the mean of every remaining column within each location/month group
monthly_groups = weather_subset.groupby(
    location_keys + [pd.Grouper(key='DATE', freq='MS')]
)
aggregated_weather_data = monthly_groups.mean().reset_index()

aggregated_weather_data.head(5)


Unnamed: 0,LATITUDE,LONGITUDE,DATE,ELEVATION,TEMP,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,TEMPEXT_BASE40,TEMPEXT_BASE45,TEMPEXT_BASE50
0,24.46,-81.88,2019-01-01,0.0,69.954839,0.0,0.0,0.0,73.229032,66.906452,0.0,0.0,29.954839,24.954839,19.954839
1,24.46,-81.88,2019-02-01,0.0,74.864286,0.0,0.0,0.0,76.5,73.067857,0.0,0.0,34.864286,29.864286,24.864286
2,24.46,-81.88,2019-03-01,0.0,74.048387,0.0,0.0,0.0,75.867742,72.409677,0.0,0.0,34.048387,29.048387,24.048387
3,24.46,-81.88,2019-04-01,0.0,76.973333,0.0,0.0,0.0,78.336667,75.533333,0.0,0.0,36.973333,31.973333,26.973333
4,24.46,-81.88,2019-05-01,0.0,80.412903,0.0,0.0,0.0,81.529032,78.806452,0.0,0.0,40.412903,35.412903,30.412903


In [7]:
# Function to find the closest weather station for each power plant
def find_nearest_weather_station(row, weather_data):
    """Return the weather record nearest to a power plant for the plant's period.

    Parameters
    ----------
    row : pandas.Series
        One power-plant record; 'period', 'Latitude' and 'Longitude' are read.
    weather_data : pandas.DataFrame
        Weather records with 'DATE', 'LATITUDE' and 'LONGITUDE' columns.
        Coordinates are interpreted as decimal degrees.

    Returns
    -------
    pandas.Series or None
        The row of ``weather_data`` whose DATE equals ``row['period']`` and
        whose great-circle distance to the plant is smallest, or None when no
        weather record has that DATE.
    """
    # Only weather records for the same period are candidates
    same_period = weather_data[weather_data['DATE'] == row['period']]
    if same_period.empty:
        return None

    # Haversine (great-circle) distance. A plain Euclidean distance on raw
    # degrees treats a degree of longitude as equal to a degree of latitude,
    # but longitude degrees shrink with cos(latitude), so it can rank an
    # east-west neighbour as farther away than it really is.
    plant_lat = np.radians(row['Latitude'])
    plant_lon = np.radians(row['Longitude'])
    station_lat = np.radians(same_period['LATITUDE'])
    station_lon = np.radians(same_period['LONGITUDE'])

    half_chord_sq = (
        np.sin((station_lat - plant_lat) / 2) ** 2
        + np.cos(plant_lat) * np.cos(station_lat)
        * np.sin((station_lon - plant_lon) / 2) ** 2
    )
    # Clip guards against floating-point overshoot before sqrt/arcsin.
    # The result is the central angle in radians; only the ordering matters.
    distances = 2 * np.arcsin(np.sqrt(half_chord_sq.clip(lower=0.0, upper=1.0)))

    # Label of the closest candidate, then its full record
    nearest_station_index = distances.idxmin()
    return same_period.loc[nearest_station_index]

In [8]:
# Look up the nearest same-period weather record for every power plant row
closest_weather_stations = power_plants_data.apply(
    find_nearest_weather_station,
    axis=1,
    args=(aggregated_weather_data,),
)

In [9]:
# Pair each plant row with its matched weather record through the shared row index
merged_data = power_plants_data.merge(
    closest_weather_stations,
    left_index=True,
    right_index=True,
    suffixes=('_plant', '_weather'),
)
merged_data.head(5)

Unnamed: 0,period,plantCode,plantName,fuel2002,fuelTypeDescription,state,stateDescription,primeMover,total-consumption,total-consumption-units,...,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,TEMPEXT_BASE40,TEMPEXT_BASE45,TEMPEXT_BASE50
0,2020-01-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,84432,MMBtu per Mcf,...,4.396667,10.5,0.0,71.056667,58.35,0.0,0.0,23.723333,18.723333,14.123333
1,2020-12-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,11343,MMBtu per Mcf,...,4.954839,10.467742,0.0,66.354839,50.877419,0.0,0.0,17.293548,12.487097,8.706452
2,2023-08-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,109965,MMBtu per Mcf,...,4.16129,8.86129,0.0,92.887097,81.596774,0.0,0.0,46.403226,41.403226,36.403226
3,2023-04-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,12548,MMBtu per Mcf,...,5.3,10.276667,0.0,79.033333,67.726667,0.0,0.0,32.903333,27.903333,22.903333
4,2023-02-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,96587,MMBtu per Mcf,...,5.007143,9.975,0.0,75.085714,59.167857,0.0,0.0,25.867857,20.867857,15.867857


In [10]:
# Write the merged plant/weather table to disk without the row index
output_path = "Merged_Data.csv"
merged_data.to_csv(output_path, index=False)

In [11]:
# Sanity checks - power plants data and merged data should have the same number of rows
print(len(weather_data))
print(len(power_plants_data))
print(len(merged_data))

# Enforce the invariant rather than relying on a visual comparison of the
# printed counts: a mismatch means plant rows were lost or duplicated when
# the weather records were merged in.
assert len(merged_data) == len(power_plants_data), (
    f"merged_data has {len(merged_data)} rows but power_plants_data has "
    f"{len(power_plants_data)}"
)

686810
6018
6018
