<a href="https://colab.research.google.com/github/nlemoff/data102project/blob/main/Question2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

api_key = 'vgyKsY7pYEHEeo0dncg7ho5nlLp0h2JjAAMBT6nL'

# API endpoint
url = 'https://api.eia.gov/v2/electricity/rto/daily-region-data/data/'

def fetch_eia_data(data_type, respondent, start_date, end_date):
    params = {
        'frequency': 'daily',
        'data[0]': 'value',
        'facets[respondent][]': respondent,  # Balancing authority
        'facets[type][]': data_type,  # 'D' for Demand, 'NG' for Net Generation
        'start': start_date,
        'end': end_date,
        'api_key': api_key
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return pd.DataFrame(response.json()['response']['data'])
    else:
        print(f"Error: {response.status_code}")
        print(response.json())
        return pd.DataFrame()

df_demand = fetch_eia_data('D', 'CISO', '2023-01-01', '2023-06-30')  # Early year (likely low demand)
df_demand_high = fetch_eia_data('D', 'CISO', '2023-07-01', '2023-12-31')  # Late year (likely high demand)

df_supply = fetch_eia_data('NG', 'CISO', '2023-01-01', '2023-06-30')
df_supply_high = fetch_eia_data('NG', 'CISO', '2023-07-01', '2023-12-31')

df_demand = pd.concat([df_demand, df_demand_high], ignore_index=True)
df_supply = pd.concat([df_supply, df_supply_high], ignore_index=True)

df_combined = pd.merge(
    df_demand, df_supply,
    on=['period', 'respondent', 'timezone'],
    suffixes=('_demand', '_supply')
)

df_combined['demand_exceeds_supply'] = df_combined['value_demand'] > df_combined['value_supply']

true_cases = df_combined[df_combined['demand_exceeds_supply'] == True]
false_cases = df_combined[df_combined['demand_exceeds_supply'] == False]

false_cases_upsampled = resample(false_cases, replace=True, n_samples=len(true_cases), random_state=42)

df_balanced = pd.concat([true_cases, false_cases_upsampled])

features = ['value_demand', 'value_supply']
X = df_balanced[features]
y = df_balanced['demand_exceeds_supply']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

glm = LogisticRegression()
glm.fit(X_train, y_train)
y_pred_glm = glm.predict(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_glm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_glm))

print("\nKNN Results:")
print(classification_report(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

Logistic Regression Results:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       347
        True       1.00      1.00      1.00       379

    accuracy                           1.00       726
   macro avg       1.00      1.00      1.00       726
weighted avg       1.00      1.00      1.00       726

Confusion Matrix:
 [[347   0]
 [  1 378]]

KNN Results:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       347
        True       1.00      1.00      1.00       379

    accuracy                           1.00       726
   macro avg       1.00      1.00      1.00       726
weighted avg       1.00      1.00      1.00       726

Confusion Matrix:
 [[347   0]
 [  1 378]]


In [None]:
import requests
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Your EIA API key
api_key = 'vgyKsY7pYEHEeo0dncg7ho5nlLp0h2JjAAMBT6nL'

# API endpoint
url = 'https://api.eia.gov/v2/electricity/rto/daily-region-data/data/'

def fetch_eia_data(data_type, respondent, start_date, end_date):
    params = {
        'frequency': 'daily',
        'data[0]': 'value',
        'facets[respondent][]': respondent,  # Balancing authority
        'facets[type][]': data_type,  # 'D' for Demand, 'NG' for Net Generation
        'start': start_date,
        'end': end_date,
        'api_key': api_key
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return pd.DataFrame(response.json()['response']['data'])
    else:
        print(f"Error: {response.status_code}")
        print(response.json())
        return pd.DataFrame()

df_demand = fetch_eia_data('D', 'CISO', '2023-01-01', '2023-06-30')  # Early year (likely low demand)
df_demand_high = fetch_eia_data('D', 'CISO', '2023-07-01', '2023-12-31')  # Late year (likely high demand)

df_supply = fetch_eia_data('NG', 'CISO', '2023-01-01', '2023-06-30')
df_supply_high = fetch_eia_data('NG', 'CISO', '2023-07-01', '2023-12-31')

df_demand = pd.concat([df_demand, df_demand_high], ignore_index=True)
df_supply = pd.concat([df_supply, df_supply_high], ignore_index=True)

df_combined = pd.merge(
    df_demand, df_supply,
    on=['period', 'respondent', 'timezone'],
    suffixes=('_demand', '_supply')
)

# Load temperature data
temperature_data_path = 'temperature_data_2023.csv'
df_temperature = pd.read_csv(temperature_data_path)

df_temperature['date'] = pd.to_datetime(df_temperature['date'])
df_temperature_agg = df_temperature.groupby('date')['value'].mean().reset_index()
df_temperature_agg.rename(columns={'value': 'avg_temperature'}, inplace=True)

df_combined['date'] = pd.to_datetime(df_combined['period'])

# Merge with aggregated temperature data
df_combined = pd.merge(
    df_combined,
    df_temperature_agg,
    on='date',
    how='left'
)

# Define thresholds for heatwaves and cold snaps
heatwave_threshold = df_combined['avg_temperature'].quantile(0.95)
cold_snap_threshold = df_combined['avg_temperature'].quantile(0.05)

# Add flags for extreme weather
df_combined['is_heatwave'] = (df_combined['avg_temperature'] > heatwave_threshold).astype(int)
df_combined['is_cold_snap'] = (df_combined['avg_temperature'] < cold_snap_threshold).astype(int)

df_combined['demand_exceeds_supply'] = df_combined['value_demand'] > df_combined['value_supply']
true_cases = df_combined[df_combined['demand_exceeds_supply'] == True]
false_cases = df_combined[df_combined['demand_exceeds_supply'] == False]
false_cases_upsampled = resample(false_cases, replace=True, n_samples=len(true_cases), random_state=42)
df_balanced = pd.concat([true_cases, false_cases_upsampled])
features = ['value_demand', 'value_supply', 'avg_temperature', 'is_heatwave', 'is_cold_snap']
X = df_balanced[features]
y = df_balanced['demand_exceeds_supply']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression Model
glm = LogisticRegression()
glm.fit(X_train, y_train)
y_pred_glm = glm.predict(X_test)

# Train KNN Model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Evaluate Models
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_glm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_glm))

print("\nKNN Results:")
print(classification_report(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))


Logistic Regression Results:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       347
        True       1.00      1.00      1.00       379

    accuracy                           1.00       726
   macro avg       1.00      1.00      1.00       726
weighted avg       1.00      1.00      1.00       726

Confusion Matrix:
 [[347   0]
 [  1 378]]

KNN Results:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       347
        True       1.00      1.00      1.00       379

    accuracy                           1.00       726
   macro avg       1.00      1.00      1.00       726
weighted avg       1.00      1.00      1.00       726

Confusion Matrix:
 [[347   0]
 [  1 378]]


In [None]:
print("Class distribution after balancing:")
print(df_balanced['demand_exceeds_supply'].value_counts())

print("Feature distribution for 'True' cases:")
print(df_balanced[df_balanced['demand_exceeds_supply'] == True][features].describe())

print("\nFeature distribution for 'False' cases:")
print(df_balanced[df_balanced['demand_exceeds_supply'] == False][features].describe())


Class distribution after balancing:
demand_exceeds_supply
True     1813
False    1813
Name: count, dtype: int64
Feature distribution for 'True' cases:
       avg_temperature  is_heatwave  is_cold_snap
count      1813.000000  1813.000000   1813.000000
mean        126.225630     0.046332      0.049641
std          70.383915     0.210261      0.217263
min         -14.009677     0.000000      0.000000
25%          66.337621     0.000000      0.000000
50%         125.474522     0.000000      0.000000
75%         184.656716     0.000000      0.000000
max         278.203125     1.000000      1.000000

Feature distribution for 'False' cases:
       avg_temperature  is_heatwave  is_cold_snap
count      1813.000000  1813.000000        1813.0
mean        168.522172     0.495863           0.0
std          79.607686     0.500121           0.0
min          79.935691     0.000000           0.0
25%          79.935691     0.000000           0.0
50%         139.520900     0.000000           0.0
75%     

In [None]:
numeric_columns = df_balanced[['value_demand', 'value_supply', 'avg_temperature', 'is_heatwave', 'is_cold_snap', 'demand_exceeds_supply']]
correlations = numeric_columns.corr()

print("Correlation with 'demand_exceeds_supply':")
print(correlations['demand_exceeds_supply'].sort_values(ascending=False))

Correlation with 'demand_exceeds_supply':
demand_exceeds_supply    1.000000
is_cold_snap             0.159538
avg_temperature         -0.271003
value_demand            -0.275520
is_heatwave             -0.505629
value_supply            -0.596277
Name: demand_exceeds_supply, dtype: float64


In [None]:
# Weather features only
features_weather = ['avg_temperature', 'is_heatwave', 'is_cold_snap']
X_weather = df_balanced[features_weather]

X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_weather, y, test_size=0.2, random_state=42)

glm_w = LogisticRegression()
glm_w.fit(X_train_w, y_train_w)
y_pred_glm_w = glm_w.predict(X_test_w)
print("Logistic Regression (Weather Features Only):")
print(classification_report(y_test_w, y_pred_glm_w))

knn_w = KNeighborsClassifier(n_neighbors=5)
knn_w.fit(X_train_w, y_train_w)
y_pred_knn_w = knn_w.predict(X_test_w)
print("\nKNN (Weather Features Only):")
print(classification_report(y_test_w, y_pred_knn_w))

Logistic Regression (Weather Features Only):
              precision    recall  f1-score   support

       False       0.67      0.46      0.54       347
        True       0.61      0.79      0.69       379

    accuracy                           0.63       726
   macro avg       0.64      0.62      0.62       726
weighted avg       0.64      0.63      0.62       726


KNN (Weather Features Only):
              precision    recall  f1-score   support

       False       0.99      1.00      0.99       347
        True       1.00      0.99      0.99       379

    accuracy                           0.99       726
   macro avg       0.99      0.99      0.99       726
weighted avg       0.99      0.99      0.99       726

