# 7a – Logistic Regression Model Training

This notebook trains a baseline Logistic Regression model
to predict next-day extreme precipitation events.

The model emphasizes recall performance due to the
risk-sensitive nature of extreme weather prediction.


## Import Required Libraries and Initialize AWS Session


In [19]:
import boto3
import sagemaker
from sagemaker import get_execution_role
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)

# Resolve the SageMaker session, its default bucket, the boto3 region,
# and the notebook's execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
region = boto3.Session().region_name
role = get_execution_role()

# Key prefix for this project and a low-level S3 client.
project_prefix, s3 = "ghcn-extreme", boto3.client("s3")

# Echo the resolved bucket and region for a quick sanity check.
for label, value in (("Bucket:", bucket), ("Region:", region)):
    print(label, value)


Bucket: sagemaker-us-east-1-083422367993
Region: us-east-1


## Load Partitioned Parquet Dataset from S3 via Athena


In [20]:
from pyathena import connect

database_name = "ghcn_extreme_precip_db"
parquet_table = "extreme_precip_parquet"

# Athena connection; query results are staged under the session bucket.
staging_dir = f"s3://{bucket}/athena/staging/"
conn = connect(s3_staging_dir=staging_dir, region_name=region)

# Pull the whole table into a DataFrame.
select_all = f"SELECT * FROM {database_name}.{parquet_table}"
df = pd.read_sql(select_all, conn)

# The date column arrives as strings; parse it into datetimes.
df = df.assign(date=pd.to_datetime(df["date"]))

df.head()


  df = pd.read_sql(


Unnamed: 0,station_id,date,tmax,tmin,prcp_lag_1,prcp_roll_7,extreme_precip_tomorrow,month,year
0,USW00012921,2006-02-18,3.9,-1.1,1.5,0.214286,0,2,2006
1,USW00012921,2006-02-19,5.6,-1.7,0.0,0.257143,0,2,2006
2,USW00012921,2006-02-20,8.9,1.7,0.3,0.4,0,2,2006
3,USW00012921,2006-02-21,13.9,6.1,1.0,0.442857,0,2,2006
4,USW00012921,2006-02-22,22.2,12.8,0.3,0.442857,0,2,2006


## Sort by Date and Perform Time-Based Split

A time-aware split prevents leakage and preserves
the chronological structure of weather data.


In [21]:
# Order rows chronologically and renumber the index.
df = df.sort_values(by="date", ignore_index=True)

# Rows dated on or before the 80th-percentile date train; later rows validate.
split_date = df["date"].quantile(0.8)
is_train = df["date"].le(split_date)
is_val = df["date"].gt(split_date)

train_df = df.loc[is_train].copy()
val_df = df.loc[is_val].copy()

print("Training size:", train_df.shape)
print("Validation size:", val_df.shape)


Training size: (29155, 9)
Validation size: (7289, 9)


## Separate Features and Target


In [22]:
target_col = "extreme_precip_tomorrow"
dropped_cols = [target_col, "date"]

# Integer-typed labels for each partition.
y_train, y_val = (part[target_col].astype(int) for part in (train_df, val_df))

# Feature frames: every column except the label and the date.
X_train, X_val = (part.drop(columns=dropped_cols) for part in (train_df, val_df))


## Encode Categorical Feature (station_id)


In [23]:
# One-hot encode station_id with identical options for both partitions.
dummy_opts = {"columns": ["station_id"], "dummy_na": False}
X_train = pd.get_dummies(X_train, **dummy_opts)

# Encode validation rows, then force the exact training column layout;
# columns absent from validation are filled with 0.
X_val = pd.get_dummies(X_val, **dummy_opts).reindex(
    columns=X_train.columns, fill_value=0
)


In [24]:
# A notebook cell renders only its final expression, so the row count and
# the preview are emitted explicitly instead of being silently discarded.
print("Rows:", len(df))
display(df.head())  # `display` is injected by the IPython kernel

# Row counts per year for the first few years (sorted by year).
df["year"].value_counts().sort_index().head()


year
2006    1585
2007    1825
2008    1830
2009    1825
2010    1825
Name: count, dtype: int64

## Initialize Logistic Regression Model

Class imbalance is handled using class weighting.


In [25]:
# Baseline settings: "balanced" class weighting addresses the label
# imbalance, and the solver is allowed up to 1000 iterations.
logreg_params = {"max_iter": 1000, "class_weight": "balanced"}
model = LogisticRegression(**logreg_params)

# Fit on the training partition; the fitted estimator is the cell's output.
model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [26]:
# Reload the full table. This replaces the date-sorted `df` built earlier
# with a freshly queried frame.
df = pd.read_sql(
    f"SELECT * FROM {database_name}.{parquet_table}",
    conn
)

# Parse dates again so `df` keeps the same dtypes as after the initial load;
# without this, re-running the split cell would operate on raw strings
# rather than timestamps.
df["date"] = pd.to_datetime(df["date"])

df.head()


  df = pd.read_sql(


Unnamed: 0,station_id,date,tmax,tmin,prcp_lag_1,prcp_roll_7,extreme_precip_tomorrow,month,year
0,USW00012921,2025-01-01,17.2,7.8,0.0,0.042857,0,1,2025
1,USW00012921,2021-01-01,13.9,1.1,13.7,2.757143,0,1,2021
2,USW00012921,2025-01-02,16.7,8.3,0.0,0.0,0,1,2025
3,USW00012921,2025-01-03,18.9,11.1,0.0,0.0,0,1,2025
4,USW00012921,2021-01-02,16.7,1.1,0.0,2.757143,0,1,2021


In [27]:
# Print the shape explicitly: a bare `df.shape` that is not the last
# expression of a cell is evaluated and then discarded.
print("Shape:", df.shape)

# Count of rows per target class across the full dataset.
df["extreme_precip_tomorrow"].value_counts()


extreme_precip_tomorrow
0    34614
1     1830
Name: count, dtype: int64

## Evaluate Model Performance


In [28]:
# Hard class predictions and positive-class probabilities on validation data.
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]

# ROC AUC is computed from the probabilities, not the thresholded labels.
auc = roc_auc_score(y_val, y_proba)
print("ROC AUC:", auc)

# The remaining reports are based on the hard predictions.
conf_mat = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:")
print(conf_mat)

report = classification_report(y_val, y_pred)
print("\nClassification Report:")
print(report)


ROC AUC: 0.7204837105806519

Confusion Matrix:
[[4652 2296]
 [ 108  233]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.67      0.79      6948
           1       0.09      0.68      0.16       341

    accuracy                           0.67      7289
   macro avg       0.53      0.68      0.48      7289
weighted avg       0.94      0.67      0.77      7289



## Save Trained Model to S3


In [33]:
import tarfile

import boto3
import joblib

project_prefix = "ghcn-extreme"

# Serialize the fitted estimator before archiving. Without this step the
# cell fails on a fresh kernel (model.joblib does not exist yet) or silently
# packages a stale file left over from an earlier run.
joblib.dump(model, "model.joblib")

# Recreate model archive with updated inference.py
# NOTE(review): inference.py must already exist in the working directory;
# nothing in this notebook creates it.
with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add("model.joblib")
    tar.add("inference.py")

s3 = boto3.client("s3")

# Upload the archive under the project's models/ prefix in the session bucket.
model_s3_key = f"{project_prefix}/models/model.tar.gz"
s3.upload_file("model.tar.gz", bucket, model_s3_key)

print("Updated model archive uploaded:")
print(f"s3://{bucket}/{model_s3_key}")


Updated model archive uploaded:
s3://sagemaker-us-east-1-083422367993/ghcn-extreme/models/model.tar.gz


## Summary

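The Logistic Regression baseline model has been trained
using a time-aware split and evaluated on the held-out validation period
with ROC AUC, a confusion matrix, and a per-class classification report,
focusing on recall for the extreme-event class.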

The trained model artifact has been saved to S3
for potential registration and deployment.


In [30]:
# List the members of the local model archive using the system tar.
!tar -tzf model.tar.gz


model.joblib
inference.py


In [31]:
# Number of feature columns the estimator recorded during fit.
n_features = model.n_features_in_
print(n_features)


11


In [32]:
# `X` is never defined in this notebook; the training feature matrix is
# `X_train`, whose columns are what the model was fitted on.
print(X_train.columns.tolist())


In [None]:
# Feature count, then the column names captured at fit time.
for attr_name in ("n_features_in_", "feature_names_in_"):
    print(getattr(model, attr_name))


In [None]:
# Re-check how many feature columns the fitted estimator recorded.
fitted_feature_count = model.n_features_in_
print(fitted_feature_count)


In [None]:
# Labelled report of the feature count and names stored on the estimator.
print("Number of training features:", model.n_features_in_)
print("Training feature names:", model.feature_names_in_, sep="\n")


In [34]:
# Confirm the estimator class, then list the column names it was fitted on.
estimator_cls = type(model)
print(estimator_cls)
print(model.feature_names_in_)


<class 'sklearn.linear_model._logistic.LogisticRegression'>
['tmax' 'tmin' 'prcp_lag_1' 'prcp_roll_7' 'month' 'year'
 'station_id_USW00012921' 'station_id_USW00013904'
 'station_id_USW00023174' 'station_id_USW00023293'
 'station_id_USW00094728']


In [35]:
import tarfile

# Read the member names out of the local archive, then print them.
with tarfile.open("model.tar.gz", "r:gz") as archive:
    member_names = archive.getnames()

print(member_names)


['model.joblib', 'inference.py']


In [37]:
import joblib
import tarfile
import boto3

archive_name = "model.tar.gz"
model_file = "model.joblib"

# Persist the fitted estimator so the archive contains the current model.
joblib.dump(model, model_file)

# Bundle the serialized model together with the inference script.
with tarfile.open(archive_name, "w:gz") as bundle:
    for member in (model_file, "inference.py"):
        bundle.add(member)

# Push the archive to the project's models/ prefix in the session bucket.
model_s3_key = f"{project_prefix}/models/model.tar.gz"
s3 = boto3.client("s3")
s3.upload_file(archive_name, bucket, model_s3_key)

print("Reuploaded model archive.")


Reuploaded model archive.


In [38]:
# Remove any existing local archive, then rebuild it with the system tar;
# the -v flag prints each member as it is added.
# NOTE(review): no upload follows this cell, so this rebuilt archive stays local.
!rm -f model.tar.gz
!tar -czvf model.tar.gz model.joblib inference.py


model.joblib
inference.py


In [39]:
# List the rebuilt archive's members to confirm its contents.
!tar -tzf model.tar.gz


model.joblib
inference.py
