**Target Audience:**
This analysis is aimed at urban planners, renewable energy developers, and agricultural stakeholders in Chicago, Illinois. These groups rely on historical weather data to make informed decisions—whether it’s designing storm-resilient infrastructure, optimizing solar energy production, or planning crop cycles and irrigation.

**Purpose and Motivation:**
The goal of this study is to explore historical weather data from 2000 to 2026 for Chicago, focusing on temperature, precipitation, wind, and sunlight patterns. By engineering meaningful features, such as rolling averages, precipitation flags, and interaction terms, the analysis provides insights that help stakeholders anticipate extreme weather, optimize operations, and make data-driven decisions.

In [None]:
import requests
import json 
from pprint import pprint 
import seaborn as sns
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib
import numpy as np

In [None]:
# Download daily historical weather for Chicago, IL from the Open-Meteo
# archive API.
url = "https://archive-api.open-meteo.com/v1/archive"

# Chicago city-centre coordinates. The previous values (52.52, 13.41) point to
# Berlin, which contradicted both the stated study area and the
# "America/Chicago" timezone requested below.
CHICAGO_LATITUDE = 41.8781
CHICAGO_LONGITUDE = -87.6298

params = {
    "latitude": CHICAGO_LATITUDE,
    "longitude": CHICAGO_LONGITUDE,
    "start_date": "2000-01-01",
    "end_date": "2026-02-06",
    "daily": ["weather_code", "temperature_2m_mean", "precipitation_sum", "daylight_duration", "sunshine_duration", "apparent_temperature_mean", "sunrise", "sunset", "rain_sum", "snowfall_sum", "precipitation_hours", "wind_direction_10m_dominant", "relative_humidity_2m_mean", "pressure_msl_mean", "dew_point_2m_mean", "cloud_cover_mean", "surface_pressure_mean", "wind_gusts_10m_mean", "wind_speed_10m_mean"],
    "timezone": "America/Chicago",
}
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(url, params=params, timeout=60)

# Quick diagnostics: status, payload type, and the first bytes of the body.
print(response.status_code)
print(response.headers.get("Content-Type"))
print(response.text[:200])

# Fail here on an HTTP error rather than parsing an error payload as data.
response.raise_for_status()
data = response.json()

In [None]:
# Print each top-level entry of the decoded JSON payload, one per line.
payload = response.json()
for key in payload:
    print(key)

In [None]:
# Build the working table from the "daily" section of the payload.
daily_records = response.json()['daily']
weather_df = pd.DataFrame(daily_records)
weather_df.head()

In [None]:
# (rows, columns) of the daily weather table.
weather_df.shape

In [None]:
# Column dtypes and non-null counts.
weather_df.info()

In [None]:
# Summary statistics for the numeric columns.
weather_df.describe()

In [None]:
# Preview the boolean missing-value mask for the first rows.
weather_df.isna().head()

In [None]:
# Number of missing values in each column.
weather_df.isna().sum()

In [None]:
# Parse the date column once, then derive the calendar parts from it.
weather_df["time"] = pd.to_datetime(weather_df["time"])
timestamps = weather_df["time"].dt
for part in ("year", "month", "day"):
    weather_df[part] = getattr(timestamps, part)

weather_df.head()

In [None]:
# Per-month mean, spread, median and sample size of the daily mean temperature.
weather_df.groupby('month')['temperature_2m_mean'].agg(['mean', 'std', 'median', 'count'])

# Naeliz's Section

This section presents an extensive exploratory data analysis (EDA) of the daily weather data.

In [None]:
# Calendar parts, season labels, and chronological ordering.

weather_df = weather_df.copy()
weather_df["time"] = pd.to_datetime(weather_df["time"])

date_parts = weather_df["time"].dt
weather_df["year"] = date_parts.year
weather_df["month"] = date_parts.month
weather_df["day"] = date_parts.day

# Seasons are assigned by whole months, with December-February as winter.
months_by_season = {
    "Winter": (12, 1, 2),
    "Spring": (3, 4, 5),
    "Summer": (6, 7, 8),
    "Fall": (9, 10, 11),
}
season_map = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}
weather_df["season"] = weather_df["month"].map(season_map)

# Order rows by date and renumber the index from zero.
weather_df = weather_df.sort_values("time").reset_index(drop=True)
weather_df.head()

In [None]:
# Monthly breakdown: averages plus 10th/90th-percentile bands per calendar month.
#
# Percentiles use Series.quantile, which skips missing values just like the
# "mean" aggregations do. np.percentile would instead return NaN for an entire
# month as soon as one day in it is missing. Both use linear interpolation by
# default, so results are unchanged when no values are missing.

monthly = weather_df.groupby("month").agg(
    temp_mean=("temperature_2m_mean", "mean"),
    temp_p10=("temperature_2m_mean", lambda x: x.quantile(0.10)),
    temp_p90=("temperature_2m_mean", lambda x: x.quantile(0.90)),

    precip_mean=("precipitation_sum", "mean"),
    precip_p90=("precipitation_sum", lambda x: x.quantile(0.90)),

    wind_mean=("wind_speed_10m_mean", "mean"),
    wind_p90=("wind_speed_10m_mean", lambda x: x.quantile(0.90)),

    sunshine_mean=("sunshine_duration", "mean"),
    daylight_mean=("daylight_duration", "mean"),

    # Number of daily records contributing to each month.
    n_days=("time", "count"),
).reset_index()

monthly

In [None]:
# Seasonal breakdown: daily means averaged within each season label.

season_metrics = {
    "temp_mean": ("temperature_2m_mean", "mean"),
    "precip_mean": ("precipitation_sum", "mean"),
    "wind_mean": ("wind_speed_10m_mean", "mean"),
    "sunshine_mean": ("sunshine_duration", "mean"),
    "n_days": ("time", "count"),
}
seasonal = weather_df.groupby("season").agg(**season_metrics).reset_index()

seasonal

In [None]:
# Monthly mean temperature with a shaded 10th-90th percentile band.

fig, ax = plt.subplots()
months = monthly["month"]
ax.plot(months, monthly["temp_mean"], marker="o")
ax.fill_between(months, monthly["temp_p10"], monthly["temp_p90"], alpha=0.2)
ax.set_title("Temperature by Month ")
ax.set_xlabel("Month")
ax.set_ylabel("Temp (°C)")
ax.set_xticks(range(1, 13))
plt.show()

In [None]:
# Rain and wind seasonality (weather planning): one line chart per metric.

seasonality_plots = (
    ("precip_mean", "Average Daily Precipitation by Month", "Precip (mm/day)"),
    ("wind_mean", "Average Daily Wind Speed by Month", "Wind speed (10m mean)"),
)
for metric, title, y_label in seasonality_plots:
    plt.figure()
    plt.plot(monthly["month"], monthly[metric], marker="o")
    plt.title(title)
    plt.xlabel("Month")
    plt.ylabel(y_label)
    plt.xticks(range(1, 13))
    plt.show()

In [None]:
# Sunshine and daylight: monthly averages, each on its own figure.

month_ticks = range(1, 13)

fig, ax = plt.subplots()
ax.plot(monthly["month"], monthly["sunshine_mean"], marker="o")
ax.set_title("Average Sunshine Duration by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Sunshine duration (seconds)")
ax.set_xticks(month_ticks)
plt.show()

fig, ax = plt.subplots()
ax.plot(monthly["month"], monthly["daylight_mean"], marker="o")
ax.set_title("Average Daylight Duration by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Daylight duration (seconds)")
ax.set_xticks(month_ticks)
plt.show()

Correlation and Heat Map plots

In [None]:
# Pairwise correlation between selected weather variables, drawn as an
# annotated heatmap.

heat_cols = [
    "temperature_2m_mean",
    "apparent_temperature_mean",
    "rain_sum",
    "snowfall_sum",
    "cloud_cover_mean",
    "relative_humidity_2m_mean",
    "pressure_msl_mean",
    "wind_speed_10m_mean",
    "wind_gusts_10m_mean",
    "sunshine_duration",
]

heat_frame = weather_df[heat_cols]
corr = heat_frame.corr()

fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(corr, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap (Weather Variables)")
plt.show()

In [None]:
# Year x month grids of monthly means, with a separate heatmap for
# temperature, rain and sunshine.

by_year_month = weather_df.groupby(["year", "month"])
pivotTemp = by_year_month["temperature_2m_mean"].mean().unstack()
pivotRain = by_year_month["rain_sum"].mean().unstack()
pivotSun = by_year_month["sunshine_duration"].mean().unstack()

# Each entry pairs a year x month grid with its figure title.
heatmap_specs = (
    (pivotTemp, "Year x Month Heatmap: Monthly Mean Temperature (°C)"),
    (pivotRain, "Year x Month Heatmap: Monthly Mean Rain (rain_sum)"),
    (pivotSun, "Year x Month Heatmap: Monthly Mean Sunshine Duration"),
)
for grid, title in heatmap_specs:
    plt.figure(figsize=(10, 6))
    sns.heatmap(grid)
    plt.title(title)
    plt.xlabel("Month")
    plt.ylabel("Year")
    plt.show()

Pair plot and histogram of weather variables

In [None]:
# Pair plot of key variables on a reproducible random sample of complete rows.
pair_cols = [
    "temperature_2m_mean",
    "apparent_temperature_mean",
    "rain_sum",
    "cloud_cover_mean",
    "wind_speed_10m_mean",
    "sunshine_duration",
]

# Drop incomplete rows first, then cap the sample size at what remains.
# Capping at weather_df.shape[0] (the row count before dropna) makes sample()
# raise ValueError whenever fewer complete rows are left than requested.
pair_complete = weather_df[pair_cols].dropna()
pair_sample = pair_complete.sample(
    n=min(2000, len(pair_complete)),
    random_state=0,  # fixed seed so the plot is reproducible
)

sns.pairplot(pair_sample)
plt.show()

In [None]:
# One 40-bin histogram per variable, ignoring missing values.
hist_cols = [
    "temperature_2m_mean",
    "apparent_temperature_mean",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "wind_speed_10m_mean",
    "wind_gusts_10m_mean",
    "cloud_cover_mean",
    "relative_humidity_2m_mean",
    "sunshine_duration",
    "daylight_duration",
]

for col in hist_cols:
    observed = weather_df[col].dropna()
    fig, ax = plt.subplots()
    ax.hist(observed, bins=40)
    ax.set_title(f"Histogram: {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()

# Feature exploration and feature engineering

In this section, we will transform raw data into features that capture patterns better for analysis or modeling.


**Time based features**

Converts time column to datetime.

Extracts: year, month, day, day of week, ISO week number.

Useful for seasonal and weekly patterns, and for ML models that need numeric features.

In [None]:
# Calendar features derived from the parsed date column.
weather_df['time'] = pd.to_datetime(weather_df['time'])

when = weather_df['time'].dt
weather_df['year'] = when.year
weather_df['month'] = when.month
weather_df['day'] = when.day
weather_df['day_of_week'] = when.dayofweek
weather_df['week_of_year'] = when.isocalendar().week

**Rolling Temperature Statistics**

rolling(7) and rolling(30) compute weekly and monthly moving averages.

temp_7day_std measures short-term variability.

Smooths daily fluctuations and captures trends and volatility in temperature.

In [None]:
# Rolling mean/std of the daily mean temperature over 7- and 30-row windows.
# The first rows of each window are NaN until the window is full.
daily_temp = weather_df['temperature_2m_mean']
weekly_window = daily_temp.rolling(7)
monthly_window = daily_temp.rolling(30)

weather_df['temp_7day_avg'] = weekly_window.mean()
weather_df['temp_30day_avg'] = monthly_window.mean()
weather_df['temp_7day_std'] = weekly_window.std()
weather_df

**Precipitation Indicators**

is_rainy / is_snowy → binary flags for precipitation.

heavy_rain → flags extreme precipitation events, using the 95th percentile threshold.

Helps in identifying significant weather events for analysis or modeling.

In [None]:
# Rain indicator: 1 when any rain was recorded that day, else 0.
had_rain = weather_df['rain_sum'].gt(0)
weather_df['is_rainy'] = had_rain.astype(int)

In [None]:
# Snowfall indicator: 1 when any snowfall was recorded that day, else 0.
had_snow = weather_df['snowfall_sum'].gt(0)
weather_df['is_snowy'] = had_snow.astype(int)

In [None]:
# Heavy precipitation: flag days above the 95th percentile of daily totals.
daily_precip = weather_df['precipitation_sum']
heavy_rain_threshold = daily_precip.quantile(0.95)

exceeds_threshold = daily_precip.gt(heavy_rain_threshold)
weather_df['heavy_rain'] = exceeds_threshold.astype(int)

In [None]:
# Show the table so the indicator columns can be inspected.
weather_df

**Sunshine and wind features**

sunshine_ratio → fraction of daylight that was sunny (0–1). Normalizes sunshine across seasons.

gust_ratio → indicates wind gustiness relative to average wind speed.

Useful for weather comfort indices, energy models, or hazard analysis.

In [None]:
# Sunshine ratio: sunshine duration divided by daylight duration.
sunshine = weather_df['sunshine_duration']
daylight = weather_df['daylight_duration']
weather_df['sunshine_ratio'] = sunshine.div(daylight)

In [None]:
# Wind intensity ratio: mean gust speed divided by mean wind speed.
gusts = weather_df['wind_gusts_10m_mean']
mean_wind = weather_df['wind_speed_10m_mean']
weather_df['gust_ratio'] = gusts.div(mean_wind)

In [None]:
# Show the table so the ratio columns can be inspected.
weather_df

**Seasonal features**

Maps months to seasons: winter, spring, summer, fall.

In [None]:
def get_season(month):
    """Return the lowercase season name for a month number.

    Months 12, 1 and 2 map to 'winter', 3-5 to 'spring' and 6-8 to 'summer'.
    Any other value, including 9-11, falls through to 'fall'.
    """
    season_months = (
        ('winter', (12, 1, 2)),
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'fall'

# Label every row with its season; this replaces any existing 'season' column.
month_numbers = weather_df['month']
weather_df['season'] = month_numbers.map(get_season)

weather_df

**One Hot Encoding**

One-hot encodes the seasons for machine learning, dropping one category (drop_first=True) to avoid multicollinearity.

In [None]:
# One-hot encode 'season'; drop_first=True omits the first category's column.
categorical_cols = ['season']
weather_df = pd.get_dummies(weather_df, columns=categorical_cols, drop_first=True)
weather_df

**Outlier Detection**

Uses Interquartile Range (IQR) method to flag unusually low or high temperatures.

Helps identify extreme events or potential data errors.

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR outside
# the quartiles.
col = 'temperature_2m_mean'

Q1, Q3 = weather_df[col].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep only the rows that fall outside either fence.
too_low = weather_df[col].lt(lower_fence)
too_high = weather_df[col].gt(upper_fence)
outliers_iqr = weather_df[too_low | too_high]

print(f"Number of outliers in {col}: {outliers_iqr.shape[0]}")


Plots a boxplot of temperature, highlighting median, IQR, whiskers, and outliers.

Quick visual check for extreme values and distribution shape.

In [None]:
# Horizontal boxplot of the column selected in `col`.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x=weather_df[col], ax=ax)
ax.set_title(f"Boxplot of {col}")
plt.show()