Weather Data Analysis
 
This notebook fetches weather data from a PostgreSQL database, processes it, calculates yearly statistics for each station, and upserts the results into the `yearly_weather_stats` table.
# Imports
We'll start by importing the necessary libraries.

In [32]:
import os
from urllib.parse import quote

import pandas as pd
import psycopg2
from dotenv import load_dotenv

# Load Environment Variables
We'll load database connection details from a `.env` file to keep sensitive information secure.
Load environment variables from .env file

In [33]:
# Read the .env file so the DB_* settings become visible through os.environ.
load_dotenv()

# Fail fast on missing settings: interpolating an unset variable would embed the
# literal text "None" in the URL and only surface later as a confusing
# connection error.
REQUIRED_DB_VARS = ("DB_USER", "DB_HOST", "DB_PORT", "DB_NAME")
missing_db_vars = [name for name in REQUIRED_DB_VARS if os.getenv(name) is None]
if missing_db_vars:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(missing_db_vars)
    )

# Percent-encode the free-form parts so characters such as '@', ':' or '/' in a
# user name, password or database name cannot corrupt the URI structure.
db_credentials = quote(os.environ["DB_USER"], safe="")
db_password = os.getenv("DB_PASSWORD")
if db_password is not None:
    # DB_PASSWORD is optional; when it is unset the password part is omitted
    # instead of sending the string "None".
    db_credentials += ":" + quote(db_password, safe="")

# Construct the database connection string from environment variables
DATABASE_URL = (
    f"postgresql://{db_credentials}"
    f"@{os.environ['DB_HOST']}:{os.environ['DB_PORT']}"
    f"/{quote(os.environ['DB_NAME'], safe='')}"
)

# Connect to the Database
We will establish a connection to the PostgreSQL database.
Connect to the database

In [34]:
# Open one PostgreSQL session from the assembled connection string, plus the
# cursor that the query cells below share.
conn = psycopg2.connect(dsn=DATABASE_URL)
cur = conn.cursor()

In [35]:
# Preview a handful of raw rows as a sanity check before the real query.
# A full, unfiltered fetch is deliberately avoided here: its result would be
# replaced by the filtered fetch in the following cell, so loading the entire
# table would only cost time and memory.
cur.execute("SELECT * FROM weather_data LIMIT 5")
weather_preview = cur.fetchall()
weather_preview

In [36]:
# Load the readings used for the statistics. Columns are listed explicitly, in
# the order the DataFrame cell expects, and any row with a NULL in one of the
# three measurements is excluded by the query itself.
COMPLETE_READINGS_SQL = """ 
    SELECT station_id, date, max_temp, min_temp, precipitation
    FROM weather_data
    WHERE max_temp IS NOT NULL 
      AND min_temp IS NOT NULL 
      AND precipitation IS NOT NULL
"""
cur.execute(COMPLETE_READINGS_SQL)
weather_data = cur.fetchall()

In [37]:
# Label the fetched tuples' fields in SELECT order, then order the rows by date.
# Sorting keeps each row's original index label (the index is not reset).
WEATHER_COLUMNS = ['station_id', 'date', 'max_temp', 'min_temp', 'precipitation']
weather_unsorted = pd.DataFrame(weather_data, columns=WEATHER_COLUMNS)
weather_df = weather_unsorted.sort_values(by='date')

In [38]:
# Parse the date column once, store the parsed values back, and derive the
# calendar year that the yearly grouping keys on.
parsed_dates = pd.to_datetime(weather_df['date'])
weather_df['date'] = parsed_dates
weather_df['year'] = parsed_dates.dt.year

In [39]:
# Rescale precipitation by a factor of ten.
# NOTE(review): the stored units are not visible in this notebook — presumably
# the table holds tenths of a unit; confirm, and confirm whether max_temp /
# min_temp need the same scaling.
#
# The scaled values are derived from the raw fetched rows rather than from
# `weather_df['precipitation']` itself, so re-running this cell does not divide
# by ten a second time.
PRECIPITATION_FIELD = 4  # position of `precipitation` in the SELECT list
raw_precipitation = pd.Series([row[PRECIPITATION_FIELD] for row in weather_data])

# Column assignment aligns on index labels; the sorted frame still carries the
# original row labels, so every row receives its own reading.
weather_df['precipitation'] = raw_precipitation / 10.0

In [40]:
# Summarise per (station_id, year): mean of max_temp, mean of min_temp and the
# sum of precipitation. The group keys are turned back into ordinary columns.
station_year_groups = weather_df.groupby(['station_id', 'year'])
yearly_stats_df = station_year_groups.agg(
    avg_max_temp=pd.NamedAgg(column='max_temp', aggfunc='mean'),
    avg_min_temp=pd.NamedAgg(column='min_temp', aggfunc='mean'),
    total_precipitation=pd.NamedAgg(column='precipitation', aggfunc='sum'),
).reset_index()

In [41]:
# Finally, upsert the yearly statistics into `yearly_weather_stats`: a new
# (station_id, year) pair is inserted, an existing pair has its three
# statistics overwritten.
UPSERT_YEARLY_STATS_SQL = """
    INSERT INTO yearly_weather_stats (station_id, year, avg_max_temp, avg_min_temp, total_precipitation)
    VALUES (%s, %s, %s, %s, %s)
    ON CONFLICT (station_id, year) DO UPDATE
    SET avg_max_temp = EXCLUDED.avg_max_temp,
        avg_min_temp = EXCLUDED.avg_min_temp,
        total_precipitation = EXCLUDED.total_precipitation;
"""

# Build the parameter tuples up front with plain Python int/float values.
# itertuples keeps each column's own dtype (iterrows coerces a row to a single
# dtype, which can turn `year` into a float), and the explicit casts mean the
# driver never has to adapt NumPy scalar types.
stats_params = [
    (
        row.station_id,
        int(row.year),
        float(row.avg_max_temp),
        float(row.avg_min_temp),
        float(row.total_precipitation),
    )
    for row in yearly_stats_df.itertuples(index=False)
]

try:
    cur.executemany(UPSERT_YEARLY_STATS_SQL, stats_params)
    # Commit only after every row was accepted, so the table is never left
    # half-updated.
    conn.commit()
except Exception:
    # Undo the partial write before re-raising the original error.
    conn.rollback()
    raise
finally:
    # Release the cursor and connection whether or not the write succeeded.
    cur.close()
    conn.close()