# ClimateScope — Semi-Real-Time Weather Analysis (Milestone 1)


## Setup

In [1]:
import os, json, time, subprocess
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path
# Root of the on-disk data layout; the raw/clean/tmp stage folders live under it.
DATA = Path('data')
# Create every stage folder up front (a no-op for folders that already exist).
for sub in ('raw', 'clean', 'tmp'):
    DATA.joinpath(sub).mkdir(parents=True, exist_ok=True)
print('Working dir:', os.getcwd())

Working dir: g:\project_6.0


## Kaggle Credentials

In [2]:
import json
from pathlib import Path
# Resolve Kaggle API credentials and persist them to ~/.kaggle/kaggle.json.
# Resolution order: the KAGGLE_USERNAME / KAGGLE_KEY environment variables
# first, then the downloaded token file named below.
# NOTE(review): the fallback path is machine-specific — adjust per workstation.
fallback_creds = r'c:\Users\Dell\Downloads\kaggle (1).json'
user = os.environ.get('KAGGLE_USERNAME')
key = os.environ.get('KAGGLE_KEY')
if not (user and key):
    try:
        # Keep the guarded region to the read/parse; the file is closed
        # before the values are picked out.
        with open(fallback_creds, encoding='utf-8') as f:
            creds = json.load(f)
        user = creds.get('username')
        key = creds.get('key')
    except (OSError, ValueError, AttributeError) as e:
        # OSError: file missing/unreadable; ValueError: invalid JSON;
        # AttributeError: JSON parsed but is not an object.
        print('Could not read kaggle (1).json:', e)
if user and key:
    kaggle_json = {'username': user, 'key': key}
    kaggle_path = Path.home() / '.kaggle'
    kaggle_path.mkdir(exist_ok=True)
    token_file = kaggle_path / 'kaggle.json'
    # Create the file owner-only from the start so the API key is never
    # written with default (possibly wider) permissions.
    fd = os.open(token_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w', encoding='utf-8') as f:
        json.dump(kaggle_json, f)
    try:
        # The mode passed to os.open only applies to newly created files;
        # tighten a pre-existing file as well.
        os.chmod(token_file, 0o600)
    except OSError as e:
        # Best effort: report instead of silently ignoring the failure.
        print('Warning: could not restrict kaggle.json permissions:', e)
print('Kaggle credentials:', 'OK' if user and key else 'Missing')

Kaggle credentials: OK


In [3]:
# Kaggle dataset identifier plus the per-stage folders under DATA
KAGGLE_SLUG = 'nelgiriyewithana/global-weather-repository'
RAW, CLEAN, TMP = (DATA / stage for stage in ('raw', 'clean', 'tmp'))
print('Config: ' + KAGGLE_SLUG)

Config: nelgiriyewithana/global-weather-repository


## Locate Source CSV

In [5]:
# Pick the most recently modified CSV anywhere under data/raw; when there is
# none, write a small synthetic sample (3 countries x 7 days) and use that.
csvs = list(RAW.glob('**/*.csv'))
if csvs:
    csv_path = max(csvs, key=lambda candidate: candidate.stat().st_mtime)
else:
    sample_header = 'date,country,temperature_c,humidity,precipitation_mm,wind_speed_kmh\n'
    sample_rows = [
        f'2025-08-{day:02d},Country{idx},{20+idx+day},{60+day},{5+day},{10+day}'
        for idx in range(1, 4)
        for day in range(1, 8)
    ]
    sample = RAW / 'sample.csv'
    sample.write_text(sample_header + '\n'.join(sample_rows))
    csv_path = sample
print('CSV path:', csv_path)

CSV path: data\raw\sample.csv


## Verify csv_path

In [6]:
# Sanity report: echo the chosen CSV and whether it is present on disk
status_lines = (
    f'✓ csv_path defined: {csv_path}',
    f'✓ File exists: {csv_path.exists()}',
    '✓ Milestone 1 Part 1 complete - ready for automation logic',
)
for line in status_lines:
    print(line)

✓ csv_path defined: data\raw\sample.csv
✓ File exists: True
✓ Milestone 1 Part 1 complete - ready for automation logic


## Automation

In [7]:
# Refresh-trigger state. Only the manual override is known at this point;
# the last_updated_epoch check happens after the data has been loaded.
epoch_file = TMP / 'last_epoch.txt'
force_refresh = False  # flip to True to force a refresh
need_refresh = force_refresh
if force_refresh:
    trigger_mode = 'force'
else:
    trigger_mode = 'epoch-based'
print('Automation setup:', trigger_mode)

Automation setup: epoch-based


## Download

In [8]:
# Conditional download: contact Kaggle only when need_refresh is True.
# --unzip makes the CLI extract the downloaded archive so the CSV files
# actually land in RAW; the CSV lookup only matches *.csv, so a bare zip
# would never be picked up.
if need_refresh:
    try:
        subprocess.run(
            ['kaggle', 'datasets', 'download', '-d', KAGGLE_SLUG,
             '-p', str(RAW), '--unzip', '--force'],
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the CLI exited non-zero.
        # OSError: the CLI is missing or could not be started.
        print(f'Skip: Download failed ({e}), using existing data')
    else:
        print('✓ Fresh data downloaded from Kaggle')
        # csv_path was chosen before this download ran; re-select so the
        # freshly extracted file (newest mtime) is the one that gets loaded.
        fresh_csvs = list(RAW.glob('**/*.csv'))
        if fresh_csvs:
            csv_path = max(fresh_csvs, key=lambda p: p.stat().st_mtime)
            print('CSV path:', csv_path)
else:
    print('Skip: No refresh needed')

Skip: No refresh needed


## Load Data

In [9]:
# Load the selected CSV and evaluate the row-based refresh trigger.
df = pd.read_csv(csv_path)
# The trigger only applies when the data carries a last_updated_epoch column.
epoch_col = 'last_updated_epoch' if 'last_updated_epoch' in df.columns else None
if epoch_col and not force_refresh:
    latest_epoch = str(df[epoch_col].max())
    # An absent marker file counts as "nothing seen yet".
    prev_epoch = epoch_file.read_text().strip() if epoch_file.exists() else ''
    if latest_epoch != prev_epoch:
        # The newest epoch differs from the recorded one: raise the refresh
        # flag (previously the comparison result was discarded) and remember
        # the new epoch for the next run.
        need_refresh = True
        epoch_file.write_text(latest_epoch)
print(f'Data loaded: {df.shape}, need_refresh: {need_refresh}')

Data loaded: (21, 6), need_refresh: False


## Column Mapping

In [10]:
# Identify key columns.
# Date column: prefer 'date', then 'last_updated', and only then fall back to
# the first column, which would otherwise be parsed as dates regardless of
# what it holds.
# NOTE(review): 'last_updated' is assumed to be the timestamp column that
# accompanies last_updated_epoch in the Kaggle data — confirm against the CSV.
date_col = next(
    (c for c in ('date', 'last_updated') if c in df.columns),
    df.columns[0],
)
# Region column: 'country' when present, otherwise the second column.
region_col = 'country' if 'country' in df.columns else df.columns[1]
# Every numeric-dtype column is treated as a measurement to aggregate later.
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
print(f'Mapped - Date: {date_col}, Region: {region_col}, Numeric: {len(num_cols)} cols')

Mapped - Date: date, Region: country, Numeric: 4 cols


In [11]:
# Null audit over the mapped key columns (skipped when no rows were loaded)
if len(df) == 0:
    print('Skip: No data loaded')
else:
    key_cols = [date_col, region_col, *num_cols]
    na_summary = df[key_cols].isna().sum()
    # Report only the columns that actually contain missing values.
    has_missing = na_summary > 0
    print('NA counts:', dict(na_summary[has_missing]))

NA counts: {}


## Date Casting & Light Handling

In [12]:
# Parse the date column (unparseable values become NaT), then drop rows that
# lack either a date or a region.
parsed_dates = pd.to_datetime(df[date_col], errors='coerce')
df[date_col] = parsed_dates
required_cols = [date_col, region_col]
df = df.dropna(subset=required_cols)
# Light numeric handling: keep humidity within the 0-100 range.
if 'humidity' in num_cols:
    df['humidity'] = df['humidity'].clip(lower=0, upper=100)
print(f'Clean shape: {df.shape}')

Clean shape: (21, 6)


## Aggregation

In [13]:
# Monthly means per region. Each date is snapped to the first day of its
# month and used as a grouping key next to the region column.
month = df[date_col].dt.to_period('M').dt.to_timestamp()
grouped = df.groupby([month, region_col])[num_cols]
monthly = grouped.mean(numeric_only=True).reset_index()
# After reset_index the leading column holds the month key; give it a
# fixed name.
first_col = monthly.columns[0]
monthly = monthly.rename(columns={first_col: 'month'})
print(f'Monthly aggregation: {monthly.shape}, columns: {list(monthly.columns)}')

Monthly aggregation: (3, 6), columns: ['month', 'country', 'temperature_c', 'humidity', 'precipitation_mm', 'wind_speed_kmh']


## Save to Partitions

In [14]:
# Save Parquet partitions by year/month: one clean.parquet per
# <CLEAN>/<YYYY>/<MM>/ directory.
# The helper columns added here stay on df and are written into each partition.
df['year'] = df[date_col].dt.year
df['month_num'] = df[date_col].dt.month
for (y, m), group in df.groupby(['year', 'month_num']):
    outdir = CLEAN / f'{y}/{m:02d}'
    outdir.mkdir(parents=True, exist_ok=True)
    main_file = outdir / 'clean.parquet'
    # Existing partitions are kept unless a refresh is flagged; without the
    # need_refresh check, refreshed data could never replace a stale file.
    if need_refresh or not main_file.exists():
        group.to_parquet(main_file)
print(f'✓ Partitioned data saved to {CLEAN}')

✓ Partitioned data saved to data\clean
