# ClimateScope — Milestone 1


## Setup

In [1]:
import os, json, time, subprocess
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path
# Root folder for every on-disk artifact this notebook reads or writes.
DATA = Path('data')
# Create one folder per pipeline stage up front so later cells can write without checking.
for stage in ('raw', 'clean', 'tmp'):
    stage_dir = DATA / stage
    stage_dir.mkdir(parents=True, exist_ok=True)
print('Working dir:', os.getcwd())

Working dir: g:\project_6.0


## Kaggle Credentials

In [2]:
import json
from pathlib import Path
# Resolve Kaggle API credentials and install them as ~/.kaggle/kaggle.json.
# Precedence: KAGGLE_USERNAME / KAGGLE_KEY environment variables, then a JSON fallback file.
user = os.environ.get('KAGGLE_USERNAME')
key = os.environ.get('KAGGLE_KEY')
if not (user and key):
    # The default fallback location is machine-specific; KAGGLE_CREDENTIALS_FILE lets
    # another machine point at its own file without editing the notebook.
    fallback_json = os.environ.get(
        'KAGGLE_CREDENTIALS_FILE', r'c:\Users\Dell\Downloads\kaggle (1).json'
    )
    try:
        with open(fallback_json) as f:
            creds = json.load(f)
        user = creds.get('username')
        key = creds.get('key')
    except Exception as e:
        # Best effort: a missing or malformed file simply leaves the credentials unset.
        print(f'Could not read {fallback_json}:', e)
if user and key:
    kaggle_json = {'username': user, 'key': key}
    kaggle_path = Path.home() / '.kaggle'
    kaggle_path.mkdir(parents=True, exist_ok=True)
    kaggle_file = kaggle_path / 'kaggle.json'
    # Create the file with owner-only permissions from the start, so the API key is not
    # readable by other users in the window between writing it and the chmod below.
    fd = os.open(kaggle_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w') as f:
        json.dump(kaggle_json, f)
    try:
        # Still required when the file already existed with looser permissions,
        # because the mode passed to os.open only applies to newly created files.
        os.chmod(kaggle_file, 0o600)
    except OSError as e:
        # Report instead of silently ignoring; the credentials themselves were written.
        print('Warning: could not restrict permissions on kaggle.json:', e)
print('Kaggle credentials:', 'OK' if user and key else 'Missing')

Kaggle credentials: OK


In [3]:
# Kaggle dataset identifier, plus one folder handle per pipeline stage under DATA.
KAGGLE_SLUG = 'nelgiriyewithana/global-weather-repository'
RAW, CLEAN, TMP = (DATA / stage for stage in ('raw', 'clean', 'tmp'))
print(f'Config: {KAGGLE_SLUG}')

Config: nelgiriyewithana/global-weather-repository


## Config

In [5]:
# Choose the CSV to load: the most recently modified file anywhere under data/raw,
# or a small synthetic sample written on the spot when no CSV is present.
csvs = list(RAW.glob('**/*.csv'))
if csvs:
    csv_path = max(csvs, key=lambda candidate: candidate.stat().st_mtime)
else:
    # 3 synthetic countries x 7 days; every value is derived from the country/day index.
    header = 'date,country,temperature_c,humidity,precipitation_mm,wind_speed_kmh'
    rows = []
    for c in range(1, 4):
        for d in range(1, 8):
            rows.append(f'2025-08-{d:02d},Country{c},{20+c+d},{60+d},{5+d},{10+d}')
    sample = RAW / 'sample.csv'
    sample.write_text('\n'.join([header] + rows))
    csv_path = sample
print('CSV path:', csv_path)

CSV path: data\raw\sample.csv


## Locate csv_path

In [6]:
# Checkpoint before the automation steps: report which CSV was selected and whether
# it is actually present on disk.
# The check-mark characters were mis-encoded (UTF-8 bytes shown as Mac Roman); restored here.
print(f'✓ csv_path defined: {csv_path}')
print(f'✓ File exists: {csv_path.exists()}')
print('✓ Milestone 1 Part 1 complete - ready for automation logic')

✓ csv_path defined: data\raw\sample.csv
✓ File exists: True
✓ Milestone 1 Part 1 complete - ready for automation logic


## Automation

In [7]:
# Refresh-trigger configuration (row-based, keyed on last_updated_epoch).
force_refresh = False  # flip to True to force a re-download
# The most recent epoch value seen is persisted in this file between runs.
epoch_file = TMP / 'last_epoch.txt'
# Starts out mirroring the manual override.
# NOTE(review): in the cells visible here this is the only assignment to need_refresh,
# so the epoch comparison done after loading never switches it on — confirm intent.
need_refresh = force_refresh
if force_refresh:
    print('Automation setup:', 'force')
else:
    print('Automation setup:', 'epoch-based')

Automation setup: epoch-based


## Download

In [8]:
# Conditional download: only contact Kaggle when a refresh was requested.
if need_refresh:
    try:
        # --unzip extracts the downloaded archive into RAW so the '**/*.csv' lookup can
        # find the data; without it only the archive itself lands in the folder.
        subprocess.run(
            ['kaggle', 'datasets', 'download', '-d', KAGGLE_SLUG, '-p', str(RAW), '--unzip', '--force'],
            check=True,
        )
    except Exception as e:
        # Best effort: a missing CLI, bad credentials, or no network falls back to local data.
        print(f'Skip: Download failed ({e}), using existing data')
    else:
        print('✓ Fresh data downloaded from Kaggle')
        # csv_path was resolved before this cell ran; re-point it at the newest CSV so
        # the load step reads what was just fetched rather than the earlier file.
        downloaded = list(RAW.glob('**/*.csv'))
        if downloaded:
            csv_path = max(downloaded, key=lambda p: p.stat().st_mtime)
            print('CSV path:', csv_path)
else:
    print('Skip: No refresh needed')

Skip: No refresh needed


## Load Data

In [9]:
# Load the selected CSV, then record the newest last_updated_epoch value on disk.
df = pd.read_csv(csv_path)

# The epoch bookkeeping only applies when the data carries a last_updated_epoch column.
epoch_col = None
if 'last_updated_epoch' in df.columns:
    epoch_col = 'last_updated_epoch'

if epoch_col is not None and not force_refresh:
    latest_epoch = str(df[epoch_col].max())
    if epoch_file.exists():
        prev_epoch = epoch_file.read_text().strip()
    else:
        prev_epoch = ''
    # Only touch the marker file when the newest epoch actually moved.
    # NOTE(review): need_refresh is reported below but not updated by this comparison.
    if prev_epoch != latest_epoch:
        epoch_file.write_text(latest_epoch)

print(f'Data loaded: {df.shape}, need_refresh: {need_refresh}')

Data loaded: (21, 6), need_refresh: False


## Column Mapping

In [10]:
# Identify the key columns used by every later cell.
# Date column: prefer an explicit 'date', then 'last_updated', and only then fall back
# to the first column. Falling straight back to column 0 would pick a non-date column
# whenever the file has no 'date' header; the later date cast would then coerce every
# value to NaT and the cleaning step would drop all rows.
# NOTE(review): 'last_updated' is assumed to be the timestamp column that accompanies
# 'last_updated_epoch' in the Kaggle export — confirm against the downloaded CSV.
date_candidates = ('date', 'last_updated')
date_col = next((c for c in date_candidates if c in df.columns), df.columns[0])
# Region column: 'country' when present, otherwise the second column.
region_col = 'country' if 'country' in df.columns else df.columns[1]
# Every column pandas reports as numeric is treated as a measurement.
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
print(f'Mapped - Date: {date_col}, Region: {region_col}, Numeric: {len(num_cols)} cols')

Mapped - Date: date, Region: country, Numeric: 4 cols


In [None]:
# Count missing values in the key columns and report only the columns that have any.
if len(df) == 0:
    print('Skip: No data loaded')
else:
    key_cols = [date_col, region_col, *num_cols]
    na_summary = df[key_cols].isna().sum()
    with_gaps = na_summary[na_summary > 0]
    print('NA counts:', dict(with_gaps))

NA counts: {}


## Date Casting & Light Handling

In [12]:
# Date conversion and basic cleaning.
# errors='coerce' turns unparseable dates into NaT so the dropna below removes them.
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
# Drop rows lacking a date or region. The explicit .copy() makes the result an
# independent frame: this cell and later ones assign new columns to df, and doing that
# on a row-filtered result can trigger pandas' chained-assignment warning.
df = df.dropna(subset=[date_col, region_col]).copy()
# Light numeric handling: keep humidity within the 0-100 range.
if 'humidity' in num_cols:
    df['humidity'] = df['humidity'].clip(0, 100)
print(f'Clean shape: {df.shape}')

Clean shape: (21, 6)


## Aggregation

In [13]:
# Monthly means per region, with the time key exposed as a column named 'month'.
# Each date is snapped to the first instant of its month to form the grouping key.
month = df[date_col].dt.to_period('M').dt.to_timestamp()
grouped = df.groupby([month, region_col])[num_cols]
monthly = grouped.mean(numeric_only=True).reset_index()
# The grouping key comes back under the date column's name; normalize it to 'month'.
first_col = monthly.columns[0]
monthly = monthly.rename(columns={first_col: 'month'})
summary = f'Monthly aggregation: {monthly.shape}, columns: {list(monthly.columns)}'
print(summary)

Monthly aggregation: (3, 6), columns: ['month', 'country', 'temperature_c', 'humidity', 'precipitation_mm', 'wind_speed_kmh']


## Save to Partitions

In [14]:
# Save the cleaned rows as Parquet, one clean.parquet per year/month partition
# (data/clean/<year>/<month>/clean.parquet). All regions share the partition file.
df['year'] = df[date_col].dt.year
df['month_num'] = df[date_col].dt.month
for (y, m), group in df.groupby(['year', 'month_num']):
    outdir = CLEAN / f'{y}/{m:02d}'
    outdir.mkdir(parents=True, exist_ok=True)
    main_file = outdir / 'clean.parquet'
    # Always rewrite the partition. Skipping files that already exist left earlier output
    # in place after the source data changed, while the cell still reported a save.
    group.to_parquet(main_file)
print(f'✓ Partitioned data saved to {CLEAN}')

✓ Partitioned data saved to data\clean


# Milestone 2: Core Analysis & Visualization Design

## Setup Visualization Libraries

In [16]:
# Import visualization libraries
# Plotly Express (px) is the plotting interface the analysis cells below call.
# NOTE(review): within the cells visible here, go, make_subplots, sns and plt are
# imported but not referenced — presumably kept for later milestones; confirm.
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
# Reaching this line means every import above succeeded.
print('Visualization libraries loaded successfully')

Visualization libraries loaded successfully


## Statistical Analysis

In [17]:
# Headline facts about the cleaned frame, followed by describe() for the numeric columns.
dates = df[date_col]
overview = [
    'Dataset Overview:',
    f'Shape: {df.shape}',
    f'Date range: {dates.min()} to {dates.max()}',
    f'Regions: {df[region_col].nunique()}',
    '\nNumeric Variables Summary:',
]
for line in overview:
    print(line)
print(df[num_cols].describe())

Dataset Overview:
Shape: (21, 8)
Date range: 2025-08-01 00:00:00 to 2025-08-07 00:00:00
Regions: 3

Numeric Variables Summary:
       temperature_c  humidity  precipitation_mm  wind_speed_kmh
count      21.000000  21.00000          21.00000        21.00000
mean       26.000000  64.00000           9.00000        14.00000
std         2.213594   2.04939           2.04939         2.04939
min        22.000000  61.00000           6.00000        11.00000
25%        24.000000  62.00000           7.00000        12.00000
50%        26.000000  64.00000           9.00000        14.00000
75%        28.000000  66.00000          11.00000        16.00000
max        30.000000  67.00000          12.00000        17.00000


### Distributions

In [18]:
# One histogram per numeric column (first four only), each followed by its mean and
# standard deviation as a text line.
for col in num_cols[:4]:
    values = df[col]
    fig = px.histogram(df, x=col, nbins=30, title=f'Distribution of {col}')
    fig.show()
    col_mean, col_std = values.mean(), values.std()
    print(f'{col}: Mean={col_mean:.2f}, Std={col_std:.2f}')

temperature_c: Mean=26.00, Std=2.21


humidity: Mean=64.00, Std=2.05


precipitation_mm: Mean=9.00, Std=2.05


wind_speed_kmh: Mean=14.00, Std=2.05


### Correlations

In [19]:
# Pairwise correlations of the numeric columns: heatmap first, then a text list of the
# pairs whose absolute correlation exceeds 0.7.
corr_matrix = df[num_cols].corr()
fig = px.imshow(corr_matrix, text_auto=True, title='Correlation Heatmap')
fig.show()
print('Strong correlations (>0.7):')
n_vars = len(corr_matrix)
# Upper triangle only, so each unordered pair is visited once and the diagonal is skipped.
pairs = [(i, j) for i in range(n_vars) for j in range(i + 1, n_vars)]
for i, j in pairs:
    strength = corr_matrix.iloc[i, j]
    if abs(strength) > 0.7:
        print(f'{corr_matrix.index[i]} - {corr_matrix.columns[j]}: {strength:.3f}')

Strong correlations (>0.7):
temperature_c - humidity: 0.926
temperature_c - precipitation_mm: 0.926
temperature_c - wind_speed_kmh: 0.926
humidity - precipitation_mm: 1.000
humidity - wind_speed_kmh: 1.000
precipitation_mm - wind_speed_kmh: 1.000


### Seasonal Patterns

In [20]:
# Derive calendar features, then plot the per-month average of the first numeric column.
dates = df[date_col]
df['month_name'] = dates.dt.month_name()
# Maps Dec-Feb to 1, Mar-May to 2, Jun-Aug to 3 and Sep-Nov to 4.
df['season'] = (dates.dt.month % 12) // 3 + 1
# The labels follow Northern Hemisphere seasons for those month groups.
season_map = {1:'Winter', 2:'Spring', 3:'Summer', 4:'Fall'}
df['season_name'] = df['season'].map(season_map)

# Monthly trend for the first numeric variable
if num_cols:
    var = num_cols[0]
    # Reindexing puts the months in calendar order; months with no rows show up as NaN.
    calendar_order = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ]
    monthly_avg = df.groupby('month_name')[var].mean().reindex(calendar_order)
    fig = px.line(x=monthly_avg.index, y=monthly_avg.values, title=f'Monthly Average {var}')
    fig.show()

### Trends

In [None]:
# Time-series view of the first numeric column, built from the monthly aggregation:
# one line per region, then a single line averaged across regions.
has_monthly_data = bool(num_cols) and len(monthly) > 0
if has_monthly_data:
    var = num_cols[0]
    by_region_title = f'{var} Trends Over Time by Region'
    fig = px.line(monthly, x='month', y=var, color=region_col, title=by_region_title)
    fig.show()

    # Overall trend: mean of the regional monthly means for each month.
    per_month = monthly.groupby('month')[num_cols]
    overall_monthly = per_month.mean().reset_index()
    fig2 = px.line(overall_monthly, x='month', y=var, title=f'Overall {var} Trend')
    fig2.show()

## Extreme Weather Events

In [22]:
# Identify extreme weather events for the first three numeric columns.
# A row counts as "extreme" when its value is at or beyond the column's own 95th or
# 5th percentile, so the thresholds are relative to this dataset.
# The comparison symbols in the messages were mis-encoded (UTF-8 bytes shown as
# Mac Roman); they are restored to the intended ≥ / ≤ characters here.
for col in num_cols[:3]:  # First 3 numeric columns
    q95 = df[col].quantile(0.95)
    q05 = df[col].quantile(0.05)
    extremes_high = df[df[col] >= q95]
    extremes_low = df[df[col] <= q05]

    print(f'\n{col} Extreme Events:')
    print(f'High extremes (≥95th percentile, {q95:.2f}): {len(extremes_high)} events')
    print(f'Low extremes (≤5th percentile, {q05:.2f}): {len(extremes_low)} events')

    if len(extremes_high) > 0:
        # Top three regions by number of high-extreme rows.
        print(f'Regions with most high extremes: {extremes_high[region_col].value_counts().head(3).to_dict()}')


temperature_c Extreme Events:
High extremes (≥95th percentile, 29.00): 3 events
Low extremes (≤5th percentile, 23.00): 3 events
Regions with most high extremes: {'Country3': 2, 'Country2': 1}

humidity Extreme Events:
High extremes (≥95th percentile, 67.00): 3 events
Low extremes (≤5th percentile, 61.00): 3 events
Regions with most high extremes: {'Country1': 1, 'Country2': 1, 'Country3': 1}

precipitation_mm Extreme Events:
High extremes (≥95th percentile, 12.00): 3 events
Low extremes (≤5th percentile, 6.00): 3 events
Regions with most high extremes: {'Country1': 1, 'Country2': 1, 'Country3': 1}


## Regional Comparisons

In [23]:
# Per-region mean and standard deviation of every numeric column, rounded to 2 decimals.
by_region = df.groupby(region_col)[num_cols]
regional_stats = by_region.agg(['mean', 'std']).round(2)
print('Regional Weather Statistics:')
print(regional_stats)

# Visual comparison: box plot of the first variable, then (when a second variable
# exists) a scatter of the first two variables coloured by region.
if num_cols:
    var = num_cols[0]
    fig = px.box(df, x=region_col, y=var, title=f'{var} Distribution by Region')
    fig.show()

    if len(num_cols) >= 2:
        x_var, y_var = num_cols[0], num_cols[1]
        fig2 = px.scatter(df, x=x_var, y=y_var, color=region_col,
                          title=f'{x_var} vs {y_var} by Region')
        fig2.show()

Regional Weather Statistics:
         temperature_c       humidity       precipitation_mm        \
                  mean   std     mean   std             mean   std   
country                                                              
Country1          25.0  2.16     64.0  2.16              9.0  2.16   
Country2          26.0  2.16     64.0  2.16              9.0  2.16   
Country3          27.0  2.16     64.0  2.16              9.0  2.16   

         wind_speed_kmh        
                   mean   std  
country                        
Country1           14.0  2.16  
Country2           14.0  2.16  
Country3           14.0  2.16  


## Visualization Types Selection

In [24]:
# Checklist of chart types proposed for the dashboard, printed as a numbered list.
viz_lines = [
    'Recommended Visualization Types:',
    '1. Line Charts: Time series trends, seasonal patterns',
    '2. Heatmaps: Correlation analysis, regional comparisons',
    '3. Scatterplots: Variable relationships, clustering',
    '4. Box Plots: Distribution comparisons by region',
    '5. Bar Charts: Regional statistics, extreme events count',
    '6. Histograms: Data distributions',
    '7. Choropleth Maps: Geographic patterns (if geo data available)',
]
print('\n'.join(viz_lines))

Recommended Visualization Types:
1. Line Charts: Time series trends, seasonal patterns
2. Heatmaps: Correlation analysis, regional comparisons
3. Scatterplots: Variable relationships, clustering
4. Box Plots: Distribution comparisons by region
5. Bar Charts: Regional statistics, extreme events count
6. Histograms: Data distributions
7. Choropleth Maps: Geographic patterns (if geo data available)


In [25]:
# Choropleth concept: per-region average of the first numeric column, shown as a table.
regional_avg = None
if num_cols:
    regional_avg = df.groupby(region_col)[num_cols[0]].mean().reset_index()

if regional_avg is not None and len(regional_avg) > 0:
    print('\nRegional Average Data for Choropleth Mapping:')
    print(regional_avg)
    for note in (
        '\nNote: For actual choropleth maps, geographic coordinates would be needed',
        'This data can be mapped to country/region boundaries for geographic visualization',
    ):
        print(note)


Regional Average Data for Choropleth Mapping:
    country  temperature_c
0  Country1           25.0
1  Country2           26.0
2  Country3           27.0

Note: For actual choropleth maps, geographic coordinates would be needed
This data can be mapped to country/region boundaries for geographic visualization


## Interactive Dashboard Design

In [26]:
# Dashboard layout design: prints a text wireframe of the planned dashboard panels.
# The emoji in the section headings were mis-encoded (UTF-8 bytes shown as Mac Roman);
# they are restored to the intended characters here.
print('INTERACTIVE WEATHER DASHBOARD WIREFRAME')
print('='*50)
print()
print('📊 TOP SECTION: Key Metrics Panel')
print('   - Current temperature, humidity, precipitation')
print('   - Date range selector, Region filter')
print()
print('📈 LEFT PANEL: Time Series & Trends')
print('   - Line charts for temperature/precipitation trends')
print('   - Seasonal pattern analysis')
print()
print('🗺️  CENTER: Geographic View')
print('   - Choropleth map showing regional weather patterns')
print('   - Interactive region selection')
print()
print('📋 RIGHT PANEL: Analytics')
print('   - Extreme weather events table')
print('   - Correlation heatmap')
print('   - Regional comparison charts')
print()
print('🎛️  BOTTOM: Interactive Controls')
print('   - Variable selection dropdown')
print('   - Date range slider')
print('   - Export data button')

INTERACTIVE WEATHER DASHBOARD WIREFRAME

📊 TOP SECTION: Key Metrics Panel
   - Current temperature, humidity, precipitation
   - Date range selector, Region filter

📈 LEFT PANEL: Time Series & Trends
   - Line charts for temperature/precipitation trends
   - Seasonal pattern analysis

🗺️  CENTER: Geographic View
   - Choropleth map showing regional weather patterns
   - Interactive region selection

📋 RIGHT PANEL: Analytics
   - Extreme weather events table
   - Correlation heatmap
   - Regional comparison charts

🎛️  BOTTOM: Interactive Controls
   - Variable selection dropdown
   - Date range slider
   - Export data button
