# Exploration of the Weather Dataset

### A countrywide dataset of 8.6 million weather events (2016 - 2022)

This repository contains a comprehensive collection of weather event data across 49 states in the United States. The dataset comprises a staggering 8.6 million events, ranging from regular occurrences like rain and snow to extreme weather phenomena such as storms and freezing conditions. The data spans from January 2016 to December 2022 and is sourced from 2,071 airport-based weather stations nationwide.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Read the full weather-events table; the path is relative to the working directory.
DATA_FILE = "WeatherEvents_Jan2016-Dec2022.csv"
df = pd.read_csv(DATA_FILE)

# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,EventId,Type,Severity,StartTime(UTC),EndTime(UTC),Precipitation(in),TimeZone,AirportCode,LocationLat,LocationLng,City,County,State,ZipCode
0,W-1,Snow,Light,2016-01-06 23:14:00,2016-01-07 00:34:00,0.0,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
1,W-2,Snow,Light,2016-01-07 04:14:00,2016-01-07 04:54:00,0.0,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
2,W-3,Snow,Light,2016-01-07 05:54:00,2016-01-07 15:34:00,0.03,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
3,W-4,Snow,Light,2016-01-08 05:34:00,2016-01-08 05:54:00,0.0,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0
4,W-5,Snow,Light,2016-01-08 13:54:00,2016-01-08 15:54:00,0.0,US/Mountain,K04V,38.0972,-106.1689,Saguache,Saguache,CO,81149.0


In [3]:
# df.columns

In [4]:
# Number of missing values in each column (isna is the canonical name of isnull).
df.isna().sum()

EventId                  0
Type                     0
Severity                 0
StartTime(UTC)           0
EndTime(UTC)             0
Precipitation(in)        0
TimeZone                 0
AirportCode              0
LocationLat              0
LocationLng              0
City                 16912
County                   0
State                    0
ZipCode              69199
dtype: int64

In [5]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

EventId               object
Type                  object
Severity              object
StartTime(UTC)        object
EndTime(UTC)          object
Precipitation(in)    float64
TimeZone              object
AirportCode           object
LocationLat          float64
LocationLng          float64
City                  object
County                object
State                 object
ZipCode              float64
dtype: object

In [6]:
# Summary statistics (count, mean, std, quartiles, extremes) for the numeric columns.
df.describe()

Unnamed: 0,Precipitation(in),LocationLat,LocationLng,ZipCode
count,8627181.0,8627181.0,8627181.0,8557982.0
mean,0.09287441,38.78602,-91.91172,52411.5
std,0.8870326,5.467708,13.5034,25732.49
min,0.0,24.5571,-124.555,1022.0
25%,0.0,34.5995,-97.8236,31216.0
50%,0.0,39.3451,-89.7734,53913.0
75%,0.05,43.0622,-81.9472,73503.0
max,1104.13,48.9402,-67.7928,99362.0


In [7]:
# Show the distinct values of every column, each under a "Feature: <name>" heading.
for feature_name, feature_values in df.items():
    print(f"Feature: {feature_name}")
    display(feature_values.unique())

Feature: EventId


array(['W-1', 'W-2', 'W-3', ..., 'W-9091892', 'W-9091893', 'W-9091894'],
      shape=(8627181,), dtype=object)

Feature: Type


array(['Snow', 'Fog', 'Cold', 'Storm', 'Rain', 'Precipitation', 'Hail'],
      dtype=object)

Feature: Severity


array(['Light', 'Severe', 'Moderate', 'Heavy', 'UNK', 'Other'],
      dtype=object)

Feature: StartTime(UTC)


array(['2016-01-06 23:14:00', '2016-01-07 04:14:00',
       '2016-01-07 05:54:00', ..., '2019-03-29 09:13:00',
       '2019-03-29 20:17:00', '2019-03-29 20:33:00'],
      shape=(2285322,), dtype=object)

Feature: EndTime(UTC)


array(['2016-01-07 00:34:00', '2016-01-07 04:54:00',
       '2016-01-07 15:34:00', ..., '2019-02-22 06:18:00',
       '2019-03-07 03:09:00', '2019-03-29 20:17:00'],
      shape=(2237800,), dtype=object)

Feature: Precipitation(in)


array([0.000e+00, 3.000e-02, 1.000e-02, ..., 8.500e+00, 1.547e+01,
       2.125e+01], shape=(1755,))

Feature: TimeZone


array(['US/Mountain', 'US/Central', 'US/Eastern', 'US/Pacific'],
      dtype=object)

Feature: AirportCode


array(['K04V', 'KAXS', 'KAEL', ..., 'KB23', 'KARL', 'KBVR'],
      shape=(2071,), dtype=object)

Feature: LocationLat


array([38.0972, 34.6986, 43.6822, ..., 40.6   , 41.6   , 42.5833],
      shape=(2056,))

Feature: LocationLng


array([-106.1689,  -99.3381,  -93.3722, ..., -116.8667, -106.21  ,
       -108.2833], shape=(2063,))

Feature: City


array(['Saguache', 'Altus', 'Albert Lea', ..., 'Camdenton',
       'Battle Mountain', 'Rock River'], shape=(1717,), dtype=object)

Feature: County


array(['Saguache', 'Jackson', 'Freeborn', ..., 'Flagler', 'Rio Arriba',
       'Camden'], shape=(1100,), dtype=object)

Feature: State


array(['CO', 'OK', 'MN', 'LA', 'WI', 'ID', 'MI', 'KS', 'WY', 'MA', 'MO',
       'NM', 'NC', 'SC', 'RI', 'VA', 'CT', 'OR', 'ND', 'CA', 'NY', 'OH',
       'SD', 'AZ', 'NV', 'IA', 'TX', 'GA', 'NE', 'TN', 'AL', 'IL', 'AR',
       'WA', 'IN', 'UT', 'FL', 'WV', 'MS', 'PA', 'ME', 'MD', 'NJ', 'KY',
       'VT', 'MT', 'NH', 'DE'], dtype=object)

Feature: ZipCode


array([81149., 73521., 56007., ..., 65020., 89820., 82083.], shape=(2021,))

In [None]:
# Event count per state, as a two-column frame (State, Count) for plotly.
state_counts = (
    df['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Count')
)

# Interactive US choropleth: each state is shaded by its number of events.
fig = px.choropleth(
    data_frame=state_counts,
    locations='State',
    locationmode='USA-states',
    color='Count',
    color_continuous_scale='Viridis',
    labels={'Count': 'Numero eventi meteo'},
    scope='usa',
    title='Heatmap Interattiva degli eventi meteo per Stato (2016-2022)',
)

# Map styling is kept in one dict so it is easy to tweak in a single place.
geo_style = {
    'projection_type': 'albers usa',
    'showland': True,
    'landcolor': 'rgb(243, 243, 243)',
}
fig.update_layout(geo=geo_style, width=1000, height=600)

fig.show()

## New location features

### 1. Longitude and latitude cells

In [None]:
# Grid resolution (change these values to make the cells coarser or finer).
lat_step = 2  # degrees of latitude per cell
lon_step = 3  # degrees of longitude per cell

# Bin edges start at the whole degree below the data minimum and run one step
# past the whole degree above the maximum, so every observed point is covered.
lat_min, lat_max = df['LocationLat'].min(), df['LocationLat'].max()
lon_min, lon_max = df['LocationLng'].min(), df['LocationLng'].max()

lat_bins = np.arange(np.floor(lat_min), np.ceil(lat_max) + lat_step, lat_step)
lon_bins = np.arange(np.floor(lon_min), np.ceil(lon_max) + lon_step, lon_step)

# Assign each event to a right-closed latitude / longitude interval.
df['LatBin'] = pd.cut(df['LocationLat'], bins=lat_bins, include_lowest=True)
df['LonBin'] = pd.cut(df['LocationLng'], bins=lon_bins, include_lowest=True)

# Work from the integer category codes instead of parsing the interval labels.
# With include_lowest=True pandas labels the first interval with a left edge
# nudged down by 0.001 (e.g. "(23.999, 26.0]"), so IDs and midpoints read from
# the labels are wrong for the lowest bins. Code i always maps to edge bins[i].
lat_code = df['LatBin'].cat.codes.to_numpy()
lon_code = df['LonBin'].cat.codes.to_numpy()
in_grid = (lat_code >= 0) & (lon_code >= 0)  # code -1 marks a missing / out-of-range value

lat_left = np.where(lat_code >= 0, lat_bins[lat_code], np.nan)
lon_left = np.where(lon_code >= 0, lon_bins[lon_code], np.nan)

# Unique cell ID built from the true lower-left corner, e.g. "Lat_36.0_Lon_-107.0".
lat_labels = np.array([f"Lat_{edge}" for edge in lat_bins[:-1].tolist()], dtype=object)
lon_labels = np.array([f"_Lon_{edge}" for edge in lon_bins[:-1].tolist()], dtype=object)
df['GridCellID'] = pd.Series(
    lat_labels[lat_code] + lon_labels[lon_code], index=df.index
).where(in_grid)

# Cell-centre coordinates (for plotting), stored as plain floats.
df['GridCenterLat'] = lat_left + lat_step / 2
df['GridCenterLon'] = lon_left + lon_step / 2

print(f"Numero di celle create: {df['GridCellID'].nunique()}")
print("\nPrime 5 celle più colpite:")
print(df['GridCellID'].value_counts().head())


def _most_common(values):
    """Return the most frequent value (first in sort order on ties), or 'Unknown' if there is none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) > 0 else 'Unknown'


# Per-cell statistics. Grouping on float centres gives one row per populated
# cell; grouping on categorical centres can also emit empty category
# combinations, depending on the pandas `observed` default.
grid_stats = (
    df.groupby(['GridCenterLat', 'GridCenterLon'])
    .agg(
        EventCount=('EventId', 'count'),
        MostCommonType=('Type', _most_common),
        MostCommonSeverity=('Severity', _most_common),
    )
    .reset_index()
    .rename(columns={'GridCenterLat': 'Lat', 'GridCenterLon': 'Lon'})
)

In [13]:
# Bubble map of the grid: one marker per populated cell, sized and coloured by
# its event count.
MAX_CELL_MARKER_PX = 40  # diameter, in pixels, of the busiest cell's marker

# Keep only cells that actually contain events, and make sure the coordinates
# are plain floats whatever dtype grid_stats carries them in.
cells = grid_stats[grid_stats['EventCount'] > 0]
cell_lat = cells['Lat'].astype(float)
cell_lon = cells['Lon'].astype(float)
cell_counts = cells['EventCount']

# Scale marker *area* to the count and cap the largest marker, following the
# plotly bubble-size formula. Using EventCount / 500 directly as a pixel
# diameter is unbounded and produces markers hundreds of pixels wide.
peak_count = cell_counts.max() if len(cell_counts) else 0
cell_sizeref = 2.0 * peak_count / MAX_CELL_MARKER_PX ** 2 if peak_count > 0 else 1.0

# Hover label built in Python so the hemisphere letter matches the sign of the
# coordinate (a signed longitude followed by a fixed "°W" reads as "-100.0°W").
hover_text = [
    "<b>Cella: {:.1f}°{}, {:.1f}°{}</b><br>Eventi: {}<br>Tipo: {}<br>Severità: {}".format(
        abs(lat), 'N' if lat >= 0 else 'S',
        abs(lon), 'E' if lon >= 0 else 'W',
        count, event_type, severity,
    )
    for lat, lon, count, event_type, severity in zip(
        cell_lat, cell_lon, cell_counts,
        cells['MostCommonType'], cells['MostCommonSeverity'],
    )
]

fig = go.Figure()

fig.add_trace(go.Scattergeo(
    lon=cell_lon,
    lat=cell_lat,
    text=hover_text,
    mode='markers',
    marker=dict(
        size=cell_counts,
        sizemode='area',
        sizeref=cell_sizeref,
        sizemin=3,  # keep sparsely populated cells visible
        color=cell_counts,
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title="N. Eventi"),
        line=dict(width=0.5, color='white')
    ),
    hovertemplate='%{text}<extra></extra>'
))

fig.update_layout(
    title=f'Distribuzione Eventi Meteo per Griglia Geografica ({lat_step}°x{lon_step}°)',
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showland=True,
        landcolor='rgb(243, 243, 243)',
        coastlinecolor='rgb(204, 204, 204)',
    ),
    height=600,
    width=1000
)

fig.show()


### 2. Climate region

In [None]:
def assign_climate_region(lat, lon):
    """Map a latitude/longitude pair to a coarse, hand-drawn US climate region.

    Rules are rectangular lat/lon boxes tested in a fixed order; the first
    match wins, and anything that matches no rule is labelled 'Other'.

    Args:
        lat: Latitude in decimal degrees (scalar).
        lon: Longitude in decimal degrees (scalar, negative in the western
            hemisphere).

    Returns:
        One of 'Pacific_Northwest', 'Pacific_Southwest', 'Mountain_West',
        'Southwest_Desert', 'Northern_Plains', 'Southern_Plains', 'Midwest',
        'Northeast', 'Southeast', 'Great_Lakes' or 'Other'.

    NOTE(review): consequences of the rule order and bounds as written:
      * -105 < lon <= -95 with lat < 33, and -95 < lon <= -90 with lat < 37,
        match no rule and therefore return 'Other'.
      * 'Great_Lakes' is only reached for lat >= 45, because the 'Midwest'
        rule is tested first and already covers 37 <= lat < 45 in the same
        longitude band.
      Confirm whether this is intended before relying on the labels.
    """
    # Far west: split at 42°N, with a slightly different longitude cut-off.
    if lon <= -117 and lat >= 42:
        return 'Pacific_Northwest'
    if lon <= -114 and lat < 42:
        return 'Pacific_Southwest'

    # Remaining points west of 105°W, split at 37°N.
    if lon <= -105:
        if lat >= 37:
            return 'Mountain_West'
        if lat < 37:
            return 'Southwest_Desert'

    # Central band.
    if -105 < lon <= -90 and lat >= 42:
        return 'Northern_Plains'
    if -105 < lon <= -95 and 33 <= lat < 42:
        return 'Southern_Plains'
    if -95 < lon <= -82 and 37 <= lat < 45:
        return 'Midwest'

    # East.
    if lon > -82 and lat >= 39:
        return 'Northeast'
    if lon > -90 and lat < 39:
        return 'Southeast'
    if -90 < lon <= -82 and lat >= 41:
        return 'Great_Lakes'

    return 'Other'

# Classify each distinct (lat, lon) pair once and join the label back onto the
# events. There are far fewer distinct coordinates than events (roughly one per
# weather station), so this avoids millions of Python-level calls through a
# row-wise df.apply(axis=1) while producing the same column.
coord_cols = ['LocationLat', 'LocationLng']
station_coords = df[coord_cols].drop_duplicates()
station_regions = station_coords.assign(
    ClimateRegion=[
        assign_climate_region(lat, lon)
        for lat, lon in zip(station_coords['LocationLat'], station_coords['LocationLng'])
    ]
)

# A left merge keeps the event rows in their original order; to_numpy() sidesteps
# index alignment because merge returns a freshly indexed frame.
df['ClimateRegion'] = (
    df[coord_cols]
    .merge(station_regions, on=coord_cols, how='left', validate='many_to_one')
    ['ClimateRegion']
    .to_numpy()
)

print(f"Regioni climatiche identificate: {df['ClimateRegion'].nunique()}")
print("\nDistribuzione eventi per regione:")
print(df['ClimateRegion'].value_counts())



In [12]:
# Per-region summary: number of events and the mean event location, which is
# used as the marker position.
region_stats = (
    df.groupby('ClimateRegion')
    .agg(
        EventCount=('EventId', 'count'),
        AvgLat=('LocationLat', 'mean'),
        AvgLon=('LocationLng', 'mean'),
    )
    .reset_index()
    .rename(columns={'ClimateRegion': 'Region'})
)

MAX_REGION_MARKER_PX = 60  # diameter, in pixels, of the busiest region's marker

# marker.size stays in thousands of events because the hover template prints it
# with a "k" suffix. Area scaling with sizeref caps the largest bubble; using
# EventCount / 1000 directly as a pixel diameter gives markers wider than the
# figure once a region has hundreds of thousands of events.
events_in_thousands = region_stats['EventCount'] / 1000
peak_events = events_in_thousands.max() if len(events_in_thousands) else 0
region_sizeref = 2.0 * peak_events / MAX_REGION_MARKER_PX ** 2 if peak_events > 0 else 1.0

fig2 = go.Figure()

fig2.add_trace(go.Scattergeo(
    lon=region_stats['AvgLon'],
    lat=region_stats['AvgLat'],
    text=region_stats['Region'],
    mode='markers+text',
    marker=dict(
        size=events_in_thousands,
        sizemode='area',
        sizeref=region_sizeref,
        sizemin=4,  # keep the smallest regions visible
        color=region_stats['EventCount'],
        colorscale='Plasma',
        showscale=True,
        colorbar=dict(title="N. Eventi"),
        line=dict(width=1, color='white')
    ),
    textposition='top center',
    textfont=dict(size=10, color='black'),
    hovertemplate='<b>%{text}</b><br>Eventi: %{marker.size:.0f}k<extra></extra>'
))

fig2.update_layout(
    title='Distribuzione Eventi per Regione Climatica',
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showland=True,
        landcolor='rgb(250, 250, 250)',
    ),
    height=600,
    width=1000
)

fig2.show()