In [1]:

import pandas as pd
import requests
from io import BytesIO

# Directory URL that download_csv_gz prepends to every requested file name.
base_url = "https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/"
# Stamp interpolated after "_c" in each details file name; presumably the
# archive's file-creation date (YYYYMMDD) -- TODO confirm against the listing.
creation = "20250520"

def download_csv_gz(file_name, timeout=60):
    """Download a gzip-compressed CSV from ``base_url`` and parse it.

    Parameters
    ----------
    file_name : str
        Name of the ``.csv.gz`` file; appended verbatim to ``base_url``.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the caller
        indefinitely.

    Returns
    -------
    pandas.DataFrame
        The decompressed CSV contents.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    url = base_url + file_name
    print(f"Downloading {url}")
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # The whole payload is held in memory and handed to pandas as a file-like object.
    return pd.read_csv(BytesIO(r.content), compression='gzip', low_memory=False)

# States/territories whose rows are removed from every yearly frame.
# Defined once: the list does not change between iterations.
exclude_states = ['ALASKA', 'HAWAII', 'PUERTO RICO', 'GUAM', 'AMERICAN SAMOA', 'COMMONWEALTH OF THE NORTHERN MARIANA ISLANDS', 'UNITED STATES MINOR OUTLYING ISLANDS']

# Columns kept from each yearly details file.
keep_columns = ['state', 'year', 'begin_lat', 'begin_lon']

# Maps year -> cleaned DataFrame for that year.
data = {}

for year in range(1951, 1981):
    details_file = f"StormEvents_details-ftp_v1.0_d{year}_c{creation}.csv.gz"
    df_details = download_csv_gz(details_file)

    # Lower-case the header so the column lookups below do not depend on its case.
    df_details.columns = df_details.columns.str.lower()

    assert 'event_id' in df_details.columns, "event_id missing from details"
    # Validate the columns that are actually selected, so a schema change is
    # reported with the offending year instead of a bare KeyError.
    missing = [col for col in keep_columns if col not in df_details.columns]
    assert not missing, f"{year}: details file is missing columns {missing}"

    df_clean = df_details[keep_columns]

    # Remove excluded states, then any row with a missing value in the kept columns.
    state_filtered_df = df_clean[~df_clean['state'].isin(exclude_states)]
    state_filtered_df_clean = state_filtered_df.dropna()

    data[year] = state_filtered_df_clean

    

Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1951_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1952_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1953_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1954_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1955_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1956_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1957_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1958_c20250520.csv.gz


In [2]:
# Work on an independent copy of the 1971 frame so that column assignments
# made on `datanow` do not write through to the entry stored in `data`.
datanow = data[1971].copy()

In [3]:
# Display `datanow` as the cell's output.
datanow

Unnamed: 0,state,year,begin_lat,begin_lon
0,COLORADO,1971,38.88,-104.80
1,ARKANSAS,1971,36.30,-92.55
2,COLORADO,1971,40.28,-103.50
3,CONNECTICUT,1971,41.88,-72.58
4,MISSISSIPPI,1971,31.23,-89.87
...,...,...,...,...
3466,FLORIDA,1971,29.02,-81.10
3467,FLORIDA,1971,28.62,-80.82
3468,FLORIDA,1971,30.08,-86.50
3469,FLORIDA,1971,27.33,-82.52


In [5]:
import pandas as pd
import numpy as np

# Trim spatial outliers: drop the 10% of rows in `datanow` whose begin
# coordinates lie farthest from the mean begin coordinate.

# Step 1: Compute center (mean latitude / longitude of all rows)
center_lat = datanow['begin_lat'].mean()
center_lon = datanow['begin_lon'].mean()

# Step 2: Compute Euclidean distance from center.
# Degrees of latitude/longitude are treated as planar units here; this is not
# a geodesic distance. The result is kept as a standalone Series rather than
# assigned as a new column, so the input frame `datanow` is left unmodified
# and the cell can be re-run without side effects.
dist_from_center = np.sqrt((datanow['begin_lat'] - center_lat)**2 + (datanow['begin_lon'] - center_lon)**2)

# Step 3: Remove top 10% furthest points (rows at the threshold are kept)
threshold = dist_from_center.quantile(0.90)
filtered_data = datanow[dist_from_center <= threshold].copy()


In [6]:
# Display the rows that remain after the farthest 10% of points were removed.
filtered_data

Unnamed: 0,state,year,begin_lat,begin_lon
0,COLORADO,1971,38.88,-104.80
1,ARKANSAS,1971,36.30,-92.55
2,COLORADO,1971,40.28,-103.50
4,MISSISSIPPI,1971,31.23,-89.87
5,NEBRASKA,1971,40.73,-99.78
...,...,...,...,...
3465,FLORIDA,1971,28.30,-81.40
3466,FLORIDA,1971,29.02,-81.10
3468,FLORIDA,1971,30.08,-86.50
3469,FLORIDA,1971,27.33,-82.52
