In [2]:

import pandas as pd
import requests
from io import BytesIO

base_url = "https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/"
creation = "20250520"

#function for downloading files
def download_csv_gz(file_name, timeout=60):
    """Download a gzip-compressed CSV below ``base_url`` and parse it.

    Parameters
    ----------
    file_name : str
        File name appended to the module-level ``base_url``.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one, a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The decompressed CSV content, read with ``low_memory=False``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    url = base_url + file_name
    print(f"Downloading {url}")
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return pd.read_csv(BytesIO(r.content), compression='gzip', low_memory=False)

# Per-year DataFrames keyed by year: one dict per F-scale group
data_low, data_high = {}, {}

group1 = ['F0', 'F1', 'F2']
group2 = ['F3', 'F4', 'F5']

# Columns retained from each yearly details file
keep_columns = ['state', 'year', 'begin_lat', 'begin_lon', 'tor_f_scale']

# Rows whose state is in this list are removed
exclude_states = ['ALASKA', 'HAWAII', 'PUERTO RICO', 'GUAM', 'AMERICAN SAMOA', 'COMMONWEALTH OF THE NORTHERN MARIANA ISLANDS', 'UNITED STATES MINOR OUTLYING ISLANDS']

for i in range(1951, 1976):
    year = i
    details_file = f"StormEvents_details-ftp_v1.0_d{year}_c{creation}.csv.gz"
    df_details = download_csv_gz(details_file)

    # Lower-case the column names before any lookup
    df_details.columns = df_details.columns.str.lower()
    assert 'event_id' in df_details.columns, "event_id missing from details"

    # Select columns, drop excluded states, then drop rows with any NaN
    df = (
        df_details[keep_columns]
        .loc[lambda frame: ~frame['state'].isin(exclude_states)]
        .dropna()
    )

    # Split by F-scale group; copies so each stored frame is independent
    f_scale = df['tor_f_scale']
    data_low[i] = df[f_scale.isin(group1)].copy()
    data_high[i] = df[f_scale.isin(group2)].copy()



    

Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1951_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1952_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1953_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1954_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1955_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1956_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1957_c20250520.csv.gz
Downloading https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/StormEvents_details-ftp_v1.0_d1958_c20250520.csv.gz


In [3]:
import pandas as pd
import numpy as np

# Per-year frames from data_low with the most distant 10% of points removed
datan = {}

for i in range(1951, 1976):
    datanow = data_low[i]

    # Step 1: centroid of this year's begin coordinates
    center_lat = datanow['begin_lat'].mean()
    center_lon = datanow['begin_lon'].mean()

    # Step 2: Euclidean distance from the centroid, in raw degrees.
    # Kept as a standalone Series rather than written into datanow:
    # datanow is the very object stored in data_low[i], so assigning a
    # column would leave a stray 'dist_from_center' on the shared input.
    dist_from_center = np.sqrt(
        (datanow['begin_lat'] - center_lat) ** 2
        + (datanow['begin_lon'] - center_lon) ** 2
    )

    # Step 3: keep points at or below the 90th percentile of distance
    threshold = dist_from_center.quantile(0.90)
    filtered_data = datanow[dist_from_center <= threshold].copy()

    datan[i] = filtered_data

# One summary row per year: [year, lat mean, lat std, lon mean, lon std]
rows = []

for i in range(1951, 1976):
    math_data_df = datan[i][['year', 'begin_lat', 'begin_lon']]
    rows.append([
        i,
        math_data_df['begin_lat'].mean(),
        math_data_df['begin_lat'].std(),
        math_data_df['begin_lon'].mean(),
        math_data_df['begin_lon'].std(),
    ])

# Create DataFrame once after the loop
regress_df = pd.DataFrame(rows, columns=['year', 'begin_lat_mean', 'begin_lat_std', 'begin_lon_mean', 'begin_lon_std'])

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from seaborn import set_style
from sklearn.linear_model import LinearRegression

x = np.array(regress_df['year'])
y = np.array(regress_df['begin_lat_mean'])

# Whole-year grid 1951..1975 at which the fitted line is evaluated.
# The previous np.linspace(1951, 1976, 76-51) produced 25 points with a
# step of 25/24 ending at 1976, so y_pred did not line up with the years
# in x (1951..1975) and `y - y_pred` in the following cell compared
# values taken at different x positions.
X_systematic = np.linspace(1951, 1975, 1975 - 1951 + 1)

# Straight-line fit of the yearly mean latitude against the year
reg = LinearRegression()
reg.fit(x.reshape(-1, 1), y)

y_pred = reg.predict(X_systematic.reshape(-1, 1))

In [8]:
# Observed minus predicted values, element-wise (y and y_pred as left by
# the previously executed cell)
residuals = np.subtract(y, y_pred)

In [None]:

# x is one-dimensional (np.array of the 'year' column), so x.shape holds a
# single entry and `n, p = x.shape` raises ValueError. Take n from the
# first axis and state the parameter count explicitly.
n = x.shape[0]  # number of samples
p = 2  # fitted parameters: intercept + slope, matching the (n - 2) used in the covariance cells
residual_variance = np.sum(residuals**2) / (n - p)


In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Predictor: year; response: yearly mean of begin_lat (from regress_df)
x = regress_df['year'].to_numpy(copy=True)
y = regress_df['begin_lat_mean'].to_numpy(copy=True)
n = len(x)

# Straight-line fit; sklearn expects a 2-D feature matrix
X = x.reshape(-1, 1)
model = LinearRegression().fit(X, y)

# In-sample residuals and residual variance with n - 2 degrees of freedom
y_pred = model.predict(X)
residuals = y - y_pred
sigma2 = (residuals ** 2).sum() / (n - 2)

# Design matrix: a column of ones next to the years
X_design = np.column_stack((np.ones(n), x))

# Coefficient covariance: sigma2 * (X'X)^-1
XtX_inv = np.linalg.inv(X_design.T @ X_design)
cov_matrix = sigma2 * XtX_inv

print("Covariance matrix:\n", cov_matrix)


Covariance matrix:
 [[ 1.28587249e+03 -6.55045920e-01]
 [-6.55045920e-01  3.33696343e-04]]


In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Predictor: year; response: yearly mean of begin_lon (from regress_df)
x = regress_df['year'].to_numpy(copy=True)
y = regress_df['begin_lon_mean'].to_numpy(copy=True)
n = len(x)

# Straight-line fit; sklearn expects a 2-D feature matrix
X = x.reshape(-1, 1)
model = LinearRegression().fit(X, y)

# In-sample residuals and residual variance with n - 2 degrees of freedom
y_pred = model.predict(X)
residuals = y - y_pred
sigma2 = (residuals ** 2).sum() / (n - 2)

# Design matrix: a column of ones next to the years
X_design = np.column_stack((np.ones(n), x))

# Coefficient covariance: sigma2 * (X'X)^-1
XtX_inv = np.linalg.inv(X_design.T @ X_design)
cov_matrix = sigma2 * XtX_inv

print("Covariance matrix:\n", cov_matrix)


Covariance matrix:
 [[ 5.22034654e+03 -2.65933576e+00]
 [-2.65933576e+00  1.35473039e-03]]


In [12]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Predictor: year; response: yearly std of begin_lat (from regress_df)
x = regress_df['year'].to_numpy(copy=True)
y = regress_df['begin_lat_std'].to_numpy(copy=True)
n = len(x)

# Straight-line fit; sklearn expects a 2-D feature matrix
X = x.reshape(-1, 1)
model = LinearRegression().fit(X, y)

# In-sample residuals and residual variance with n - 2 degrees of freedom
y_pred = model.predict(X)
residuals = y - y_pred
sigma2 = (residuals ** 2).sum() / (n - 2)

# Design matrix: a column of ones next to the years
X_design = np.column_stack((np.ones(n), x))

# Coefficient covariance: sigma2 * (X'X)^-1
XtX_inv = np.linalg.inv(X_design.T @ X_design)
cov_matrix = sigma2 * XtX_inv

print("Covariance matrix:\n", cov_matrix)


Covariance matrix:
 [[ 2.49068874e+02 -1.26880037e-01]
 [-1.26880037e-01  6.46357804e-05]]


In [13]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Predictor: year; response: yearly std of begin_lon (from regress_df)
x = regress_df['year'].to_numpy(copy=True)
y = regress_df['begin_lon_std'].to_numpy(copy=True)
n = len(x)

# Straight-line fit; sklearn expects a 2-D feature matrix
X = x.reshape(-1, 1)
model = LinearRegression().fit(X, y)

# In-sample residuals and residual variance with n - 2 degrees of freedom
y_pred = model.predict(X)
residuals = y - y_pred
sigma2 = (residuals ** 2).sum() / (n - 2)

# Design matrix: a column of ones next to the years
X_design = np.column_stack((np.ones(n), x))

# Coefficient covariance: sigma2 * (X'X)^-1
XtX_inv = np.linalg.inv(X_design.T @ X_design)
cov_matrix = sigma2 * XtX_inv

print("Covariance matrix:\n", cov_matrix)


Covariance matrix:
 [[ 9.08404265e+02 -4.62757008e-01]
 [-4.62757008e-01  2.35739688e-04]]


In [14]:
import pandas as pd
import numpy as np

# Per-year frames from data_high with the most distant 10% of points removed
datan = {}

for i in range(1951, 1976):
    datanow = data_high[i]

    # Step 1: centroid of this year's begin coordinates
    center_lat = datanow['begin_lat'].mean()
    center_lon = datanow['begin_lon'].mean()

    # Step 2: Euclidean distance from the centroid, in raw degrees.
    # Kept as a standalone Series rather than written into datanow:
    # datanow is the very object stored in data_high[i], so assigning a
    # column would leave a stray 'dist_from_center' on the shared input.
    dist_from_center = np.sqrt(
        (datanow['begin_lat'] - center_lat) ** 2
        + (datanow['begin_lon'] - center_lon) ** 2
    )

    # Step 3: keep points at or below the 90th percentile of distance
    threshold = dist_from_center.quantile(0.90)
    filtered_data = datanow[dist_from_center <= threshold].copy()

    datan[i] = filtered_data

# One summary row per year: [year, lat mean, lat std, lon mean, lon std]
rows = []

for i in range(1951, 1976):
    math_data_df = datan[i][['year', 'begin_lat', 'begin_lon']]
    rows.append([
        i,
        math_data_df['begin_lat'].mean(),
        math_data_df['begin_lat'].std(),
        math_data_df['begin_lon'].mean(),
        math_data_df['begin_lon'].std(),
    ])

# Create DataFrame once after the loop
regress_df = pd.DataFrame(rows, columns=['year', 'begin_lat_mean', 'begin_lat_std', 'begin_lon_mean', 'begin_lon_std'])

In [15]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Predictor: year; response: yearly mean of begin_lat (from regress_df)
x = regress_df['year'].to_numpy(copy=True)
y = regress_df['begin_lat_mean'].to_numpy(copy=True)
n = len(x)

# Straight-line fit; sklearn expects a 2-D feature matrix
X = x.reshape(-1, 1)
model = LinearRegression().fit(X, y)

# In-sample residuals and residual variance with n - 2 degrees of freedom
y_pred = model.predict(X)
residuals = y - y_pred
sigma2 = (residuals ** 2).sum() / (n - 2)

# Design matrix: a column of ones next to the years
X_design = np.column_stack((np.ones(n), x))

# Coefficient covariance: sigma2 * (X'X)^-1
XtX_inv = np.linalg.inv(X_design.T @ X_design)
cov_matrix = sigma2 * XtX_inv

print("Covariance matrix:\n", cov_matrix)


Covariance matrix:
 [[ 4.15817405e+03 -2.11824653e+00]
 [-2.11824653e+00  1.07908636e-03]]


In [16]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Predictor: year; response: yearly mean of begin_lon (from regress_df)
x = regress_df['year'].to_numpy(copy=True)
y = regress_df['begin_lon_mean'].to_numpy(copy=True)
n = len(x)

# Straight-line fit; sklearn expects a 2-D feature matrix
X = x.reshape(-1, 1)
model = LinearRegression().fit(X, y)

# In-sample residuals and residual variance with n - 2 degrees of freedom
y_pred = model.predict(X)
residuals = y - y_pred
sigma2 = (residuals ** 2).sum() / (n - 2)

# Design matrix: a column of ones next to the years
X_design = np.column_stack((np.ones(n), x))

# Coefficient covariance: sigma2 * (X'X)^-1
XtX_inv = np.linalg.inv(X_design.T @ X_design)
cov_matrix = sigma2 * XtX_inv

print("Covariance matrix:\n", cov_matrix)


Covariance matrix:
 [[ 1.33098252e+04 -6.78025756e+00]
 [-6.78025756e+00  3.45402831e-03]]


In [17]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Predictor: year; response: yearly std of begin_lat (from regress_df)
x = regress_df['year'].to_numpy(copy=True)
y = regress_df['begin_lat_std'].to_numpy(copy=True)
n = len(x)

# Straight-line fit; sklearn expects a 2-D feature matrix
X = x.reshape(-1, 1)
model = LinearRegression().fit(X, y)

# In-sample residuals and residual variance with n - 2 degrees of freedom
y_pred = model.predict(X)
residuals = y - y_pred
sigma2 = (residuals ** 2).sum() / (n - 2)

# Design matrix: a column of ones next to the years
X_design = np.column_stack((np.ones(n), x))

# Coefficient covariance: sigma2 * (X'X)^-1
XtX_inv = np.linalg.inv(X_design.T @ X_design)
cov_matrix = sigma2 * XtX_inv

print("Covariance matrix:\n", cov_matrix)


Covariance matrix:
 [[ 2.17322209e+03 -1.10707731e+00]
 [-1.10707731e+00  5.63972140e-04]]


In [18]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Predictor: year; response: yearly std of begin_lon (from regress_df)
x = regress_df['year'].to_numpy(copy=True)
y = regress_df['begin_lon_std'].to_numpy(copy=True)
n = len(x)

# Straight-line fit; sklearn expects a 2-D feature matrix
X = x.reshape(-1, 1)
model = LinearRegression().fit(X, y)

# In-sample residuals and residual variance with n - 2 degrees of freedom
y_pred = model.predict(X)
residuals = y - y_pred
sigma2 = (residuals ** 2).sum() / (n - 2)

# Design matrix: a column of ones next to the years
X_design = np.column_stack((np.ones(n), x))

# Coefficient covariance: sigma2 * (X'X)^-1
XtX_inv = np.linalg.inv(X_design.T @ X_design)
cov_matrix = sigma2 * XtX_inv

print("Covariance matrix:\n", cov_matrix)


Covariance matrix:
 [[ 1.86511319e+03 -9.50121256e-01]
 [-9.50121256e-01  4.84014904e-04]]
