In [11]:
# imports
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.io as pio
# Use the "vscode" renderer as the default for every Plotly figure shown in this notebook.
pio.renderers.default = "vscode"


In [12]:
# Load the tornado records from disk.
# 'om' (the tornado number) becomes the row index and 'datetime_utc' is parsed
# into datetimes while reading.
# NOTE(review): the dataset notes describe 'om' as an ID within a single year,
# so this index may contain duplicate labels -- confirm before aligning on it.
df = pd.read_csv(
    'tornados.csv',
    index_col=['om'],
    parse_dates=['datetime_utc'],
    low_memory=False,
)

In [17]:
"""DATASET INFORMATION:

om	- integer - Tornado number. Effectively an ID for this tornado in this year.
yr	- integer	- Year, 1950-2022.
mo	- integer	- Month, 1-12.
dy	- integer	- Day, 1-31.
date	- string	- Date in MM/DD/YYYY format.
time	- string	- Time in HH:MM format, in 24-hour clock.
datetime_utc	- datetime	- Date and time in UTC.
st	- character	- Two-letter postal abbreviation for the state (DC = Washington, DC; PR = Puerto Rico; VI = Virgin Islands).
stf	- integer	- State FIPS code.
mag	- integer	- Magnitude on the F scale (EF beginning in 2007). Some of these values are estimated (see fc).
inj	- integer	- Number of injuries. When summing for state totals, use sn == 1 (see below).
fat	- integer	- Number of fatalities. When summing for state totals, use sn == 1 (see below).
loss	- double	- Estimated property loss information in dollars. Prior to 1996, values were grouped into ranges. The reported number for such years is the maximum of its range.
slat	- double	- Tornado starting latitude in degrees, decimal.
slon	- double	- Tornado starting longitude in degrees, decimal.
elat	- double	- Tornado ending latitude in degrees, decimal.
elon	- double	- Tornado ending longitude in degrees, decimal.
len  - double	- Tornado path length in miles.
wid  - double	- Tornado path width in yards.
ns	- integer	- Number of states affected. The maximum is 3.
sn	- integer	- State number for this row. 1 means the row contains the entire track information for this state, 0 means there is at least one more entry for this state for this tornado (om + yr).
"""

'DATASET INFORMATION:\n\nom\t- integer - Tornado number. Effectively an ID for this tornado in this year.\nyr\t- integer\t- Year, 1950-2022.\nmo\t- integer\t- Month, 1-12.\ndy\t- integer\t- Day, 1-31.\ndate\t- string\t- Date in MM/DD/YYYY format.\ntime\t- string\t- Time in HH:MM format, in 24-hour clock.\ndatetime_utc\t- datetime\t- Date and time in UTC.\nst\t- character\t- Two-letter postal abbreviation for the state (DC = Washington, DC; PR = Puerto Rico; VI = Virgin Islands).\nstf\t- integer\t- State FIPS code.\nmag\t- integer\t- Magnitude on the F scale (EF beginning in 2007). Some of these values are estimated (see fc).\ninj\t- integer\t- Number of injuries. When summing for state totals, use sn == 1 (see below).\nfat\t- integer\t- Number of fatalities. When summing for state totals, use sn == 1 (see below).\nloss -double -\tEstimated property loss information in dollars. Prior to 1996, values were grouped into ranges. The reported number for such years is the maximum of its range

In [18]:
# Preview the first rows of df to check the columns and the parsed values.
df.head()

Unnamed: 0_level_0,yr,mo,dy,date,time,tz,datetime_utc,st,stf,mag,...,elon,len,wid,ns,sn,f1,f2,f3,f4,fc
om,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
192,1950,10,1,1950-10-01,21:00:00,America/Chicago,1950-10-02 03:00:00+00:00,OK,40,1.0,...,-102.3,15.8,10,1,1,25,0,0,0,False
193,1950,10,9,1950-10-09,02:15:00,America/Chicago,1950-10-09 08:15:00+00:00,NC,37,3.0,...,0.0,2.0,880,1,1,47,0,0,0,False
195,1950,11,20,1950-11-20,02:20:00,America/Chicago,1950-11-20 08:20:00+00:00,KY,21,2.0,...,0.0,0.1,10,1,1,177,0,0,0,False
196,1950,11,20,1950-11-20,04:00:00,America/Chicago,1950-11-20 10:00:00+00:00,KY,21,1.0,...,0.0,0.1,10,1,1,209,0,0,0,False
197,1950,11,20,1950-11-20,07:30:00,America/Chicago,1950-11-20 13:30:00+00:00,MS,28,1.0,...,0.0,2.0,37,1,1,101,0,0,0,False


In [19]:
# Narrow df down to the columns the analysis needs.
columns_to_keep = [
    'yr', 'mo', 'dy', 'date', 'time', 'datetime_utc', 'st', 'mag',
    'inj', 'fat', 'loss', 'slat', 'slon', 'elat', 'elon',
    'len', 'wid', 'ns', 'sn',
]
# .copy() makes the subset an independent frame, so assigning new columns to df
# in later cells does not raise SettingWithCopyWarning.
df = df[columns_to_keep].copy()
df.head()

Unnamed: 0_level_0,yr,mo,dy,date,time,datetime_utc,st,mag,inj,fat,loss,slat,slon,elat,elon,len,wid,ns,sn
om,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
192,1950,10,1,1950-10-01,21:00:00,1950-10-02 03:00:00+00:00,OK,1.0,0,0,50000.0,36.73,-102.52,36.88,-102.3,15.8,10,1,1
193,1950,10,9,1950-10-09,02:15:00,1950-10-09 08:15:00+00:00,NC,3.0,3,0,500000.0,34.17,-78.6,0.0,0.0,2.0,880,1,1
195,1950,11,20,1950-11-20,02:20:00,1950-11-20 08:20:00+00:00,KY,2.0,0,0,500000.0,37.37,-87.2,0.0,0.0,0.1,10,1,1
196,1950,11,20,1950-11-20,04:00:00,1950-11-20 10:00:00+00:00,KY,1.0,0,0,500000.0,38.2,-84.5,0.0,0.0,0.1,10,1,1
197,1950,11,20,1950-11-20,07:30:00,1950-11-20 13:30:00+00:00,MS,1.0,3,0,50000.0,32.42,-89.13,0.0,0.0,2.0,37,1,1


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,yr,mo,dy,mag,inj,fat,loss,slat,slon,elat,elon,len,wid,ns,sn
count,68693.0,68693.0,68693.0,67937.0,68693.0,68693.0,41523.0,68693.0,68693.0,68693.0,68693.0,68693.0,68693.0,68693.0,68693.0
mean,1991.854061,5.968541,15.930881,0.778721,1.418689,0.08931,2020898.0,37.129386,-92.761494,22.960651,-56.83609,3.48927,107.767633,1.008764,0.991411
std,19.565158,2.444656,8.75007,0.89579,18.114752,1.47212,30395880.0,5.099005,8.672112,18.528144,45.340732,8.247115,206.851267,0.09506,0.092278
min,1950.0,1.0,1.0,0.0,0.0,0.0,50.0,17.7212,-163.53,0.0,-163.53,0.0,0.0,1.0,0.0
25%,1976.0,4.0,8.0,0.0,0.0,0.0,10000.0,33.18,-98.42,0.0,-94.78,0.12,20.0,1.0,1.0
50%,1995.0,6.0,16.0,1.0,0.0,0.0,50000.0,37.0,-93.5552,32.55,-84.72,0.8,50.0,1.0,1.0
75%,2008.0,7.0,24.0,1.0,0.0,0.0,500000.0,40.92,-86.73,38.65,0.0,3.21,100.0,1.0,1.0
max,2022.0,12.0,31.0,5.0,1740.0,158.0,2800100000.0,61.02,-64.7151,61.02,0.0,234.7,4576.0,3.0,1.0


In [33]:
# Histogram of the 'yr' column: how many tornado records fall in each year.
fig = px.histogram(df, x='yr', title='Number of Tornados per Year')
fig.show()

# Is the yearly tornado count increasing or decreasing?
# Each row is one tornado record, so the group size is the count for that year.
# size() counts rows directly instead of counting the non-null values of every
# column and keeping only 'mo'.
num_per_year = (
    df.groupby('yr')
    .size()
    .reset_index(name='Count')
    .rename(columns={'yr': 'Year'})
)
# The line shows raw yearly counts (not an average), so the title says so.
fig = px.line(num_per_year, x='Year', y='Count', title='Number of Tornados per Year with Trendline')

# Trendline: least-squares straight line (degree 1) through the yearly counts.
z = np.polyfit(num_per_year['Year'], num_per_year['Count'], 1)
p = np.poly1d(z)
fig.add_scatter(x=num_per_year['Year'], y=p(num_per_year['Year']), mode='lines', name='Trendline')

fig.show()


In [42]:
# Yearly tornado counts with a linear trendline, restricted to 1990 onwards.
df_since_1990 = df[df['yr'] >= 1990]
# The old name suggested a cutoff of 2000; it is kept as an alias so any existing
# reference to df_2000 keeps working.
df_2000 = df_since_1990

# Each row is one tornado record, so the group size is the count for that year.
num_per_year = (
    df_since_1990.groupby('yr')
    .size()
    .reset_index(name='Count')
    .rename(columns={'yr': 'Year'})
)
# The line shows raw yearly counts (not an average), so the title says so.
fig = px.line(num_per_year, x='Year', y='Count', title='Number of Tornados per Year Since 1990 with Trendline')

# Trendline: least-squares straight line (degree 1) through the yearly counts.
z = np.polyfit(num_per_year['Year'], num_per_year['Count'], 1)
p = np.poly1d(z)
fig.add_scatter(x=num_per_year['Year'], y=p(num_per_year['Year']), mode='lines', name='Trendline')

fig.show()

In [61]:

#  	State	Square Miles (Land Area)
# Land area in square miles keyed by full state name, ordered from the largest
# state to the smallest. Keeping each name next to its value avoids two parallel
# lists drifting out of step.
land_area_series = pd.Series({
    'Alaska': 570641,
    'Texas': 261914,
    'California': 155973,
    'Montana': 145556,
    'New Mexico': 121365,
    'Arizona': 113642,
    'Nevada': 109806,
    'Colorado': 103730,
    'Wyoming': 97105,
    'Oregon': 96003,
    'Idaho': 82751,
    'Utah': 82168,
    'Kansas': 81823,
    'Minnesota': 79617,
    'Nebraska': 76878,
    'South Dakota': 75898,
    'North Dakota': 68994,
    'Missouri': 68898,
    'Oklahoma': 68679,
    'Washington': 66582,
    'Georgia': 57919,
    'Michigan': 56539,
    'Iowa': 55875,
    'Illinois': 55593,
    'Wisconsin': 54314,
    'Florida': 53997,
    'Arkansas': 52075,
    'Alabama': 50750,
    'North Carolina': 48718,
    'New York': 47224,
    'Mississippi': 46914,
    'Pennsylvania': 44820,
    'Louisiana': 43566,
    'Tennessee': 41220,
    'Ohio': 40953,
    'Kentucky': 39732,
    'Virginia': 39598,
    'Indiana': 35870,
    'Maine': 30865,
    'South Carolina': 30111,
    'West Virginia': 24087,
    'Maryland': 9775,
    'Vermont': 9249,
    'New Hampshire': 8969,
    'Massachusetts': 7838,
    'New Jersey': 7419,
    'Hawaii': 6423,
    'Connecticut': 4845,
    'Delaware': 1955,
    'Rhode Island': 1034,
})


TypeError: Could not convert 1959-11-042004-06-142004-06-182005-07-25 to numeric

In [60]:
# Bar chart of the number of tornado records per state, most-hit states first.
# size() counts rows directly instead of counting the non-null values of every
# column and keeping only 'mo'.
# NOTE(review): the dataset notes say to use sn == 1 when summing injuries and
# fatalities for state totals; confirm whether these row counts need the same filter.
num_per_state = (
    df.groupby('st')
    .size()
    .reset_index(name='Count')
    .rename(columns={'st': 'State'})
)
num_per_state = num_per_state.sort_values(by='Count', ascending=False)
fig = px.bar(num_per_state, x='State', y='Count', title='Number of Tornados per State')
fig.show()





om
192       68679
193       48718
195       39732
196       39732
197       46914
          ...  
621922    53997
621923    48718
621924    48718
621900    40953
621901    30111
Name: land_area, Length: 68693, dtype: int64


InvalidIndexError: Reindexing only valid with uniquely valued Index objects