In [2]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [9]:
# Read in data
tornado_data = pd.read_csv('Tornadoes_SPC_1950to2015.csv')
tornado_data.columns

Index(['om', 'yr', 'mo', 'dy', 'date', 'time', 'tz', 'st', 'stf', 'stn', 'mag',
       'inj', 'fat', 'loss', 'closs', 'slat', 'slon', 'elat', 'elon', 'len',
       'wid', 'fc'],
      dtype='object')

In [10]:
# Clean up file
tornado_data.rename(columns = {'yr':'Year', 'mo':'Month','dy':'Day','date':'Date','time':'Time','tz':'Time Zone',
                               'st':'State','stf':'State FIPS','stn':'State No.','mag':'Magnitude','inj':'Injuries',
                               'fat':'Fatalities','loss':'Property Loss','closs':'Crop Loss','slat':'Starting Lat',
                               'slon':'Starting Lon','elat':'Ending Lat','elon':'Ending Lon','len':'Length',
                               'wid':'Width'}, inplace = True)
tornado_data

Unnamed: 0,om,Year,Month,Day,Date,Time,Time Zone,State,State FIPS,State No.,...,Fatalities,Property Loss,Crop Loss,Starting Lat,Starting Lon,Ending Lat,Ending Lon,Length,Width,fc
0,1,1950,1,3,1/3/1950,11:00:00,3,MO,29,1,...,0,6.00,0.0,38.77,-90.22,38.83,-90.03,9.50,150,0
1,2,1950,1,3,1/3/1950,11:55:00,3,IL,17,2,...,0,5.00,0.0,39.10,-89.30,39.12,-89.23,3.60,130,0
2,3,1950,1,3,1/3/1950,16:00:00,3,OH,39,1,...,0,4.00,0.0,40.88,-84.58,0.00,0.00,0.10,10,0
3,4,1950,1,13,1/13/1950,5:25:00,3,AR,5,1,...,1,3.00,0.0,34.40,-94.37,0.00,0.00,0.60,17,0
4,5,1950,1,25,1/25/1950,19:30:00,3,MO,29,2,...,0,5.00,0.0,37.60,-90.68,37.63,-90.65,2.30,300,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60109,607506,2015,12,28,12/28/2015,3:20:00,3,LA,22,0,...,0,0.00,0.0,30.08,-90.54,30.08,-90.53,0.60,150,0
60110,613277,2015,12,28,12/28/2015,4:46:00,3,AR,5,0,...,0,0.05,0.0,34.70,-90.92,34.78,-90.90,5.75,40,0
60111,607307,2015,12,28,12/28/2015,5:43:00,3,MS,28,0,...,0,0.10,0.0,31.54,-89.53,31.61,-89.49,5.59,100,0
60112,612738,2015,12,28,12/28/2015,8:30:00,3,FL,12,0,...,0,0.01,0.0,30.76,-87.24,30.77,-87.23,0.78,75,0


In [11]:
# Remove zeroes from Property Loss column
tornado_data = tornado_data[tornado_data['Property Loss'] > 0]
tornado_data

Unnamed: 0,om,Year,Month,Day,Date,Time,Time Zone,State,State FIPS,State No.,...,Fatalities,Property Loss,Crop Loss,Starting Lat,Starting Lon,Ending Lat,Ending Lon,Length,Width,fc
0,1,1950,1,3,1/3/1950,11:00:00,3,MO,29,1,...,0,6.00,0.0,38.77,-90.22,38.83,-90.03,9.50,150,0
1,2,1950,1,3,1/3/1950,11:55:00,3,IL,17,2,...,0,5.00,0.0,39.10,-89.30,39.12,-89.23,3.60,130,0
2,3,1950,1,3,1/3/1950,16:00:00,3,OH,39,1,...,0,4.00,0.0,40.88,-84.58,0.00,0.00,0.10,10,0
3,4,1950,1,13,1/13/1950,5:25:00,3,AR,5,1,...,1,3.00,0.0,34.40,-94.37,0.00,0.00,0.60,17,0
4,5,1950,1,25,1/25/1950,19:30:00,3,MO,29,2,...,0,5.00,0.0,37.60,-90.68,37.63,-90.65,2.30,300,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60106,607040,2015,12,27,12/27/2015,20:16:00,3,LA,22,0,...,0,0.25,0.0,32.61,-93.88,32.64,-93.88,2.27,144,0
60110,613277,2015,12,28,12/28/2015,4:46:00,3,AR,5,0,...,0,0.05,0.0,34.70,-90.92,34.78,-90.90,5.75,40,0
60111,607307,2015,12,28,12/28/2015,5:43:00,3,MS,28,0,...,0,0.10,0.0,31.54,-89.53,31.61,-89.49,5.59,100,0
60112,612738,2015,12,28,12/28/2015,8:30:00,3,FL,12,0,...,0,0.01,0.0,30.76,-87.24,30.77,-87.23,0.78,75,0


In [12]:
# Translate Property Loss to $ amounts - use midpoint in each range, translate to fraction of million
# 1 = .000025
# 2 = .000275
# 3 = .00275
# 4 = 27500
# 5 = 275000
# 6 = 2750000
# 7 = 27500000
# 8 = 275000000
# 9 = 2750000000

tornado_data['Property Loss'] = tornado_data['Property Loss'].replace([1, 2, 3, 4, 5, 6, 7, 8, 9], [.000025,2750, 27500,275000, 2750000, 27500000, 275000000, 2750000000])
tornado_data
# Adjust $ amount for inflation

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tornado_data['Property Loss'] = tornado_data['Property Loss'].replace([1, 2, 3, 4, 5, 6, 7, 8, 9], [25, 275, 2750, 27500,275000, 2750000, 27500000, 275000000, 2750000000])


Unnamed: 0,om,Year,Month,Day,Date,Time,Time Zone,State,State FIPS,State No.,...,Fatalities,Property Loss,Crop Loss,Starting Lat,Starting Lon,Ending Lat,Ending Lon,Length,Width,fc
0,1,1950,1,3,1/3/1950,11:00:00,3,MO,29,1,...,0,2750000.00,0.0,38.77,-90.22,38.83,-90.03,9.50,150,0
1,2,1950,1,3,1/3/1950,11:55:00,3,IL,17,2,...,0,275000.00,0.0,39.10,-89.30,39.12,-89.23,3.60,130,0
2,3,1950,1,3,1/3/1950,16:00:00,3,OH,39,1,...,0,27500.00,0.0,40.88,-84.58,0.00,0.00,0.10,10,0
3,4,1950,1,13,1/13/1950,5:25:00,3,AR,5,1,...,1,2750.00,0.0,34.40,-94.37,0.00,0.00,0.60,17,0
4,5,1950,1,25,1/25/1950,19:30:00,3,MO,29,2,...,0,275000.00,0.0,37.60,-90.68,37.63,-90.65,2.30,300,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60106,607040,2015,12,27,12/27/2015,20:16:00,3,LA,22,0,...,0,0.25,0.0,32.61,-93.88,32.64,-93.88,2.27,144,0
60110,613277,2015,12,28,12/28/2015,4:46:00,3,AR,5,0,...,0,0.05,0.0,34.70,-90.92,34.78,-90.90,5.75,40,0
60111,607307,2015,12,28,12/28/2015,5:43:00,3,MS,28,0,...,0,0.10,0.0,31.54,-89.53,31.61,-89.49,5.59,100,0
60112,612738,2015,12,28,12/28/2015,8:30:00,3,FL,12,0,...,0,0.01,0.0,30.76,-87.24,30.77,-87.23,0.78,75,0


In [None]:
# Put data into database

In [None]:
# Get X_train, X_test, y_train, y_test
# Scale data
# Run regression model