In [17]:
import boto3
import pandas as pd
import psycopg2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the merged disaster/temperature CSV straight from its S3 URL and preview it.
# NOTE(review): s3_client is not used by any cell visible here; read_csv is handed
# the s3:// URL directly (presumably relying on the optional s3fs package) — confirm
# whether the explicit client is still needed.
s3_client = boto3.client(service_name='s3')
path = 's3://s3disastersbuket/temp_disaster_merge_new.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,disaster_number,STATE,declaration_type,declaration_date,fy_declared,incident_type,declaration_title,fips,place_code,designated_area,...,month_x,month_year,TEMPERATURE_ID,DATE,Country,Fahrenheit,AverageTemperatureF,AverageTemperatureUncertaintyF,year_y,month_y
0,1796,AK,DR,2008-09-26,2008,Severe Storm(s),"Severe Storms, Flooding, Landslides, And Mudsl...",2090,99090,Fairbanks North Star (Borough),...,9,2008-09,1398,2008-09-01,United States,74.696,40.5392,32.4518,2008,9
1,1796,AK,DR,2008-09-26,2008,Severe Storm(s),"Severe Storms, Flooding, Landslides, And Mudsl...",2068,99068,Denali (Borough),...,9,2008-09,1398,2008-09-01,United States,74.696,40.5392,32.4518,2008,9
2,1796,AK,DR,2008-09-26,2008,Severe Storm(s),"Severe Storms, Flooding, Landslides, And Mudsl...",2185,99185,North Slope (Borough),...,9,2008-09,1398,2008-09-01,United States,74.696,40.5392,32.4518,2008,9
3,1796,AK,DR,2008-09-26,2008,Severe Storm(s),"Severe Storms, Flooding, Landslides, And Mudsl...",2290,86650,Yukon Koyukuk Regional Educational Attendance ...,...,9,2008-09,1398,2008-09-01,United States,74.696,40.5392,32.4518,2008,9
4,1440,AK,DR,2002-11-08,2003,Earthquake,Earthquake,2170,99170,Matanuska-Susitna (Borough),...,11,2002-11,1328,2002-11-01,United States,-27.373,20.1254,32.4266,2002,11


In [5]:
# Count missing values per column of the raw frame (isna is the same check as isnull).
df.isna().sum(axis=0)

disaster_number                   0
STATE                             0
declaration_type                  0
declaration_date                  0
fy_declared                       0
incident_type                     0
declaration_title                 0
fips                              0
place_code                        0
designated_area                   0
declaration_request_number        0
hash                              0
last_refresh                      0
id                                0
year_x                            0
month_x                           0
month_year                        0
TEMPERATURE_ID                    0
DATE                              0
Country                           0
Fahrenheit                        0
AverageTemperatureF               0
AverageTemperatureUncertaintyF    0
year_y                            0
month_y                           0
dtype: int64

In [7]:
# Columns that are not used as model inputs: free-text/metadata fields and the
# temperature-side columns other than AverageTemperatureF (year_y/month_y mirror
# year_x/month_x in the preview rows).
unused_columns = [
    "fy_declared",
    "declaration_date",
    "declaration_title",
    "hash",
    "last_refresh",
    "id",
    "month_year",
    "TEMPERATURE_ID",
    "DATE",
    "Country",
    "Fahrenheit",
    "AverageTemperatureUncertaintyF",
    "year_y",
    "month_y",
]

# drop() returns a new frame, so the raw df stays untouched.
disaster_temp_concise = df.drop(columns=unused_columns)

disaster_temp_concise.head()

Unnamed: 0,disaster_number,STATE,declaration_type,incident_type,fips,place_code,designated_area,declaration_request_number,year_x,month_x,AverageTemperatureF
0,1796,AK,DR,Severe Storm(s),2090,99090,Fairbanks North Star (Borough),8159,2008,9,40.5392
1,1796,AK,DR,Severe Storm(s),2068,99068,Denali (Borough),8159,2008,9,40.5392
2,1796,AK,DR,Severe Storm(s),2185,99185,North Slope (Borough),8159,2008,9,40.5392
3,1796,AK,DR,Severe Storm(s),2290,86650,Yukon Koyukuk Regional Educational Attendance ...,8159,2008,9,40.5392
4,1440,AK,DR,Earthquake,2170,99170,Matanuska-Susitna (Borough),2145,2002,11,20.1254


In [8]:
# Inspect column dtypes of the reduced frame; object (string) columns cannot be
# fed to the scaler/classifier as-is and need a numeric encoding first.
disaster_temp_concise.dtypes

disaster_number                 int64
STATE                          object
declaration_type               object
incident_type                  object
fips                            int64
place_code                      int64
designated_area                object
declaration_request_number      int64
year_x                          int64
month_x                         int64
AverageTemperatureF           float64
dtype: object

In [9]:
# One-hot encode incident_type: one indicator column per incident type, named
# "incident_type_<value>"; the original incident_type column is replaced.
# dtype is pinned to uint8 because pandas >= 2.0 changed the get_dummies default
# from uint8 to bool; pinning keeps the indicators as 0/1 integers (as in the
# dtypes recorded in this notebook) regardless of the installed pandas version.
dt_encoded = pd.get_dummies(
    disaster_temp_concise,
    columns=["incident_type"],
    dtype="uint8",
)
dt_encoded.columns

Index(['disaster_number', 'STATE', 'declaration_type', 'fips', 'place_code',
       'designated_area', 'declaration_request_number', 'year_x', 'month_x',
       'AverageTemperatureF', 'incident_type_Coastal Storm',
       'incident_type_Dam/Levee Break', 'incident_type_Drought',
       'incident_type_Earthquake', 'incident_type_Fire',
       'incident_type_Fishing Losses', 'incident_type_Flood',
       'incident_type_Freezing', 'incident_type_Human Cause',
       'incident_type_Hurricane', 'incident_type_Mud/Landslide',
       'incident_type_Severe Ice Storm', 'incident_type_Severe Storm(s)',
       'incident_type_Snow', 'incident_type_Terrorist',
       'incident_type_Tornado', 'incident_type_Toxic Substances',
       'incident_type_Tsunami', 'incident_type_Typhoon',
       'incident_type_Volcano'],
      dtype='object')

In [10]:
# Reduce the one-hot incident columns to a single binary target,
# incident_type_Hurricane (1 = hurricane declaration, 0 = any other incident).
# No rows are removed here: every declaration is kept, only the other incident
# indicator columns are dropped.
# The columns to drop are derived from the "incident_type_" prefix rather than a
# hard-coded list, so an incident type that appears in a refreshed dataset cannot
# silently remain in the frame as a feature derived from the target's source column.
hurricane_column = "incident_type_Hurricane"
other_incident_columns = [
    column
    for column in dt_encoded.columns
    if column.startswith("incident_type_") and column != hurricane_column
]
dt_hurricane = dt_encoded.drop(columns=other_incident_columns)
dt_hurricane.head()

Unnamed: 0,disaster_number,STATE,declaration_type,fips,place_code,designated_area,declaration_request_number,year_x,month_x,AverageTemperatureF,incident_type_Hurricane
0,1796,AK,DR,2090,99090,Fairbanks North Star (Borough),8159,2008,9,40.5392,0
1,1796,AK,DR,2068,99068,Denali (Borough),8159,2008,9,40.5392,0
2,1796,AK,DR,2185,99185,North Slope (Borough),8159,2008,9,40.5392,0
3,1796,AK,DR,2290,86650,Yukon Koyukuk Regional Educational Attendance ...,8159,2008,9,40.5392,0
4,1440,AK,DR,2170,99170,Matanuska-Susitna (Borough),2145,2002,11,20.1254,0


In [11]:
# Check dtypes after reducing the incident dummies to the hurricane indicator;
# any remaining object columns still need a numeric encoding.
dt_hurricane.dtypes

disaster_number                 int64
STATE                          object
declaration_type               object
fips                            int64
place_code                      int64
designated_area                object
declaration_request_number      int64
year_x                          int64
month_x                         int64
AverageTemperatureF           float64
incident_type_Hurricane         uint8
dtype: object

In [12]:
# Replace the categorical columns with integer codes, one column at a time.
# The same encoder instance is refit for each column, so after this cell `le`
# only holds the classes of the last column processed (designated_area).
# Note that place_code is already int64 before this step; it is re-coded as well.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder; integer
# codes impose an arbitrary order on nominal features such as STATE — consider a
# one-hot/ordinal feature encoder and confirm this is intended.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for column_name in ("STATE", "declaration_type", "place_code", "designated_area"):
    dt_hurricane[column_name] = le.fit_transform(dt_hurricane[column_name])


In [13]:
# Re-check dtypes after label encoding: every column should now be numeric.
dt_hurricane.dtypes

disaster_number                 int64
STATE                           int64
declaration_type                int64
fips                            int64
place_code                      int64
designated_area                 int64
declaration_request_number      int64
year_x                          int64
month_x                         int64
AverageTemperatureF           float64
incident_type_Hurricane         uint8
dtype: object

In [14]:
# Verify that the encoding steps did not introduce missing values (per-column count).
dt_hurricane.isna().sum(axis=0)

disaster_number               0
STATE                         0
declaration_type              0
fips                          0
place_code                    0
designated_area               0
declaration_request_number    0
year_x                        0
month_x                       0
AverageTemperatureF           0
incident_type_Hurricane       0
dtype: int64

In [15]:
# Number of rows available for modelling.
len(dt_hurricane.index)

40732

In [18]:
# Separate the binary hurricane target (y) from the feature matrix (X).
target_column = "incident_type_Hurricane"
y = dt_hurricane[target_column]
X = dt_hurricane.drop(columns=[target_column])

# Stratify on y so both splits keep the same hurricane / non-hurricane ratio;
# test_size is left at the library default and random_state fixes the shuffle.
# NOTE(review): X still contains identifier-like columns (disaster_number,
# declaration_request_number), and the preview rows show one disaster spanning
# several rows, so rows of the same disaster can land in both train and test —
# check for leakage before trusting the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [19]:
# Standardize the features (StandardScaler: zero mean, unit variance per column).
# The scaler is fit on the training split only and then applied to both splits,
# so no statistics from the test rows are used when fitting.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler itself

X_train_scaled, X_test_scaled = [
    X_scaler.transform(split) for split in (X_train, X_test)
]

In [20]:
# Instantiate the logistic regression classifier with a fixed random_state;
# all other hyperparameters are left at their library defaults.
classifier = LogisticRegression(random_state=1)

In [21]:
# Train the classifier on the standardized training features and their labels.
classifier.fit(X=X_train_scaled, y=y_train)

LogisticRegression(random_state=1)

In [22]:
# Predict hurricane (1) / non-hurricane (0) labels for the held-out test rows.
y_predict = classifier.predict(X=X_test_scaled)

In [23]:
# Share of test rows whose predicted label matches the true label.
# Context for the run recorded in this notebook: the confusion matrix shows
# 8293 of the 10183 test rows are non-hurricane, so always predicting 0 would
# already score about 0.81 — read the accuracy against that baseline.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_predict)

0.8999312579789845

In [24]:
# Confusion matrix for the test split, wrapped in a labelled DataFrame:
# rows are the true classes, columns the predicted classes (0 = other, 1 = hurricane).
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_predict)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7833,460
Actual 1,559,1331
