In [27]:
!pip install geopy

Collecting geopy
  Using cached geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Using cached geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Using cached geopy-2.4.1-py3-none-any.whl (125 kB)
Using cached geographiclib-2.0-py3-none-any.whl (40 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.1


In [2]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic

In [11]:
df_storm = pd.read_csv("../data/our_data/merged_2015.csv",encoding='latin1')

In [12]:
def compute_distance(row):
    if pd.isna(row['BEGIN_LAT']) or pd.isna(row['BEGIN_LON']) or pd.isna(row['END_LAT']) or pd.isna(row['END_LON']):
        return np.nan
    else:
        start = (row['BEGIN_LAT'], row['BEGIN_LON'])
        end = (row['END_LAT'], row['END_LON'])
        return geodesic(start, end).kilometers

def parse_k(val):
    if pd.isna(val) or val.strip() in ['', '0.00K']:
        return 0
    val = val.strip().upper()
    if val.endswith('K'):
        return float(val[:-1]) * 1_000
    elif val.endswith('M'):
        return float(val[:-1]) * 1_000_000
    elif val.endswith('B'):
        return float(val[:-1]) * 1_000_000_000
    else:
        return float(val)  # just in case it's a raw number

In [13]:
columns_to_keep = [
    'customers_out','STATE_FIPS', 'EVENT_TYPE', 'FIPS', 'BEGIN_DATE_TIME', 'END_DATE_TIME',
    'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
    'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON', 'MAGNITUDE',
    'EPISODE_NARRATIVE', 'EVENT_NARRATIVE'
]

features = df_storm[columns_to_keep]

features['EVENT_DURATION']=pd.to_datetime(df_storm['END_DATE_TIME'])-pd.to_datetime(df_storm['BEGIN_DATE_TIME'])
features['month'] = df_storm['BEGIN_YEARMONTH'] % 100
features['distance_km'] = df_storm.apply(compute_distance, axis=1)

event_mapping = {event: idx for idx, event in enumerate(sorted(df_storm['EVENT_TYPE'].unique()))}

def map_event_to_num(event):
    return event_mapping.get(event, -1)

features['event_type_num'] = features['EVENT_TYPE'].apply(map_event_to_num)

features.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['EVENT_DURATION']=pd.to_datetime(df_storm['END_DATE_TIME'])-pd.to_datetime(df_storm['BEGIN_DATE_TIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['month'] = df_storm['BEGIN_YEARMONTH'] % 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['distance_km'] = df_storm.app

Unnamed: 0,customers_out,STATE_FIPS,EVENT_TYPE,FIPS,BEGIN_DATE_TIME,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,...,BEGIN_LON,END_LAT,END_LON,MAGNITUDE,EPISODE_NARRATIVE,EVENT_NARRATIVE,EVENT_DURATION,month,distance_km,event_type_num
0,271,13,Thunderstorm Wind,13033,2015-09-05,2015-09-05,0,0,0,0,...,-82.2,32.94,-82.2,50.0,Some thunderstorms in the CSRA took down sever...,Sheriff reported trees down on Hickson Road.,0 days,9,0.0,9
1,1397,45,Thunderstorm Wind,45057,2015-09-10,2015-09-10,0,0,0,0,...,-80.72,34.71,-80.72,50.0,A few thunderstorms in the Eastern Midlands pr...,"Dispatch reported straight line wind damage, i...",0 days,9,0.0,9
2,621,13,Thunderstorm Wind,13033,2015-09-04,2015-09-04,0,0,0,0,...,-82.14,32.94,-82.14,50.0,Some thunderstorms in the CSRA took down sever...,Public reported a few small trees and large li...,0 days,9,0.0,9
3,271,13,Thunderstorm Wind,13033,2015-09-05,2015-09-05,0,0,0,0,...,-82.15,33.21,-82.15,50.0,Some thunderstorms in the CSRA took down sever...,Sheriff reported trees down on George Perkins ...,0 days,9,0.0,9
4,1397,45,Thunderstorm Wind,45057,2015-09-10,2015-09-10,0,0,0,0,...,-80.75,34.72,-80.75,50.0,A few thunderstorms in the Eastern Midlands pr...,Dispatch reported a large carport and shed blo...,0 days,9,0.0,9


In [15]:
features['BEGIN_DATE_TIME'] = pd.to_datetime(features['BEGIN_DATE_TIME'])
features['END_DATE_TIME'] = pd.to_datetime(features['END_DATE_TIME'])

#extract features
features['begin_month'] = features['BEGIN_DATE_TIME'].dt.month
features['begin_weekday'] = features['BEGIN_DATE_TIME'].dt.dayofweek  # Monday=0
features['event_duration_days'] = (features['END_DATE_TIME'] - features['BEGIN_DATE_TIME']).dt.days

features['DAMAGE_PROPERTY'] = features['DAMAGE_PROPERTY'].apply(parse_k).astype(int)
features['DAMAGE_CROPS'] = features['DAMAGE_CROPS'].apply(parse_k).astype(int)

features.drop(columns=['customers_out','EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'EVENT_TYPE', 'BEGIN_DATE_TIME', 'END_DATE_TIME', 'EVENT_DURATION', 'MAGNITUDE'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['BEGIN_DATE_TIME'] = pd.to_datetime(features['BEGIN_DATE_TIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['END_DATE_TIME'] = pd.to_datetime(features['END_DATE_TIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['begin_month'] = features['BEGIN_DATE_TIME'].dt.mont

AttributeError: 'int' object has no attribute 'strip'

In [None]:
#test-train split 80-20
train_df = features.sample(frac=0.8, random_state=42)
test_df = features.drop(train_df.index)
features = features.drop(columns=['EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'EVENT_TYPE', 'BEGIN_DATE_TIME', 'END_DATE_TIME', 'EVENT_DURATION', 'MAGNITUDE'])
features = features.dropna()
print(len(train_df), len(test_df), len(features))
features.to_csv('full_2015_features.csv', index=False)
#train_df.to_csv('train_2015.csv', index=False)
#test_df.to_csv('test_2015.csv', index=False)




28093 7023 35115
