## Data Pre-Processing


In [1]:
import numpy as np
import pandas as pd
import datetime
from datetime import datetime
from math import radians, cos, sin, asin, sqrt
import haversine as hs
from haversine import haversine, Unit

In [2]:
# Turning data into csv format

# Read each fixed-width best-track file into a DataFrame and write it back out as CSV.
# header=None keeps the first record as data. With the default header inference,
# pandas uses that record as column names, renames duplicate values
# (e.g. "-99" -> "-99.1") and labels blank fields "Unnamed: N".
df1 = pd.read_fwf("atlantic.txt", header=None)
df2 = pd.read_fwf("central_north_pacific.txt", header=None)
df3 = pd.read_fwf("eastern_north_pacific.txt", header=None)

# header=False writes data rows only, so reading the CSV back with header=None
# returns every record unchanged.
df1.to_csv("atlantic.csv", index=False, header=False)
# NOTE: the ".txt.csv" file name is kept because the import cell reads this exact path.
df2.to_csv("central_north_pacific.txt.csv", index=False, header=False)
df3.to_csv("eastern_north_pacific.csv", index=False, header=False)

In [3]:
# Importing csv files
# header=None: the files carry no column names, so columns are numbered until named.
atl, central, eastern = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "atlantic.csv",
        "central_north_pacific.txt.csv",
        "eastern_north_pacific.csv",
    )
)

In [4]:
# Adding column names
# For info on data documentation, see https://rammb2.cira.colostate.edu/research/tropical-cyclones/tc_extended_best_track_dataset/
colnames = [
    'SID',
    'name',
    'm/d/t',
    'year',
    'lat',
    'long',
    'maxwind',
    'min_cp',
    'rad_maxwind',
    'eye_diam',
    'pressure_isobar',
    'rad_isobar',
    'radii34',
    'radii50',
    'radii64',
    'stormtype',
    'dtl',
    'source_data',
]

# The same list of names is applied to all three basin tables.
atl.columns = central.columns = eastern.columns = colnames

#all_data = pd.concat([atl, central, pacific], axis = 0).reset_index(drop=True)

In [5]:
def cleanData(df):
    """Return a cleaned copy of a raw best-track table.

    The input frame is not modified, so the function can safely be called
    more than once on the same raw table.

    Steps performed:
      * build a ``date_time`` column from ``year`` and ``m/d/t`` (month, day,
        hour digits; left-padded with zeros to six characters) and drop the
        two source columns;
      * replace values whose text starts with ``-99`` by a missing value in
        the wind / pressure / radius columns listed below;
      * rename the ``'*'`` storm type to ``'Sys'``;
      * drop the ``radii34``, ``radii50``, ``radii64`` and ``source_data``
        columns;
      * keep only rows with ``lat`` in [-90, 90] and ``long`` in [-180, 180];
      * add an empty ``distance`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table carrying the column names assigned from ``colnames``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the original row index labels of the kept rows are
        preserved (the index is not reset).

    Raises
    ------
    ValueError
        If a ``year`` + ``m/d/t`` value does not match ``%Y%m%d%H``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Converting to datetime format
    # Numeric parsing strips leading zeros from the month, so restore them.
    month_day_hour = df['m/d/t'].astype(str).str.zfill(6)
    df['date_time'] = pd.to_datetime(df['year'].astype(str) + month_day_hour,
                                     format='%Y%m%d%H')
    df = df.drop(columns=['m/d/t', 'year'])

    # Turning instances where -99 shows up to missing values.
    # The prefix test on the text form also catches variants such as -99.0.
    sentinel_columns = ['min_cp', 'rad_maxwind', 'eye_diam',
                        'pressure_isobar', 'rad_isobar', 'maxwind']
    for column in sentinel_columns:
        is_sentinel = df[column].astype(str).str.startswith('-99')
        df[column] = df[column].mask(is_sentinel)

    # Renaming 1 stormtype
    df['stormtype'] = df['stormtype'].mask(df['stormtype'] == '*', 'Sys')

    # Deleting other columns
    df = df.drop(columns=['radii34', 'radii50', 'radii64', 'source_data'])

    # Removing impossible long lat values; .copy() gives an independent frame
    # so the column assignment below does not write into a slice.
    in_range = df['lat'].between(-90, 90) & df['long'].between(-180, 180)
    df = df[in_range].copy()
    df['distance'] = None

    return df

In [6]:
# Clean each basin table, rebinding every name to its cleaned frame.
atl, eastern, central = map(cleanData, (atl, eastern, central))

In [7]:
# Preview the first five rows of the cleaned Atlantic table.
atl.head(n=5)

Unnamed: 0,SID,name,lat,long,maxwind,min_cp,rad_maxwind,eye_diam,pressure_isobar,rad_isobar,stormtype,dtl,date_time,distance
0,AL011851,AL01,28.0,94.8,80.0,,,,,,Sys,111,1851-06-25 00:00:00,
1,AL011851,AL01,28.0,95.4,80.0,,,,,,Sys,79,1851-06-25 06:00:00,
2,AL011851,AL01,28.0,96.0,80.0,,,,,,Sys,59,1851-06-25 12:00:00,
3,AL011851,AL01,28.1,96.5,80.0,,,,,,Sys,17,1851-06-25 18:00:00,
4,AL011851,AL01,28.2,97.0,70.0,,,,,,Sys,-23,1851-06-26 00:00:00,


In [8]:
def haversine(lat1, lat2, lon1, lon2):
    """Return the distance between two (lat, lon) points via ``hs.haversine``.

    Note the argument order: both latitudes come first, then both longitudes.
    The result uses the library's default unit (kilometers per the haversine
    package documentation; confirm against the installed version).

    NOTE: this definition rebinds the module-level name ``haversine`` that was
    also imported with ``from haversine import haversine``.
    """
    return hs.haversine((lat1, lon1), (lat2, lon2))

In [9]:
def fillDictionary(data):
    """Split a track table into one DataFrame per storm and add distances.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least ``SID``, ``lat`` and ``long`` columns. It is not
        modified.

    Returns
    -------
    dict
        Maps each ``SID`` (in order of first appearance) to that storm's
        rows, re-indexed from 0, with two feature columns:

        * ``distance`` - distance from the previous row of the same storm as
          returned by the module-level ``haversine`` helper; 0 for the first
          row;
        * ``cum_distance`` - running total of ``distance``.

        Both columns are present for every storm, including storms that have
        a single row.
    """
    storms = {}

    # One pass over the table; sort=False keeps storms in order of appearance.
    for sid, track in data.groupby('SID', sort=False):
        # reset_index returns a new frame, so the assignments below never
        # write into a slice of `data`.
        track = track.reset_index(drop=True)

        # Feature engineering on each df
        # Creating distance and cumulative distance
        points = list(zip(track['lat'], track['long']))
        step_distances = [0.0]
        step_distances.extend(
            haversine(lat1, lat2, lon1, lon2)
            for (lat1, lon1), (lat2, lon2) in zip(points, points[1:])
        )

        track['distance'] = step_distances
        # Computed once per storm, after all step distances are known.
        track['cum_distance'] = np.cumsum(step_distances)

        storms[sid] = track

    return storms


In [10]:
# Build the per-storm dictionaries for each basin.
atl_dict, eastern_dict, central_dict = map(fillDictionary, (atl, eastern, central))

In [None]:
def dictToDf(dictionary):
    """Stack the DataFrames stored in ``dictionary`` into one DataFrame.

    Frames are concatenated row-wise in the dictionary's iteration order;
    each frame's own index labels are kept as they are.
    """
    frames = list(dictionary.values())
    return pd.concat(frames, axis=0)

In [None]:
# Flatten each basin's per-storm dictionary back into a single table.
atl_preproc, eastern_preproc, central_preproc = map(
    dictToDf, (atl_dict, eastern_dict, central_dict)
)

In [None]:
# Write each pre-processed table to disk.
# index_label=False still writes the index values but leaves their header field out.
for frame, csv_path in (
    (central_preproc, 'central_preproc.csv'),
    (eastern_preproc, 'eastern_preproc.csv'),
    (atl_preproc, 'atl_preproc.csv'),
):
    frame.to_csv(csv_path, index_label=False)