In [1]:
from pandas import read_csv
import pandas as pd
from datetime import datetime
from geopy.distance import great_circle

def clean_html(df):
    """Strip HTML-style tags from every string cell of ``df``, in place.

    Any substring matching the non-greedy pattern ``<.*?>`` is replaced with
    the empty string. The frame is modified directly and nothing is returned.
    """
    tag_pattern = r'<.*?>'
    df.replace(to_replace=tag_pattern, value='', regex=True, inplace=True)

## Beijing Pollution Data Preparation

In [None]:
# load data
def parse(x):
    """Turn a space-separated 'year month day hour' string into a datetime."""
    hourly_format = '%Y %m %d %H'
    return datetime.strptime(x, hourly_format)
# Read the raw CSV untouched and build the timestamp index afterwards. The
# column-combining `parse_dates=[[...]]` form and the `date_parser` argument of
# read_csv are deprecated in pandas 2.x; pd.to_datetime assembles the same
# timestamps from the year/month/day/hour columns on old and new pandas alike.
dataset = read_csv('HurricaneData/PRSA_data_2010.1.1-2014.12.31.csv')
timestamps = pd.to_datetime(dataset[['year', 'month', 'day', 'hour']])
# drop the 'No' column and the date parts that are now folded into the index
dataset = dataset.drop(['No', 'year', 'month', 'day', 'hour'], axis=1)
dataset.index = pd.DatetimeIndex(timestamps, name='date')
# manually specify column names
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
# mark NA pollution values with 0; assign the result back rather than calling
# fillna(inplace=True) on the column selection, which can operate on a
# temporary copy (pandas Copy-on-Write) and leave `dataset` unchanged
dataset['pollution'] = dataset['pollution'].fillna(0)
# drop the first 24 hours
dataset = dataset[24:]
# summarize first 5 rows
print(dataset.head(5))
# save to file
dataset.to_csv('HurricaneData/Preprocessed/pollution.csv')

# Houston, Texas weather data

In [19]:
def parse(x):
    """Turn a 'YYYY-MM-DD' string into a datetime."""
    # NOTE(review): nothing in this cell appears to call parse — confirm it is still needed.
    daily_format = '%Y-%m-%d'
    return datetime.strptime(x, daily_format)
# Load the Houston weather export; the CSV's first column becomes the initial index.
dataset = read_csv('HurricaneData/houston.csv', index_col=0)
print("Original dataset:")
print(dataset.head(5))

# Re-index the rows by their parsed 'Date' values, then discard the leading
# column (shown as 'Date' in the recorded output) so it is not kept twice.
dataset = dataset.set_index(pd.to_datetime(dataset['Date']))
dataset = dataset.drop(columns=dataset.columns[0])

# mark all NA values with 0
dataset = dataset.fillna(0)

# strip any HTML tags left in string cells (clean_html edits the frame in place)
clean_html(dataset)

# summarize first 5 rows
print("Processed dataset:")
print(dataset.head(5))
# save to file
dataset.to_csv('HurricaneData/Preprocessed/houston.csv')

Original dataset:
         Date  Max.TemperatureF  Mean.TemperatureF  Min.TemperatureF  \
1  1948-06-30               NaN                NaN               NaN   
2  1948-07-01              90.0               80.0              69.0   
3  1948-07-02              87.0               80.0              73.0   
4  1948-07-03              84.0               80.0              75.0   
5  1948-07-04              88.0               80.0              73.0   

   Max.Dew.PointF  MeanDew.PointF  Min.DewpointF  Max.Humidity  Mean.Humidity  \
1             NaN             NaN            NaN           NaN            NaN   
2            74.0            70.0           67.0         100.0           76.0   
3            76.0            74.0           71.0          97.0           84.0   
4            79.0            75.0           70.0         100.0           86.0   
5            77.0            75.0           73.0          97.0           83.0   

   Min.Humidity   ...    Min.VisibilityMiles  Max.Wind.SpeedMP

In [3]:
# (latitude, longitude) reference point that location_filter measures distances from.
# NOTE(review): presumably central Houston in signed decimal degrees — confirm.
houston_lat_lon = (29.76043, -95.36980)

def location_filter(x):
    """Return True when a row lies strictly within 700 miles of houston_lat_lon.

    ``x`` is indexed by 'Latitude' and 'Longitude'; the great-circle distance
    between that position and ``houston_lat_lon`` is measured in miles.
    """
    row_position = (x['Latitude'], x['Longitude'])
    miles_away = great_circle(row_position, houston_lat_lon).miles
    return miles_away < 700
    
def parse_date(x):
    """Parse a '<YYYYMMDD> <HMM>' string into a datetime.

    The text is split on single spaces; the first piece is the date and the
    second is the time of day, which is left-padded with zeros to four digits
    (so '0' means 00:00 and '600' means 06:00) before parsing.
    """
    pieces = x.split(" ")
    date_digits = pieces[0]
    time_digits = pieces[1].zfill(4)
    return datetime.strptime("%s %s" % (date_digits, time_digits), '%Y%m%d %H%M')
    
def _signed_degrees(column):
    """Convert hemisphere-suffixed coordinates ('28.0N', '94.8W') to signed floats.

    South and West values come back negative, matching the signed
    (latitude, longitude) convention of houston_lat_lon. Text that is not a
    number followed by an optional N/S/E/W letter raises from pd.to_numeric.
    """
    text = column.astype(str).str.strip()
    hemisphere = text.str[-1].str.upper()
    magnitude = pd.to_numeric(text.str.rstrip('NSEWnsew'), downcast='float')
    return magnitude.where(~hemisphere.isin(['S', 'W']), -magnitude)


# Read Date/Time as text and build the timestamp index explicitly. The
# column-combining `parse_dates=[[...]]` form and `date_parser` argument of
# read_csv are deprecated in pandas 2.x; mapping parse_date over the joined
# "<date> <time>" strings yields the same index on old and new pandas.
dataset = read_csv('HurricaneData/atlantic.csv', dtype={'Date': str, 'Time': str})
date_time = dataset['Date'].str.strip() + ' ' + dataset['Time'].str.strip()
dataset.index = pd.DatetimeIndex(date_time.map(parse_date), name='Date_Time')
dataset = dataset.drop(['Date', 'Time'], axis=1)
print("Original dataset:")
print(dataset.head(5))
# drop the first remaining column ('ID' in the recorded output)
# NOTE(review): confirm that discarding the storm ID is intended
dataset = dataset.drop(dataset.columns[0], axis=1)
dataset.index.name = 'Date'

# Keep the decimal point and hemisphere when converting coordinates: stripping
# every non-digit turned '28.0N' into 280 and '94.8W' into 948, which made the
# distance-to-Houston filter below meaningless.
dataset['Latitude'] = _signed_degrees(dataset['Latitude'])
dataset['Longitude'] = _signed_degrees(dataset['Longitude'])
dataset = dataset[dataset.apply(location_filter, axis=1)]
# replace the -999 placeholder values with 0
dataset = dataset.replace(-999, 0)
# summarize first 5 rows
print("Processed dataset:")
print(dataset.head(5))
# save to file
dataset.to_csv('HurricaneData/Preprocessed/hurdat_houston.csv')

Original dataset:
                           ID                 Name Event Status Latitude  \
Date_Time                                                                  
1851-06-25 00:00:00  AL011851              UNNAMED           HU    28.0N   
1851-06-25 06:00:00  AL011851              UNNAMED           HU    28.0N   
1851-06-25 12:00:00  AL011851              UNNAMED           HU    28.0N   
1851-06-25 18:00:00  AL011851              UNNAMED           HU    28.1N   
1851-06-25 21:00:00  AL011851              UNNAMED     L     HU    28.2N   

                    Longitude  Maximum Wind  Minimum Pressure  Low Wind NE  \
Date_Time                                                                    
1851-06-25 00:00:00     94.8W            80              -999         -999   
1851-06-25 06:00:00     95.4W            80              -999         -999   
1851-06-25 12:00:00     96.0W            80              -999         -999   
1851-06-25 18:00:00     96.5W            80              -9